mem: Make MemCtrl a ClockedObject
[gem5.git] / configs / example / apu_se.py
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from __future__ import print_function
from __future__ import absolute_import

import math
import optparse
import os
import sys

import m5
from m5.defines import buildEnv
from m5.objects import *
from m5.util import addToPath, fatal

addToPath('../')

from ruby import Ruby

from common import Options
from common import Simulation
from common import GPUTLBOptions, GPUTLBConfig

import hsaTopology
from common import FileSystemConfig

########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # set the value
    setattr(parser.values, opt.dest, value)

def getOption(parser, opt_str):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # get the value
    return getattr(parser.values, opt.dest)

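# A quick usage sketch of the helpers above (illustrative only; the real
# call appears after option parsing below):
#   setOption(parser, "--access-backing-store")  # same as passing the flag
#   wf_size = getOption(parser, "--wf-size")     # 64 unless overridden
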
# Adding script options
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)

parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="APU mode. Used to take care of problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("-u", "--num-compute-units", type="int", default=4,
                  help="number of GPU compute units")
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")

# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="number of CUs sharing an SQC (icache, and thus "
                       "icache TLB)")
parser.add_option('--cu-per-scalar-cache', type='int', default=4,
                  help='Number of CUs sharing a scalar cache')
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
parser.add_option('--cu-per-sa', type='int', default=4,
                  help='Number of CUs per shader array. This must be a '
                       'multiple of options.cu-per-sqc and '
                       'options.cu-per-scalar-cache')
parser.add_option('--sa-per-complex', type='int', default=1,
                  help='Number of shader arrays per complex')
parser.add_option('--num-gpu-complexes', type='int', default=1,
                  help='Number of GPU complexes')
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size (in workitems)")
parser.add_option("--sp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Double Precision ops")
# issue period per SIMD unit: number of cycles before issuing another vector
parser.add_option("--issue-period", type="int", default=4,
                  help="Number of cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32,
                  help="VGPR to Coalescer (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32,
                  help="Coalescer to VGPR (Global Memory) data bus width "
                       "in bytes")
# Currently we only support 1 local memory pipe
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Shared Memory pipelines per CU")
# Currently we only support 1 global memory pipe
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10,
                  help="Number of WF slots per SIMD")

parser.add_option("--registerManagerPolicy", type="string", default="static",
                  help="Register manager policy")
parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--vreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the VRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--sreg-file-size", type="int", default=2048,
                  help="number of physical scalar registers per SIMD")
parser.add_option("--sreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the SRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scaling factor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--gpu-clock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="CPU voltage domain")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="GPU voltage domain")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int",
                  help="prefetch depth for TLBs")
parser.add_option("--pf-type", type="string",
                  help="type of prefetch: PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")
parser.add_option("--lds-size", type="int", default=65536,
                  help="Size of the LDS in bytes")
parser.add_option('--fast-forward-pseudo-op', action='store_true',
                  help='fast forward using kvm until the m5_switchcpu'
                       ' pseudo-op is encountered, then switch cpus. '
                       'subsequent m5_switchcpu pseudo-ops will toggle back '
                       'and forth')
parser.add_option("--num-hw-queues", type="int", default=10,
                  help="number of hw queues in packet processor")

Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']

########################## Sanity Check ########################

# Currently the GPU model requires Ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the GPU model works only with the timing or detailed CPUs
if not (options.cpu_type == "TimingSimpleCPU" or
        options.cpu_type == "DerivO3CPU"):
    fatal("GPU model requires TimingSimpleCPU or DerivO3CPU")

# This file can support multiple compute units
assert(options.num_compute_units >= 1)

# Currently, the SQC (the GPU's I-cache) is shared by multiple compute
# units (CUs). The protocol works just fine even if the SQC is not
# shared. Overriding this option here so that the user need not set it
# explicitly (assuming a shared SQC is the common usage).
n_cu = options.num_compute_units
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby
num_scalar_cache = int(math.ceil(float(n_cu) / options.cu_per_scalar_cache))
options.num_scalar_cache = num_scalar_cache

print('Num SQC = ', num_sqc, 'Num scalar caches = ', num_scalar_cache,
      'Num CU = ', n_cu)
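
# Worked example with the defaults (for illustration): 4 CUs with
# --cu-per-sqc=4 and --cu-per-scalar-cache=4 yield ceil(4/4) = 1 SQC and
# 1 scalar cache, while 9 CUs would need ceil(9/4) = 3 of each.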

########################## Creating the GPU system ########################
# shader is the GPU
shader = Shader(n_wf = options.wfs_per_simd,
                clk_domain = SrcClockDomain(
                    clock = options.gpu_clock,
                    voltage_domain = VoltageDomain(
                        voltage = options.gpu_voltage)))

# GPU_RfO (Read For Ownership) implements an SC/TSO memory model.
# Other GPU protocols implement release consistency at the GPU side.
# So, all GPU protocols other than GPU_RfO should make their writes
# visible to the global memory and should read from global memory at
# kernel boundaries. The pipeline initiates (or does not initiate) the
# acquire/release operation depending on the impl_kern_launch_acq and
# impl_kern_end_rel flags; flag=true means the pipeline initiates an
# acquire/release operation at kernel launch/end.
# VIPER protocols (GPU_VIPER, GPU_VIPER_Region and GPU_VIPER_Baseline)
# are write-through based, and thus only impl_kern_launch_acq needs to
# be set.
if buildEnv['PROTOCOL'] == 'GPU_RfO':
    shader.impl_kern_launch_acq = False
    shader.impl_kern_end_rel = False
elif (buildEnv['PROTOCOL'] == 'GPU_VIPER' or
      buildEnv['PROTOCOL'] == 'GPU_VIPER_Region' or
      buildEnv['PROTOCOL'] == 'GPU_VIPER_Baseline'):
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = False
else:
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = True
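
# For reference, the flag settings chosen above:
#   protocol          impl_kern_launch_acq  impl_kern_end_rel
#   GPU_RfO           False                 False
#   GPU_VIPER*        True                  False
#   other protocols   True                  True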

# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
    per_lane = True

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in range(n_cu):
    compute_units.append(ComputeUnit(
        cu_id = i, perLaneTLB = per_lane,
        num_SIMDs = options.simds_per_cu,
        wf_size = options.wf_size,
        spbypass_pipe_length = options.sp_bypass_path_length,
        dpbypass_pipe_length = options.dp_bypass_path_length,
        issue_period = options.issue_period,
        coalescer_to_vrf_bus_width = options.glbmem_rd_bus_width,
        vrf_to_coalescer_bus_width = options.glbmem_wr_bus_width,
        num_global_mem_pipes = options.glb_mem_pipes_per_cu,
        num_shared_mem_pipes = options.shr_mem_pipes_per_cu,
        n_wf = options.wfs_per_simd,
        execPolicy = options.CUExecPolicy,
        debugSegFault = options.SegFaultDebug,
        functionalTLB = options.FunctionalTLB,
        localMemBarrier = options.LocalMemBarrier,
        countPages = options.countPages,
        localDataStore = LdsState(
            banks = options.numLdsBanks,
            bankConflictPenalty = options.ldsBankConflictPenalty,
            size = options.lds_size)))
    wavefronts = []
    vrfs = []
    vrf_pool_mgrs = []
    srfs = []
    srf_pool_mgrs = []
    for j in range(options.simds_per_cu):
        for k in range(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                        wf_size = options.wf_size))
        vrf_pool_mgrs.append(
            SimplePoolManager(pool_size = options.vreg_file_size,
                              min_alloc = options.vreg_min_alloc))

        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.vreg_file_size))

        srf_pool_mgrs.append(
            SimplePoolManager(pool_size = options.sreg_file_size,
                              min_alloc = options.sreg_min_alloc))
        srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.sreg_file_size))

    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    compute_units[-1].scalar_register_file = srfs
    compute_units[-1].register_manager = \
        RegisterManager(policy=options.registerManagerPolicy,
                        vrf_pool_managers=vrf_pool_mgrs,
                        srf_pool_managers=srf_pool_mgrs)
    if options.TLB_prefetch:
        compute_units[-1].prefetch_depth = options.TLB_prefetch
        compute_units[-1].prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
    compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort

# Attach compute units to GPU
shader.CUs = compute_units

########################## Creating the CPU system ########################
# The shader core will be whatever is after the CPU cores are accounted for
shader_idx = options.num_cpus

# The command processor will be whatever is after the shader is accounted for
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

CpuClass, mem_mode = Simulation.getCPUClass(options.cpu_type)
if CpuClass == AtomicSimpleCPU:
    fatal("AtomicSimpleCPU is not supported")
if mem_mode != 'timing':
    fatal("Only the timing memory mode is supported")
shader.timing = True

if options.fast_forward and options.fast_forward_pseudo_op:
    fatal("Cannot fast-forward based both on the number of instructions and"
          " on pseudo-ops")
fast_forward = options.fast_forward or options.fast_forward_pseudo_op

if fast_forward:
    FutureCpuClass, future_mem_mode = CpuClass, mem_mode

    CpuClass = X86KvmCPU
    mem_mode = 'atomic_noncaching'
    # Leave shader.timing untouched, because its value only matters at the
    # start of the simulation and because we require switching cpus
    # *before* the first kernel launch.

    future_cpu_list = []

    # Initial CPUs to be used during fast-forwarding.
    for i in range(options.num_cpus):
        cpu = CpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
        cpu_list.append(cpu)

        if options.fast_forward:
            cpu.max_insts_any_thread = int(options.fast_forward)

if fast_forward:
    MainCpuClass = FutureCpuClass
else:
    MainCpuClass = CpuClass

# CPs to be used throughout the simulation.
for i in range(options.num_cp):
    cp = MainCpuClass(cpu_id = options.num_cpus + i,
                      clk_domain = SrcClockDomain(
                          clock = options.CPUClock,
                          voltage_domain = VoltageDomain(
                              voltage = options.cpu_voltage)))
    cp_list.append(cp)

# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified).
for i in range(options.num_cpus):
    cpu = MainCpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
    if fast_forward:
        cpu.switched_out = True
        future_cpu_list.append(cpu)
    else:
        cpu_list.append(cpu)

host_cpu = cpu_list[0]

hsapp_gpu_map_vaddr = 0x200000000
hsapp_gpu_map_size = 0x1000
hsapp_gpu_map_paddr = int(Addr(options.mem_size))
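# For illustration: with the common --mem-size=512MB, int(Addr('512MB')) is
# 0x20000000, so the HSA packet processor's PIO aperture is placed just past
# the end of physical memory.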

# HSA kernel mode driver
gpu_driver = GPUComputeDriver(filename="kfd")

# Creating the GPU kernel launching components: that is the HSA
# packet processor (HSAPP), GPU command processor (CP), and the
# dispatcher.
gpu_hsapp = HSAPacketProcessor(pioAddr=hsapp_gpu_map_paddr,
                               numHWQueues=options.num_hw_queues)
dispatcher = GPUDispatcher()
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp,
                                   dispatcher=dispatcher)
gpu_driver.device = gpu_cmd_proc
shader.dispatcher = dispatcher
shader.gpu_cmd_proc = gpu_cmd_proc

# Create and assign the workload. Check for rel_path in elements of
# base_list using test, returning the first full path that satisfies test.
def find_path(base_list, rel_path, test):
    for base in base_list:
        if not base:
            # base could be None if environment var not set
            continue
        full_path = os.path.join(base, rel_path)
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))

def find_file(base_list, rel_path):
    return find_path(base_list, rel_path, os.path.isfile)
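
# Usage sketch (the paths here are hypothetical):
#   find_file(['.', '/opt/benchmarks'], 'bin/square')
# returns './bin/square' or '/opt/benchmarks/bin/square', whichever exists
# first as a regular file, and calls fatal() if neither does.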

executable = find_path(benchmark_path, options.cmd, os.path.exists)
# It's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
    benchmark_path = [executable]
    executable = find_file(benchmark_path, options.cmd)

if options.env:
    with open(options.env, 'r') as f:
        env = [line.rstrip() for line in f]
else:
    env = ['LD_LIBRARY_PATH=%s' % ':'.join([
               os.getenv('ROCM_PATH', '/opt/rocm') + '/lib',
               os.getenv('HCC_HOME', '/opt/rocm/hcc') + '/lib',
               os.getenv('HSA_PATH', '/opt/rocm/hsa') + '/lib',
               os.getenv('HIP_PATH', '/opt/rocm/hip') + '/lib',
               os.getenv('ROCM_PATH', '/opt/rocm') + '/libhsakmt/lib',
               os.getenv('ROCM_PATH', '/opt/rocm') + '/miopen/lib',
               os.getenv('ROCM_PATH', '/opt/rocm') + '/miopengemm/lib',
               os.getenv('ROCM_PATH', '/opt/rocm') + '/hipblas/lib',
               os.getenv('ROCM_PATH', '/opt/rocm') + '/rocblas/lib',
               "/usr/lib/x86_64-linux-gnu"
           ]),
           'HOME=%s' % os.getenv('HOME', '/'),
           "HSA_ENABLE_INTERRUPT=0"]

process = Process(executable = executable, cmd = [options.cmd]
                  + options.options.split(), drivers = [gpu_driver], env = env)

for cpu in cpu_list:
    cpu.createThreads()
    cpu.workload = process

for cp in cp_list:
    cp.workload = host_cpu.workload

if fast_forward:
    for i in range(len(future_cpu_list)):
        future_cpu_list[i].workload = cpu_list[i].workload
        future_cpu_list[i].createThreads()

########################## Create the overall system ########################
# List of CPUs that must be switched when moving between KVM and simulation
if fast_forward:
    switch_cpu_list = \
        [(cpu_list[i], future_cpu_list[i]) for i in range(options.num_cpus)]

# Full list of processing cores in the system.
cpu_list = cpu_list + [shader] + cp_list

# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = mem_mode)
if fast_forward:
    system.future_cpu = future_cpu_list
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)

if fast_forward:
    have_kvm_support = 'BaseKvmCPU' in globals()
    if have_kvm_support and buildEnv['TARGET_ISA'] == "x86":
        system.vm = KvmVM()
        for i in range(len(host_cpu.workload)):
            host_cpu.workload[i].useArchPT = True
            host_cpu.workload[i].kvmInSE = True
    else:
        fatal("KvmCPU can only be used in SE mode with x86")

# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)

# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
dma_list = [gpu_hsapp, gpu_cmd_proc]
Ruby.create_system(options, None, system, None, dma_list, None)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                        voltage_domain = system.voltage_domain)
gpu_cmd_proc.pio = system.piobus.master
gpu_hsapp.pio = system.piobus.master

for i, dma_device in enumerate(dma_list):
    exec('system.dma_cntrl%d.clk_domain = system.ruby.clk_domain' % i)
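# For the two DMA devices in dma_list, the exec above expands to:
#   system.dma_cntrl0.clk_domain = system.ruby.clk_domain
#   system.dma_cntrl1.clk_domain = system.ruby.clk_domain
# (assuming Ruby.create_system named the DMA controllers dma_cntrl0,
# dma_cntrl1, ... as this script expects)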

# attach the CPU ports to Ruby
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]

    # Create interrupt controller
    system.cpu[i].createInterruptController()

    # Connect cache ports to Ruby
    system.cpu[i].icache_port = ruby_port.slave
    system.cpu[i].dcache_port = ruby_port.slave

    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        system.cpu[i].interrupts[0].pio = system.piobus.master
        system.cpu[i].interrupts[0].int_master = system.piobus.slave
        system.cpu[i].interrupts[0].int_slave = system.piobus.master
        if fast_forward:
            system.cpu[i].itb.walker.port = ruby_port.slave
            system.cpu[i].dtb.walker.port = ruby_port.slave

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc \
               - options.num_scalar_cache
gpu_port_idx = gpu_port_idx - options.num_cp * 2
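# Worked example (for illustration, assuming one sequencer per CPU core):
# 1 CPU, 4 CUs, 1 SQC, 1 scalar cache, and 0 CPs give
# len(_cpu_ports) = 1 + 4 + 1 + 1 = 7, so
# gpu_port_idx = 7 - 4 - 1 - 1 - 0 = 1, i.e. the first port after the CPU's.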

wavefront_size = options.wf_size
for i in range(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in range(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
            system.ruby._cpu_ports[gpu_port_idx].slave[j]
    system.cpu[shader_idx].CUs[i].gmTokenPort = \
        system.ruby._cpu_ports[gpu_port_idx].gmTokenPort
    gpu_port_idx += 1

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_sqc:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_scalar_cache:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].scalar_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

# attach CP ports to Ruby
for i in range(options.num_cp):
    system.cpu[cp_idx].createInterruptController()
    system.cpu[cp_idx].dcache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    system.cpu[cp_idx].icache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
    system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
    system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1

################# Connect the CPU and GPU via GPU Dispatcher ##################
# The CPU rings the GPU doorbell to notify it of a pending task using
# this interface, and the GPU uses it to notify the CPU of task
# completion. The communication happens through the emulated driver.

# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
if fast_forward:
    shader.cpu_pointer = future_cpu_list[0]
else:
    shader.cpu_pointer = host_cpu

########################## Start simulation ########################

redirect_paths = [RedirectPath(app_path = "/proc",
                               host_paths =
                               ["%s/fs/proc" % m5.options.outdir]),
                  RedirectPath(app_path = "/sys",
                               host_paths =
                               ["%s/fs/sys" % m5.options.outdir]),
                  RedirectPath(app_path = "/tmp",
                               host_paths =
                               ["%s/fs/tmp" % m5.options.outdir])]

system.redirect_paths = redirect_paths

root = Root(system=system, full_system=False)

hsaTopology.createHsaTopology(options)

m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
    maxtick = options.abs_max_tick
else:
    maxtick = m5.MaxTick

# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)

# Checkpointing is not supported by the APU model
if (options.checkpoint_dir != None or
    options.checkpoint_restore != None):
    fatal("Checkpointing not supported by apu model")

checkpoint_dir = None
m5.instantiate(checkpoint_dir)

# Map workload to this address space
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)
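# A sketch of the semantics (assuming the Process.map(vaddr, paddr, size)
# signature): this maps one 4 KiB page at virtual address 0x10000000 to
# physical address 0x200000000.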

if options.fast_forward:
    print("Switch at instruction count: %d" % cpu_list[0].max_insts_any_thread)

exit_event = m5.simulate(maxtick)

if options.fast_forward:
    if exit_event.getCause() == "a thread reached the max instruction count":
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        exit_event = m5.simulate(maxtick - m5.curTick())
elif options.fast_forward_pseudo_op:
    while exit_event.getCause() == "switchcpu":
        # If we are switching *to* kvm, then the current stats are meaningful
        # Note that we don't do any warmup by default
        if type(switch_cpu_list[0][0]) == FutureCpuClass:
            print("Dumping stats...")
            m5.stats.dump()
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        # This lets us switch back and forth without keeping a counter
        switch_cpu_list = [(x[1], x[0]) for x in switch_cpu_list]
        exit_event = m5.simulate(maxtick - m5.curTick())

print("Ticks:", m5.curTick())
print('Exiting because ', exit_event.getCause())

sys.exit(exit_event.getCode())