4e9c75fd6cea509e81587a7f5073af04936ee9e4
[gem5.git] / configs / example / apu_se.py
1 # Copyright (c) 2015 Advanced Micro Devices, Inc.
2 # All rights reserved.
3 #
4 # For use for simulation and test purposes only
5 #
6 # Redistribution and use in source and binary forms, with or without
7 # modification, are permitted provided that the following conditions are met:
8 #
9 # 1. Redistributions of source code must retain the above copyright notice,
10 # this list of conditions and the following disclaimer.
11 #
12 # 2. Redistributions in binary form must reproduce the above copyright notice,
13 # this list of conditions and the following disclaimer in the documentation
14 # and/or other materials provided with the distribution.
15 #
16 # 3. Neither the name of the copyright holder nor the names of its
17 # contributors may be used to endorse or promote products derived from this
18 # software without specific prior written permission.
19 #
20 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
24 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 # POSSIBILITY OF SUCH DAMAGE.
31
from __future__ import print_function
from __future__ import absolute_import

import getpass
import glob
import inspect
import math
import optparse
import os
import re
import sys

import m5
from m5.objects import *
from m5.util import addToPath

addToPath('../')

from ruby import Ruby

from common import Options
from common import Simulation
from common import GPUTLBOptions, GPUTLBConfig

import hsaTopology
from common import FileSystemConfig
54
55 ########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
    """Force the destination of an already-registered option to *value*.

    Raises an Exception if *opt_str* was never added to *parser*, which
    catches typos in option names early.  Must be called after
    parser.parse_args(), when parser.values exists.
    """
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # setattr replaces the old exec() of a generated assignment, which
    # interpolated the value as source code (it broke on string values
    # and could execute arbitrary code).
    setattr(parser.values, opt.dest, value)
64
def getOption(parser, opt_str):
    """Return the current value of an already-registered option.

    Raises an Exception if *opt_str* was never added to *parser*.  Must
    be called after parser.parse_args(), when parser.values exists.
    """
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # getattr replaces the old exec("return_value = ...") trick, which
    # cannot rebind a local variable in Python 3 and raised NameError.
    return getattr(parser.values, opt.dest)
74
# Adding script options.  Common gem5 and SE-mode options first, then the
# APU/GPU-specific knobs defined below.
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)

parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="APU mode. Used to take care of problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("-u", "--num-compute-units", type="int", default=4,
                  help="number of GPU compute units"),
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")

# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs"
                  "sharing an SQC (icache, and thus icache TLB)")
parser.add_option('--cu-per-scalar-cache', type='int', default=4,
                  help='Number of CUs sharing a scalar cache')
parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units"
                  "per CU")
parser.add_option('--cu-per-sa', type='int', default=4,
                  help='Number of CUs per shader array. This must be a '
                       'multiple of options.cu-per-sqc and options.cu-per-scalar')
parser.add_option('--sa-per-complex', type='int', default=1,
                  help='Number of shader arrays per complex')
parser.add_option('--num-gpu-complexes', type='int', default=1,
                  help='Number of GPU complexes')
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size(in workitems)")
parser.add_option("--sp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Double Precision ops")
# issue period per SIMD unit: number of cycles before issuing another vector
parser.add_option("--issue-period", type="int", default=4,
                  help="Number of cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32,
                  help="VGPR to Coalescer (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32,
                  help="Coalescer to VGPR (Global Memory) data bus width in "
                       "bytes")
# Currently we only support 1 local memory pipe
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Shared Memory pipelines per CU")
# Currently we only support 1 global memory pipe
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of "
                  "WF slots per SIMD")

# Register file / register allocation knobs
parser.add_option("--registerManagerPolicy", type="string", default="static",
                  help="Register manager policy")
parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--vreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the VRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--sreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--sreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the SRF. The total number of registers will be "
                       "aligned to this value.")

# Clocking / voltage / execution-policy knobs
parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scalor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--gpu-clock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help = """CPU voltage domain""")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help = """CPU voltage domain""")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--SegFaultDebug",action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB",action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier",action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int", help = "prefetch depth for"
                  "TLBs")
parser.add_option("--pf-type", type="string", help="type of prefetch: "
                  "PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")
parser.add_option('--fast-forward-pseudo-op', action='store_true',
                  help = 'fast forward using kvm until the m5_switchcpu'
                         ' pseudo-op is encountered, then switch cpus. subsequent'
                         ' m5_switchcpu pseudo-ops will toggle back and forth')
parser.add_option("--num-hw-queues", type="int", default=10,
                  help="number of hw queues in packet processor")

Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']
200
########################## Sanity Check ########################

# Currently the gpu model requires ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the gpu model requires only timing or detailed CPU
if not (options.cpu_type == "TimingSimpleCPU" or
   options.cpu_type == "DerivO3CPU"):
    fatal("GPU model requires TimingSimpleCPU or DerivO3CPU")

# This file can support multiple compute units
assert(options.num_compute_units >= 1)

# Currently, the sqc (I-Cache of GPU) is shared by
# multiple compute units(CUs). The protocol works just fine
# even if sqc is not shared. Overriding this option here
# so that the user need not explicitly set this (assuming
# sharing sqc is the common usage)
n_cu = options.num_compute_units
# One SQC per --cu-per-sqc CUs, rounded up so a partial group still
# gets an SQC; same computation for the shared scalar caches.
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby
num_scalar_cache = int(math.ceil(float(n_cu) / options.cu_per_scalar_cache))
options.num_scalar_cache = num_scalar_cache

print('Num SQC = ', num_sqc, 'Num scalar caches = ', num_scalar_cache,
      'Num CU = ', n_cu)
228
########################## Creating the GPU system ########################
# shader is the GPU: build its voltage and clock domains first, then the
# Shader object itself, configured with one WF-slot count per SIMD.
gpu_voltage_domain = VoltageDomain(voltage = options.gpu_voltage)
gpu_clk_domain = SrcClockDomain(clock = options.gpu_clock,
                                voltage_domain = gpu_voltage_domain)
shader = Shader(n_wf = options.wfs_per_simd, clk_domain = gpu_clk_domain)
236
# GPU_RfO(Read For Ownership) implements SC/TSO memory model.
# Other GPU protocols implement release consistency at GPU side.
# So, all GPU protocols other than GPU_RfO should make their writes
# visible to the global memory and should read from global memory
# during kernal boundary. The pipeline initiates(or do not initiate)
# the acquire/release operation depending on these impl_kern_launch_rel
# and impl_kern_end_rel flags. The flag=true means pipeline initiates
# a acquire/release operation at kernel launch/end.
# VIPER protocols (GPU_VIPER, GPU_VIPER_Region and GPU_VIPER_Baseline)
# are write-through based, and thus only imple_kern_launch_acq needs to
# set.
if buildEnv['PROTOCOL'] == 'GPU_RfO':
    shader.impl_kern_launch_acq = False
    shader.impl_kern_end_rel = False
elif buildEnv['PROTOCOL'] in ('GPU_VIPER', 'GPU_VIPER_Region',
                              'GPU_VIPER_Baseline'):
    # BUGFIX: this used to be an or-chain of '!=' tests, which is true
    # for every protocol and made the else branch below unreachable.
    # Membership test implements the intent described in the comment
    # above: VIPER protocols only need the kernel-launch acquire.
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = False
else:
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = True
259
# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
    per_lane = True

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in range(n_cu):
    compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
                                     num_SIMDs = options.simds_per_cu,
                                     wf_size = options.wf_size,
                                     spbypass_pipe_length = \
                                     options.sp_bypass_path_length,
                                     dpbypass_pipe_length = \
                                     options.dp_bypass_path_length,
                                     issue_period = options.issue_period,
                                     coalescer_to_vrf_bus_width = \
                                     options.glbmem_rd_bus_width,
                                     vrf_to_coalescer_bus_width = \
                                     options.glbmem_wr_bus_width,
                                     num_global_mem_pipes = \
                                     options.glb_mem_pipes_per_cu,
                                     num_shared_mem_pipes = \
                                     options.shr_mem_pipes_per_cu,
                                     n_wf = options.wfs_per_simd,
                                     execPolicy = options.CUExecPolicy,
                                     debugSegFault = options.SegFaultDebug,
                                     functionalTLB = options.FunctionalTLB,
                                     localMemBarrier = options.LocalMemBarrier,
                                     countPages = options.countPages,
                                     localDataStore = \
                                     LdsState(banks = options.numLdsBanks,
                                              bankConflictPenalty = \
                                              options.ldsBankConflictPenalty)))
    # Per-CU child objects: wavefront slots plus one vector/scalar
    # register file (and its pool manager) per SIMD unit.
    wavefronts = []
    vrfs = []
    vrf_pool_mgrs = []
    srfs = []
    srf_pool_mgrs = []
    # NOTE: these loops used Python-2-only xrange(); range() matches the
    # rest of this file and works on both Python 2 and 3.
    for j in range(options.simds_per_cu):
        for k in range(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                        wf_size = options.wf_size))
        vrf_pool_mgrs.append(SimplePoolManager(pool_size = \
                                               options.vreg_file_size,
                                               min_alloc = \
                                               options.vreg_min_alloc))

        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.vreg_file_size))

        # BUGFIX: the SRF pool manager previously used vreg_min_alloc,
        # leaving --sreg-min-alloc defined but never honored.
        srf_pool_mgrs.append(SimplePoolManager(pool_size = \
                                               options.sreg_file_size,
                                               min_alloc = \
                                               options.sreg_min_alloc))
        srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.sreg_file_size))

    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    compute_units[-1].scalar_register_file = srfs
    compute_units[-1].register_manager = \
        RegisterManager(policy=options.registerManagerPolicy,
                        vrf_pool_managers=vrf_pool_mgrs,
                        srf_pool_managers=srf_pool_mgrs)
    if options.TLB_prefetch:
        compute_units[-1].prefetch_depth = options.TLB_prefetch
        compute_units[-1].prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
    compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort

# Attach compute units to GPU
shader.CUs = compute_units
335
########################## Creating the CPU system ########################

# The shader core will be whatever is after the CPU cores are accounted for
# (removed a no-op self-assignment of options.num_cpus that was here).
shader_idx = options.num_cpus

# The command processor will be whatever is after the shader is accounted for
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

# Resolve the requested CPU model; the GPU model only supports timing
# memory mode (checked above for the CPU type, re-checked here).
CpuClass, mem_mode = Simulation.getCPUClass(options.cpu_type)
if CpuClass == AtomicSimpleCPU:
    fatal("AtomicSimpleCPU is not supported")
if mem_mode != 'timing':
    fatal("Only the timing memory mode is supported")
shader.timing = True

# The two fast-forward mechanisms are mutually exclusive.
if options.fast_forward and options.fast_forward_pseudo_op:
    fatal("Cannot fast-forward based both on the number of instructions and"
          " on pseudo-ops")
fast_forward = options.fast_forward or options.fast_forward_pseudo_op
360
# When fast-forwarding, the warm-up phase runs on KVM CPUs; the configured
# CPU model is preserved as FutureCpuClass for the post-switch phase.
if fast_forward:
    FutureCpuClass, future_mem_mode = CpuClass, mem_mode

    CpuClass = X86KvmCPU
    mem_mode = 'atomic_noncaching'
    # Leave shader.timing untouched, because its value only matters at the
    # start of the simulation and because we require switching cpus
    # *before* the first kernel launch.

    future_cpu_list = []

    # Initial CPUs to be used during fast-forwarding.
    for i in range(options.num_cpus):
        cpu = CpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
        cpu_list.append(cpu)

        # Instruction-count fast-forward: stop the KVM phase after this
        # many instructions on any thread.
        if options.fast_forward:
            cpu.max_insts_any_thread = int(options.fast_forward)

if fast_forward:
    MainCpuClass = FutureCpuClass
else:
    MainCpuClass = CpuClass

# CPs to be used throughout the simulation.
for i in range(options.num_cp):
    cp = MainCpuClass(cpu_id = options.num_cpus + i,
                      clk_domain = SrcClockDomain(
                          clock = options.CPUClock,
                          voltage_domain = VoltageDomain(
                              voltage = options.cpu_voltage)))
    cp_list.append(cp)

# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified).
for i in range(options.num_cpus):
    cpu = MainCpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
    if fast_forward:
        # Main CPUs start switched out and take over after the switch.
        cpu.switched_out = True
        future_cpu_list.append(cpu)
    else:
        cpu_list.append(cpu)

# First CPU hosts the workload whose address space the GPU shares.
host_cpu = cpu_list[0]
412
# Doorbell mapping for the HSA packet processor: virtual address and size
# of the mapping, backed by a physical address placed just past the
# configured memory size.
hsapp_gpu_map_vaddr = 0x200000000
hsapp_gpu_map_size = 0x1000
hsapp_gpu_map_paddr = int(Addr(options.mem_size))

# HSA kernel mode driver (exposed to the workload as device file "kfd")
gpu_driver = GPUComputeDriver(filename="kfd")

# Creating the GPU kernel launching components: that is the HSA
# packet processor (HSAPP), GPU command processor (CP), and the
# dispatcher.
gpu_hsapp = HSAPacketProcessor(pioAddr=hsapp_gpu_map_paddr,
                               numHWQueues=options.num_hw_queues)
dispatcher = GPUDispatcher()
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp,
                                   dispatcher=dispatcher)
gpu_driver.device = gpu_cmd_proc
shader.dispatcher = dispatcher
shader.gpu_cmd_proc = gpu_cmd_proc
431
# Create and assign the workload.  Check for rel_path in elements of
# base_list using test, returning the first full path that satisfies test
def find_path(base_list, rel_path, test):
    """Return the first <base>/<rel_path> for which test() holds.

    Falsy bases (e.g. None from an unset environment variable) are
    skipped; if nothing matches, the run is aborted via fatal().
    """
    candidates = (os.path.join(base, rel_path)
                  for base in base_list if base)
    for full_path in candidates:
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))
443
def find_file(base_list, rel_path):
    """Locate rel_path under base_list as a regular file."""
    # Delegate to find_path with an "is a regular file" predicate.
    predicate = os.path.isfile
    return find_path(base_list, rel_path, predicate)
446
# Resolve the benchmark executable along the search path.
executable = find_path(benchmark_path, options.cmd, os.path.exists)
# It's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
    benchmark_path = [executable]
    executable = find_file(benchmark_path, options.cmd)

# Environment for the simulated process: either read one KEY=VALUE pair
# per line from --env, or fall back to a canned ROCm 1.6 environment.
if options.env:
    with open(options.env, 'r') as f:
        env = [line.rstrip() for line in f]
else:
    env = ['LD_LIBRARY_PATH=%s' % ':'.join([
               "/proj/radl_tools/rocm-1.6/lib",
               "/proj/radl_tools/rocm-1.6/hcc/lib64",
               "/tool/pandora64/.package/libunwind-1.1/lib",
               "/tool/pandora64/.package/gcc-6.4.0/lib64"
           ]),
           "HSA_ENABLE_INTERRUPT=0"]

# The simulated process, with the emulated GPU driver attached.
process = Process(executable = executable, cmd = [options.cmd]
                  + options.options.split(), drivers = [gpu_driver], env = env)

# All CPUs share the same workload (threads of one process).
for cpu in cpu_list:
    cpu.createThreads()
    cpu.workload = process

for cp in cp_list:
    cp.workload = host_cpu.workload

# The post-switch CPUs mirror the workloads of their KVM counterparts.
if fast_forward:
    for i in range(len(future_cpu_list)):
        future_cpu_list[i].workload = cpu_list[i].workload
        future_cpu_list[i].createThreads()
480
########################## Create the overall system ########################
# List of CPUs that must be switched when moving between KVM and simulation
if fast_forward:
    switch_cpu_list = \
        [(cpu_list[i], future_cpu_list[i]) for i in range(options.num_cpus)]

# Full list of processing cores in the system.
cpu_list = cpu_list + [shader] + cp_list

# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = mem_mode)
if fast_forward:
    system.future_cpu = future_cpu_list
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)

# Fast-forwarding needs KVM support compiled into this gem5 build and an
# x86 target; the workload then uses architectural page tables in SE mode.
if fast_forward:
    have_kvm_support = 'BaseKvmCPU' in globals()
    if have_kvm_support and buildEnv['TARGET_ISA'] == "x86":
        system.vm = KvmVM()
        for i in range(len(host_cpu.workload)):
            host_cpu.workload[i].useArchPT = True
            host_cpu.workload[i].kvmInSE = True
    else:
        fatal("KvmCPU can only be used in SE mode with x86")
511
# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)

# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
dma_list = [gpu_hsapp, gpu_cmd_proc]
Ruby.create_system(options, None, system, None, dma_list, None)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                        voltage_domain = system.voltage_domain)
gpu_cmd_proc.pio = system.piobus.master
gpu_hsapp.pio = system.piobus.master

# Ruby created one DMA controller per dma_list entry, named
# system.dma_cntrl0, dma_cntrl1, ... in order.  Look each one up with
# getattr instead of exec()-ing a generated statement.
for i in range(len(dma_list)):
    getattr(system, 'dma_cntrl%d' % i).clk_domain = system.ruby.clk_domain

# attach the CPU ports to Ruby
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]

    # Create interrupt controller
    system.cpu[i].createInterruptController()

    # Connect cache port's to ruby
    system.cpu[i].icache_port = ruby_port.slave
    system.cpu[i].dcache_port = ruby_port.slave

    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        system.cpu[i].interrupts[0].pio = system.piobus.master
        system.cpu[i].interrupts[0].int_master = system.piobus.slave
        system.cpu[i].interrupts[0].int_slave = system.piobus.master
        if fast_forward:
            system.cpu[i].itb.walker.port = ruby_port.slave
            system.cpu[i].dtb.walker.port = ruby_port.slave

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc \
               - options.num_scalar_cache
gpu_port_idx = gpu_port_idx - options.num_cp * 2
559
wavefront_size = options.wf_size
for i in range(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in range(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
            system.ruby._cpu_ports[gpu_port_idx].slave[j]
    gpu_port_idx += 1

# One SQC sequencer is shared by every --cu-per-sqc compute units.
for i in range(n_cu):
    if i > 0 and not i % options.cu_per_sqc:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx += 1

# One scalar-cache sequencer is shared by every --cu-per-scalar-cache CUs.
# BUGFIX: this loop used Python-2-only xrange(), a NameError on Python 3;
# range() matches the sibling loops above.
for i in range(n_cu):
    if i > 0 and not i % options.cu_per_scalar_cache:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].scalar_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx += 1
584
# attach CP ports to Ruby
for i in range(options.num_cp):
    system.cpu[cp_idx].createInterruptController()
    # Each CP consumes two sequencer ports: dcache first, then icache.
    system.cpu[cp_idx].dcache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    system.cpu[cp_idx].icache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
    system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
    system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1

################# Connect the CPU and GPU via GPU Dispatcher ##################
# CPU rings the GPU doorbell to notify a pending task
# using this interface.
# And GPU uses this interface to notify the CPU of task completion
# The communication happens through emulated driver.

# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
if fast_forward:
    shader.cpu_pointer = future_cpu_list[0]
else:
    shader.cpu_pointer = host_cpu
609
########################## Start simulation ########################

# Redirect host pseudo-filesystems into the output directory so the
# simulated process cannot touch the real /proc, /sys, /tmp or /dev/shm.
chroot = os.path.expanduser(options.chroot)
redirect_paths = [RedirectPath(src = "/proc",
                               dests = ["%s/fs/proc" % m5.options.outdir]),
                  RedirectPath(src = "/sys",
                               dests = ["%s/fs/sys" % m5.options.outdir]),
                  RedirectPath(src = "/tmp",
                               dests = ["%s/fs/tmp" % m5.options.outdir]),
                  RedirectPath(src = "/dev/shm",
                               dests = ["/dev/shm/%s/gem5_%s" %
                                        (getpass.getuser(), os.getpid())])]

system.redirect_paths = redirect_paths

root = Root(system=system, full_system=False)

# Emit the HSA topology files the ROCm runtime expects to discover.
hsaTopology.createHsaTopology(options)

m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
    maxtick = options.abs_max_tick
else:
    maxtick = m5.MaxTick

# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)

# Checkpointing is not supported by APU model ('is not None' instead of
# the former '!= None' identity-unsafe comparison)
if (options.checkpoint_dir is not None or
    options.checkpoint_restore is not None):
    fatal("Checkpointing not supported by apu model")

checkpoint_dir = None
m5.instantiate(checkpoint_dir)

# Map workload to this address space (doorbell region, see
# hsapp_gpu_map_vaddr above)
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)

if options.fast_forward:
    print("Switch at instruction count: %d" % cpu_list[0].max_insts_any_thread)

exit_event = m5.simulate(maxtick)

if options.fast_forward:
    # Instruction-count fast-forward: one switch from KVM to the main CPUs.
    if exit_event.getCause() == "a thread reached the max instruction count":
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        exit_event = m5.simulate(maxtick - m5.curTick())
elif options.fast_forward_pseudo_op:
    # Pseudo-op fast-forward: every m5_switchcpu toggles between the two
    # CPU sets until the workload stops emitting switchcpu exits.
    while exit_event.getCause() == "switchcpu":
        # If we are switching *to* kvm, then the current stats are meaningful
        # Note that we don't do any warmup by default
        if type(switch_cpu_list[0][0]) is FutureCpuClass:
            print("Dumping stats...")
            m5.stats.dump()
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        # This lets us switch back and forth without keeping a counter
        switch_cpu_list = [(x[1], x[0]) for x in switch_cpu_list]
        exit_event = m5.simulate(maxtick - m5.curTick())

print("Ticks:", m5.curTick())
print('Exiting because ', exit_event.getCause())

FileSystemConfig.cleanup_filesystem(options)

# 'sys' is provided by the import block at the top of this file.
sys.exit(exit_event.getCode())