dev-hsa: enable interruptible hsa signal support
[gem5.git] / configs / example / apu_se.py
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import optparse, os, re, getpass
import math
import glob
import inspect
import sys    # needed for sys.exit() at the end of the script

import m5
from m5.objects import *
from m5.util import addToPath

addToPath('../')

from ruby import Ruby

from common import Options
from common import Simulation
from common import GPUTLBOptions, GPUTLBConfig

import hsaTopology
from common import FileSystemConfig
########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # set the value
    setattr(parser.values, opt.dest, value)

def getOption(parser, opt_str):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # get the value
    return getattr(parser.values, opt.dest)

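# These helpers read or override parsed option values by flag name; e.g.,
# setOption(parser, "--access-backing-store") is used later in this script
# to force on the Ruby backing store.
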
# Adding script options
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)

parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="CPU-only mode. Used to work around problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("-u", "--num-compute-units", type="int", default=4,
                  help="number of GPU compute units")
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")

# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="number of CUs sharing an SQC (icache, and thus "
                       "icache TLB)")
parser.add_option('--cu-per-scalar-cache', type='int', default=4,
                  help='Number of CUs sharing a scalar cache')
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
parser.add_option('--cu-per-sa', type='int', default=4,
                  help='Number of CUs per shader array. This must be a '
                       'multiple of options.cu-per-sqc and '
                       'options.cu-per-scalar-cache')
parser.add_option('--sa-per-complex', type='int', default=1,
                  help='Number of shader arrays per complex')
parser.add_option('--num-gpu-complexes', type='int', default=1,
                  help='Number of GPU complexes')
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size (in work items)")
parser.add_option("--sp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Double Precision ops")
# issue period per SIMD unit: number of cycles before issuing another vector
parser.add_option("--issue-period", type="int", default=4,
                  help="Number of cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32,
                  help="VGPR to Coalescer (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32,
                  help="Coalescer to VGPR (Global Memory) data bus width "
                       "in bytes")
# Currently we only support 1 local memory pipe
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Shared Memory pipelines per CU")
# Currently we only support 1 global memory pipe
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10,
                  help="Number of WF slots per SIMD")

parser.add_option("--registerManagerPolicy", type="string", default="static",
                  help="Register manager policy")
parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--vreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the VRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--sreg-file-size", type="int", default=2048,
                  help="number of physical scalar registers per SIMD")
parser.add_option("--sreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the SRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scaling factor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--gpu-clock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""CPU voltage domain""")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""GPU voltage domain""")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int",
                  help="prefetch depth for TLBs")
parser.add_option("--pf-type", type="string",
                  help="type of prefetch: PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")
parser.add_option("--lds-size", type="int", default=65536,
                  help="Size of the LDS in bytes")
parser.add_option('--fast-forward-pseudo-op', action='store_true',
                  help='fast forward using kvm until the m5_switchcpu'
                       ' pseudo-op is encountered, then switch cpus. subsequent'
                       ' m5_switchcpu pseudo-ops will toggle back and forth')
parser.add_option("--num-hw-queues", type="int", default=10,
                  help="number of hw queues in packet processor")
parser.add_option("--reg-alloc-policy", type="string", default="simple",
                  help="register allocation policy (simple/dynamic)")

Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']

########################## Sanity Check ########################

# Currently the GPU model requires Ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the GPU model requires a timing or detailed CPU
if not (options.cpu_type == "TimingSimpleCPU" or
        options.cpu_type == "DerivO3CPU"):
    fatal("GPU model requires TimingSimpleCPU or DerivO3CPU")

# This file can support multiple compute units
assert(options.num_compute_units >= 1)

# Currently, the SQC (the GPU I-cache) is shared by multiple compute units
# (CUs). The protocol works just fine even if the SQC is not shared.
# Overriding this option here so that the user need not set it explicitly
# (assuming a shared SQC is the common usage).
n_cu = options.num_compute_units
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby
num_scalar_cache = int(math.ceil(float(n_cu) / options.cu_per_scalar_cache))
options.num_scalar_cache = num_scalar_cache

print('Num SQC = ', num_sqc, 'Num scalar caches = ', num_scalar_cache,
      'Num CU = ', n_cu)
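# e.g., with the defaults (4 CUs, 4 CUs per SQC, 4 CUs per scalar cache):
# num_sqc = ceil(4 / 4) = 1 and num_scalar_cache = ceil(4 / 4) = 1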

########################## Creating the GPU system ########################
# shader is the GPU
shader = Shader(n_wf = options.wfs_per_simd,
                clk_domain = SrcClockDomain(
                    clock = options.gpu_clock,
                    voltage_domain = VoltageDomain(
                        voltage = options.gpu_voltage)))

# The VIPER GPU protocol implements release consistency on the GPU side, so
# GPU writes must be made visible to global memory, and reads must come from
# global memory, at kernel boundaries. The pipeline initiates (or does not
# initiate) an acquire/release operation depending on the
# impl_kern_launch_acq and impl_kern_end_rel flags: a value of True means the
# pipeline initiates the acquire/release at kernel launch/end. The VIPER
# protocol is write-through based, and thus only impl_kern_launch_acq needs
# to be set.
if (buildEnv['PROTOCOL'] == 'GPU_VIPER'):
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = False
else:
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = True

# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
    per_lane = True

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in range(n_cu):
    compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
                                     num_SIMDs = options.simds_per_cu,
                                     wf_size = options.wf_size,
                                     spbypass_pipe_length =
                                         options.sp_bypass_path_length,
                                     dpbypass_pipe_length =
                                         options.dp_bypass_path_length,
                                     issue_period = options.issue_period,
                                     coalescer_to_vrf_bus_width =
                                         options.glbmem_rd_bus_width,
                                     vrf_to_coalescer_bus_width =
                                         options.glbmem_wr_bus_width,
                                     num_global_mem_pipes =
                                         options.glb_mem_pipes_per_cu,
                                     num_shared_mem_pipes =
                                         options.shr_mem_pipes_per_cu,
                                     n_wf = options.wfs_per_simd,
                                     execPolicy = options.CUExecPolicy,
                                     debugSegFault = options.SegFaultDebug,
                                     functionalTLB = options.FunctionalTLB,
                                     localMemBarrier =
                                         options.LocalMemBarrier,
                                     countPages = options.countPages,
                                     localDataStore = LdsState(
                                         banks = options.numLdsBanks,
                                         bankConflictPenalty =
                                             options.ldsBankConflictPenalty,
                                         size = options.lds_size)))
    wavefronts = []
    vrfs = []
    vrf_pool_mgrs = []
    srfs = []
    srf_pool_mgrs = []
    for j in range(options.simds_per_cu):
        for k in range(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                        wf_size = options.wf_size))

        if options.reg_alloc_policy == "simple":
            vrf_pool_mgrs.append(
                SimplePoolManager(pool_size = options.vreg_file_size,
                                  min_alloc = options.vreg_min_alloc))
            srf_pool_mgrs.append(
                SimplePoolManager(pool_size = options.sreg_file_size,
                                  min_alloc = options.sreg_min_alloc))
        elif options.reg_alloc_policy == "dynamic":
            vrf_pool_mgrs.append(
                DynPoolManager(pool_size = options.vreg_file_size,
                               min_alloc = options.vreg_min_alloc))
            srf_pool_mgrs.append(
                DynPoolManager(pool_size = options.sreg_file_size,
                               min_alloc = options.sreg_min_alloc))

        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.vreg_file_size))
        srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.sreg_file_size))

    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    compute_units[-1].scalar_register_file = srfs
    compute_units[-1].register_manager = \
        RegisterManager(policy=options.registerManagerPolicy,
                        vrf_pool_managers=vrf_pool_mgrs,
                        srf_pool_managers=srf_pool_mgrs)
    if options.TLB_prefetch:
        compute_units[-1].prefetch_depth = options.TLB_prefetch
        compute_units[-1].prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
    compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort

# Attach compute units to GPU
shader.CUs = compute_units
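# With the default options, the GPU comprises 4 CUs, each with 4 SIMD units,
# 10 wavefront slots per SIMD (40 Wavefront objects per CU), and one vector
# and one scalar register file (plus pool manager) per SIMD.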

########################## Creating the CPU system ########################
# The shader core will be whatever is after the CPU cores are accounted for
shader_idx = options.num_cpus

# The command processor will be whatever is after the shader is accounted for
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

CpuClass, mem_mode = Simulation.getCPUClass(options.cpu_type)
if CpuClass == AtomicSimpleCPU:
    fatal("AtomicSimpleCPU is not supported")
if mem_mode != 'timing':
    fatal("Only the timing memory mode is supported")
shader.timing = True

if options.fast_forward and options.fast_forward_pseudo_op:
    fatal("Cannot fast-forward based both on the number of instructions and"
          " on pseudo-ops")
fast_forward = options.fast_forward or options.fast_forward_pseudo_op

if fast_forward:
    FutureCpuClass, future_mem_mode = CpuClass, mem_mode

    CpuClass = X86KvmCPU
    mem_mode = 'atomic_noncaching'
    # Leave shader.timing untouched, because its value only matters at the
    # start of the simulation and because we require switching cpus
    # *before* the first kernel launch.

    future_cpu_list = []

    # Initial CPUs to be used during fast-forwarding.
    for i in range(options.num_cpus):
        cpu = CpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
        cpu_list.append(cpu)

        if options.fast_forward:
            cpu.max_insts_any_thread = int(options.fast_forward)

if fast_forward:
    MainCpuClass = FutureCpuClass
else:
    MainCpuClass = CpuClass

# CPs to be used throughout the simulation.
for i in range(options.num_cp):
    cp = MainCpuClass(cpu_id = options.num_cpus + i,
                      clk_domain = SrcClockDomain(
                          clock = options.CPUClock,
                          voltage_domain = VoltageDomain(
                              voltage = options.cpu_voltage)))
    cp_list.append(cp)

# Main CPUs (to be used after fast-forwarding if fast-forwarding is
# specified).
for i in range(options.num_cpus):
    cpu = MainCpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
    if fast_forward:
        cpu.switched_out = True
        future_cpu_list.append(cpu)
    else:
        cpu_list.append(cpu)

host_cpu = cpu_list[0]

hsapp_gpu_map_vaddr = 0x200000000
hsapp_gpu_map_size = 0x1000
hsapp_gpu_map_paddr = int(Addr(options.mem_size))
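# The HSAPP PIO aperture sits immediately above the top of physical memory
# (its paddr equals mem_size); hsapp_gpu_map_vaddr/size describe where the
# emulated driver is expected to map it in the process' address space.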

# HSA kernel mode driver
gpu_driver = GPUComputeDriver(filename="kfd")

# Creating the GPU kernel launching components: that is the HSA
# packet processor (HSAPP), GPU command processor (CP), and the
# dispatcher.
gpu_hsapp = HSAPacketProcessor(pioAddr=hsapp_gpu_map_paddr,
                               numHWQueues=options.num_hw_queues)
dispatcher = GPUDispatcher()
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp,
                                   dispatcher=dispatcher)
gpu_driver.device = gpu_cmd_proc
shader.dispatcher = dispatcher
shader.gpu_cmd_proc = gpu_cmd_proc

# Create and assign the workload. Check for rel_path in elements of
# base_list using test, returning the first full path that satisfies test.
def find_path(base_list, rel_path, test):
    for base in base_list:
        if not base:
            # base could be None if environment var not set
            continue
        full_path = os.path.join(base, rel_path)
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))

def find_file(base_list, rel_path):
    return find_path(base_list, rel_path, os.path.isfile)

executable = find_path(benchmark_path, options.cmd, os.path.exists)
# It's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
    benchmark_path = [executable]
    executable = find_file(benchmark_path, options.cmd)

if options.env:
    with open(options.env, 'r') as f:
        env = [line.rstrip() for line in f]
else:
    env = ['LD_LIBRARY_PATH=%s' % ':'.join([
               os.getenv('ROCM_PATH','/opt/rocm')+'/lib',
               os.getenv('HCC_HOME','/opt/rocm/hcc')+'/lib',
               os.getenv('HSA_PATH','/opt/rocm/hsa')+'/lib',
               os.getenv('HIP_PATH','/opt/rocm/hip')+'/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/libhsakmt/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/miopen/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/miopengemm/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/hipblas/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/rocblas/lib',
               "/usr/lib/x86_64-linux-gnu"
           ]),
           'HOME=%s' % os.getenv('HOME','/'),
           "HSA_ENABLE_INTERRUPT=1"]

process = Process(executable = executable, cmd = [options.cmd]
                  + options.options.split(), drivers = [gpu_driver], env = env)

for cpu in cpu_list:
    cpu.createThreads()
    cpu.workload = process

for cp in cp_list:
    cp.workload = host_cpu.workload

if fast_forward:
    for i in range(len(future_cpu_list)):
        future_cpu_list[i].workload = cpu_list[i].workload
        future_cpu_list[i].createThreads()

########################## Create the overall system ########################
# List of CPUs that must be switched when moving between KVM and simulation
if fast_forward:
    switch_cpu_list = \
        [(cpu_list[i], future_cpu_list[i]) for i in range(options.num_cpus)]

# Full list of processing cores in the system.
cpu_list = cpu_list + [shader] + cp_list

# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = mem_mode,
                workload = SEWorkload.init_compatible(executable))
if fast_forward:
    system.future_cpu = future_cpu_list
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)

if fast_forward:
    have_kvm_support = 'BaseKvmCPU' in globals()
    if have_kvm_support and buildEnv['TARGET_ISA'] == "x86":
        system.vm = KvmVM()
        for i in range(len(host_cpu.workload)):
            host_cpu.workload[i].useArchPT = True
            host_cpu.workload[i].kvmInSE = True
    else:
        fatal("KvmCPU can only be used in SE mode with x86")

# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)

# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
dma_list = [gpu_hsapp, gpu_cmd_proc]
Ruby.create_system(options, None, system, None, dma_list, None)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                        voltage_domain = system.voltage_domain)
gpu_cmd_proc.pio = system.piobus.master
gpu_hsapp.pio = system.piobus.master

for i, dma_device in enumerate(dma_list):
    getattr(system, 'dma_cntrl%d' % i).clk_domain = system.ruby.clk_domain

# attach the CPU ports to Ruby
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]

    # Create interrupt controller
    system.cpu[i].createInterruptController()

    # Connect cache ports to Ruby
    system.cpu[i].icache_port = ruby_port.slave
    system.cpu[i].dcache_port = ruby_port.slave

    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        system.cpu[i].interrupts[0].pio = system.piobus.master
        system.cpu[i].interrupts[0].int_master = system.piobus.slave
        system.cpu[i].interrupts[0].int_slave = system.piobus.master
        if fast_forward:
            system.cpu[i].mmu.connectWalkerPorts(
                ruby_port.slave, ruby_port.slave)

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc \
               - options.num_scalar_cache
gpu_port_idx = gpu_port_idx - options.num_cp * 2
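# Example with the defaults (4 CUs, 1 SQC, 1 scalar cache, no CPs): the GPU
# sequencers are the last 4 + 1 + 1 = 6 entries of _cpu_ports, so
# gpu_port_idx now points at the first CU's sequencer.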

# Connect token ports. For this we need to search through the list of all
# sequencers, since the TCP coalescers will not necessarily be first. Only
# TCP coalescers use a token port for back pressure.
token_port_idx = 0
for i in range(len(system.ruby._cpu_ports)):
    if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer):
        system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = \
            system.ruby._cpu_ports[i].gmTokenPort
        token_port_idx += 1

wavefront_size = options.wf_size
for i in range(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in range(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
            system.ruby._cpu_ports[gpu_port_idx].slave[j]
    gpu_port_idx += 1

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_sqc:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_scalar_cache:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].scalar_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

# attach CP ports to Ruby
for i in range(options.num_cp):
    system.cpu[cp_idx].createInterruptController()
    system.cpu[cp_idx].dcache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    system.cpu[cp_idx].icache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
    system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
    system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1
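# Each CP consumes two sequencer ports (one for its data cache, one for its
# instruction cache), which is why the CU port index computation above
# subtracts num_cp * 2.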

################# Connect the CPU and GPU via GPU Dispatcher ##################
# The CPU rings the GPU doorbell to notify it of a pending task using this
# interface, and the GPU uses the same interface to notify the CPU of task
# completion. The communication happens through the emulated driver.

# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
if fast_forward:
    shader.cpu_pointer = future_cpu_list[0]
else:
    shader.cpu_pointer = host_cpu

########################## Start simulation ########################

redirect_paths = [RedirectPath(app_path = "/proc",
                               host_paths =
                               ["%s/fs/proc" % m5.options.outdir]),
                  RedirectPath(app_path = "/sys",
                               host_paths =
                               ["%s/fs/sys" % m5.options.outdir]),
                  RedirectPath(app_path = "/tmp",
                               host_paths =
                               ["%s/fs/tmp" % m5.options.outdir])]

system.redirect_paths = redirect_paths

root = Root(system=system, full_system=False)

hsaTopology.createHsaTopology(options)

m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
    maxtick = options.abs_max_tick
else:
    maxtick = m5.MaxTick

# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)

# Checkpointing is not supported by the APU model
if (options.checkpoint_dir != None or
        options.checkpoint_restore != None):
    fatal("Checkpointing not supported by apu model")

checkpoint_dir = None
m5.instantiate(checkpoint_dir)

# Map workload to this address space
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)

if options.fast_forward:
    print("Switch at instruction count: %d" %
          cpu_list[0].max_insts_any_thread)

exit_event = m5.simulate(maxtick)

if options.fast_forward:
    if exit_event.getCause() == "a thread reached the max instruction count":
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        exit_event = m5.simulate(maxtick - m5.curTick())
elif options.fast_forward_pseudo_op:
    while exit_event.getCause() == "switchcpu":
        # If we are switching *to* kvm, then the current stats are meaningful
        # Note that we don't do any warmup by default
        if type(switch_cpu_list[0][0]) == FutureCpuClass:
            print("Dumping stats...")
            m5.stats.dump()
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        # This lets us switch back and forth without keeping a counter
        switch_cpu_list = [(x[1], x[0]) for x in switch_cpu_list]
        exit_event = m5.simulate(maxtick - m5.curTick())

print("Ticks:", m5.curTick())
print('Exiting because ', exit_event.getCause())

sys.exit(exit_event.getCode())