# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from __future__ import print_function
from __future__ import absolute_import

import optparse, os, re, getpass, sys
import math
import glob
import inspect

import m5
from m5.objects import *
from m5.util import addToPath

addToPath('../')

from ruby import Ruby

from common import Options
from common import Simulation
from common import GPUTLBOptions, GPUTLBConfig

import hsaTopology
from common import FileSystemConfig
########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # set the value; setattr works under both Python 2 and 3, whereas the
    # old exec-based assignment did not
    setattr(parser.values, opt.dest, value)

def getOption(parser, opt_str):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # get the value; exec cannot bind a local that outlives it in Python 3,
    # so read the attribute directly
    return getattr(parser.values, opt.dest)
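
# Example usage of these helpers (option names taken from this script):
#   setOption(parser, "--access-backing-store")   # force a flag on
#   depth = getOption(parser, "--TLB-prefetch")   # read back a parsed value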

# Adding script options
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)

parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="CPU-only mode. Used to work around problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("-u", "--num-compute-units", type="int", default=4,
                  help="number of GPU compute units")
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")
# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="number of CUs sharing an SQC (icache, and thus "
                       "icache TLB)")
parser.add_option('--cu-per-scalar-cache', type='int', default=4,
                  help='Number of CUs sharing a scalar cache')
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
parser.add_option('--cu-per-sa', type='int', default=4,
                  help='Number of CUs per shader array. This must be a '
                       'multiple of options.cu-per-sqc and '
                       'options.cu-per-scalar-cache')
parser.add_option('--sa-per-complex', type='int', default=1,
                  help='Number of shader arrays per complex')
parser.add_option('--num-gpu-complexes', type='int', default=1,
                  help='Number of GPU complexes')
104 parser.add_option("--wf-size", type="int", default=64,
105 help="Wavefront size(in workitems)")
106 parser.add_option("--sp-bypass-path-length", type="int", default=4, \
107 help="Number of stages of bypass path in vector ALU for "
108 "Single Precision ops")
109 parser.add_option("--dp-bypass-path-length", type="int", default=4, \
110 help="Number of stages of bypass path in vector ALU for "
111 "Double Precision ops")
112 # issue period per SIMD unit: number of cycles before issuing another vector
113 parser.add_option("--issue-period", type="int", default=4, \
114 help="Number of cycles per vector instruction issue period")
115 parser.add_option("--glbmem-wr-bus-width", type="int", default=32, \
116 help="VGPR to Coalescer (Global Memory) data bus width "
117 "in bytes")
118 parser.add_option("--glbmem-rd-bus-width", type="int", default=32, \
119 help="Coalescer to VGPR (Global Memory) data bus width in "
120 "bytes")
121 # Currently we only support 1 local memory pipe
122 parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1, \
123 help="Number of Shared Memory pipelines per CU")
124 # Currently we only support 1 global memory pipe
125 parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1, \
126 help="Number of Global Memory pipelines per CU")
127 parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \
128 "WF slots per SIMD")

parser.add_option("--registerManagerPolicy", type="string", default="static",
                  help="Register manager policy")
parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--vreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the VRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--sreg-file-size", type="int", default=2048,
                  help="number of physical scalar registers per SIMD")
parser.add_option("--sreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the SRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scaling factor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--gpu-clock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""CPU voltage domain""")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""GPU voltage domain""")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int",
                  help="prefetch depth for TLBs")
parser.add_option("--pf-type", type="string",
                  help="type of prefetch: PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")
parser.add_option('--fast-forward-pseudo-op', action='store_true',
                  help='fast forward using kvm until the m5_switchcpu'
                       ' pseudo-op is encountered, then switch cpus. subsequent'
                       ' m5_switchcpu pseudo-ops will toggle back and forth')
parser.add_option("--num-hw-queues", type="int", default=10,
                  help="number of hw queues in packet processor")

Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']

########################## Sanity Check ########################

# Currently the GPU model requires Ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the GPU model requires either the timing or detailed CPU
if not (options.cpu_type == "TimingSimpleCPU" or
        options.cpu_type == "DerivO3CPU"):
    fatal("GPU model requires TimingSimpleCPU or DerivO3CPU")

# This file can support multiple compute units
assert(options.num_compute_units >= 1)

# Currently, the sqc (I-Cache of GPU) is shared by multiple
# compute units (CUs). The protocol works just fine even if
# sqc is not shared. Overriding this option here so that the
# user need not set it explicitly (assuming a shared sqc is
# the common usage).
n_cu = options.num_compute_units
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby
num_scalar_cache = int(math.ceil(float(n_cu) / options.cu_per_scalar_cache))
options.num_scalar_cache = num_scalar_cache

print('Num SQC = ', num_sqc, 'Num scalar caches = ', num_scalar_cache,
      'Num CU = ', n_cu)
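
# For example (defaults assumed): 4 CUs with cu_per_sqc = 4 and
# cu_per_scalar_cache = 4 give ceil(4/4) = 1 SQC and 1 scalar cache;
# 5 CUs would give ceil(5/4) = 2 of each.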

########################## Creating the GPU system ########################
# shader is the GPU
shader = Shader(n_wf = options.wfs_per_simd,
                clk_domain = SrcClockDomain(
                    clock = options.gpu_clock,
                    voltage_domain = VoltageDomain(
                        voltage = options.gpu_voltage)))

# GPU_RfO (Read For Ownership) implements the SC/TSO memory model.
# Other GPU protocols implement release consistency at the GPU side.
# So, all GPU protocols other than GPU_RfO should make their writes
# visible to the global memory and should read from global memory
# at kernel boundaries. The pipeline initiates (or does not initiate)
# the acquire/release operation depending on the impl_kern_launch_acq
# and impl_kern_end_rel flags. A flag set to true means the pipeline
# initiates an acquire/release operation at kernel launch/end.
# VIPER protocols (GPU_VIPER, GPU_VIPER_Region and GPU_VIPER_Baseline)
# are write-through based, and thus only impl_kern_launch_acq needs to
# be set.
if buildEnv['PROTOCOL'] == 'GPU_RfO':
    shader.impl_kern_launch_acq = False
    shader.impl_kern_end_rel = False
elif (buildEnv['PROTOCOL'] == 'GPU_VIPER' or
      buildEnv['PROTOCOL'] == 'GPU_VIPER_Region' or
      buildEnv['PROTOCOL'] == 'GPU_VIPER_Baseline'):
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = False
else:
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = True
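
# Summary of the resulting flag settings:
#   GPU_RfO:                    launch acquire = False, end release = False
#   GPU_VIPER* (write-through): launch acquire = True,  end release = False
#   all other protocols:        launch acquire = True,  end release = True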

# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
    per_lane = True

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in range(n_cu):
    compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
                                     num_SIMDs = options.simds_per_cu,
                                     wf_size = options.wf_size,
                                     spbypass_pipe_length = \
                                     options.sp_bypass_path_length,
                                     dpbypass_pipe_length = \
                                     options.dp_bypass_path_length,
                                     issue_period = options.issue_period,
                                     coalescer_to_vrf_bus_width = \
                                     options.glbmem_rd_bus_width,
                                     vrf_to_coalescer_bus_width = \
                                     options.glbmem_wr_bus_width,
                                     num_global_mem_pipes = \
                                     options.glb_mem_pipes_per_cu,
                                     num_shared_mem_pipes = \
                                     options.shr_mem_pipes_per_cu,
                                     n_wf = options.wfs_per_simd,
                                     execPolicy = options.CUExecPolicy,
                                     debugSegFault = options.SegFaultDebug,
                                     functionalTLB = options.FunctionalTLB,
                                     localMemBarrier = options.LocalMemBarrier,
                                     countPages = options.countPages,
                                     localDataStore = \
                                     LdsState(banks = options.numLdsBanks,
                                              bankConflictPenalty = \
                                              options.ldsBankConflictPenalty)))
    wavefronts = []
    vrfs = []
    vrf_pool_mgrs = []
    srfs = []
    srf_pool_mgrs = []
    for j in range(options.simds_per_cu):
        for k in range(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                        wf_size = options.wf_size))
        vrf_pool_mgrs.append(SimplePoolManager(pool_size = \
                                               options.vreg_file_size,
                                               min_alloc = \
                                               options.vreg_min_alloc))

        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.vreg_file_size))

        srf_pool_mgrs.append(SimplePoolManager(pool_size = \
                                               options.sreg_file_size,
                                               min_alloc = \
                                               options.sreg_min_alloc))
        srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.sreg_file_size))

    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    compute_units[-1].scalar_register_file = srfs
    compute_units[-1].register_manager = \
        RegisterManager(policy=options.registerManagerPolicy,
                        vrf_pool_managers=vrf_pool_mgrs,
                        srf_pool_managers=srf_pool_mgrs)
    if options.TLB_prefetch:
        compute_units[-1].prefetch_depth = options.TLB_prefetch
        compute_units[-1].prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
    compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort
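    # (The Bridge gives LDS accesses a buffered hop with its own latency
    # rather than a direct CU-to-LdsState connection.)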

# Attach compute units to GPU
shader.CUs = compute_units

########################## Creating the CPU system ########################
# The shader core's index comes right after the CPU cores
shader_idx = options.num_cpus

# The command processors' indices come right after the shader
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

CpuClass, mem_mode = Simulation.getCPUClass(options.cpu_type)
if CpuClass == AtomicSimpleCPU:
    fatal("AtomicSimpleCPU is not supported")
if mem_mode != 'timing':
    fatal("Only the timing memory mode is supported")
shader.timing = True

if options.fast_forward and options.fast_forward_pseudo_op:
    fatal("Cannot fast-forward based both on the number of instructions and"
          " on pseudo-ops")
fast_forward = options.fast_forward or options.fast_forward_pseudo_op

if fast_forward:
    FutureCpuClass, future_mem_mode = CpuClass, mem_mode

    CpuClass = X86KvmCPU
    mem_mode = 'atomic_noncaching'
    # Leave shader.timing untouched, because its value only matters at the
    # start of the simulation and because we require switching cpus
    # *before* the first kernel launch.

    future_cpu_list = []

    # Initial CPUs to be used during fast-forwarding.
    for i in range(options.num_cpus):
        cpu = CpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
        cpu_list.append(cpu)

        if options.fast_forward:
            cpu.max_insts_any_thread = int(options.fast_forward)

if fast_forward:
    MainCpuClass = FutureCpuClass
else:
    MainCpuClass = CpuClass

# CPs to be used throughout the simulation.
for i in range(options.num_cp):
    cp = MainCpuClass(cpu_id = options.num_cpus + i,
                      clk_domain = SrcClockDomain(
                          clock = options.CPUClock,
                          voltage_domain = VoltageDomain(
                              voltage = options.cpu_voltage)))
    cp_list.append(cp)

# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified).
for i in range(options.num_cpus):
    cpu = MainCpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
    if fast_forward:
        cpu.switched_out = True
        future_cpu_list.append(cpu)
    else:
        cpu_list.append(cpu)

host_cpu = cpu_list[0]

hsapp_gpu_map_vaddr = 0x200000000
hsapp_gpu_map_size = 0x1000
hsapp_gpu_map_paddr = int(Addr(options.mem_size))
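# Note: int(Addr(options.mem_size)) places the 4KB HSAPP PIO aperture at the
# first physical address past the end of system memory, so it cannot overlap
# DRAM.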

# HSA kernel mode driver
gpu_driver = GPUComputeDriver(filename="kfd")
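# The filename makes the emulated driver reachable as /dev/kfd, the device
# file the ROCm user-level runtime expects to open.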

# Creating the GPU kernel launching components: that is the HSA
# packet processor (HSAPP), GPU command processor (CP), and the
# dispatcher.
gpu_hsapp = HSAPacketProcessor(pioAddr=hsapp_gpu_map_paddr,
                               numHWQueues=options.num_hw_queues)
dispatcher = GPUDispatcher()
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp,
                                   dispatcher=dispatcher)
gpu_driver.device = gpu_cmd_proc
shader.dispatcher = dispatcher
shader.gpu_cmd_proc = gpu_cmd_proc

# Create and assign the workload. Check for rel_path in elements of
# base_list using test, returning the first full path that satisfies test.
def find_path(base_list, rel_path, test):
    for base in base_list:
        if not base:
            # base could be None if environment var not set
            continue
        full_path = os.path.join(base, rel_path)
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))

def find_file(base_list, rel_path):
    return find_path(base_list, rel_path, os.path.isfile)

executable = find_path(benchmark_path, options.cmd, os.path.exists)
# It's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
    benchmark_path = [executable]
    executable = find_file(benchmark_path, options.cmd)
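# For example (hypothetical paths): with --benchmark-root=/bench and
# --cmd=square, if /bench/square is a directory, the executable resolves
# to /bench/square/square.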

if options.env:
    with open(options.env, 'r') as f:
        env = [line.rstrip() for line in f]
else:
    env = ['LD_LIBRARY_PATH=%s' % ':'.join([
               os.getenv('ROCM_PATH','/opt/rocm')+'/lib',
               os.getenv('HCC_HOME','/opt/rocm/hcc')+'/lib',
               os.getenv('HSA_PATH','/opt/rocm/hsa')+'/lib',
               os.getenv('HIP_PATH','/opt/rocm/hip')+'/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/libhsakmt/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/miopen/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/miopengemm/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/hipblas/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/rocblas/lib',
               "/usr/lib/x86_64-linux-gnu"
           ]),
           'HOME=%s' % os.getenv('HOME','/'),
           "HSA_ENABLE_INTERRUPT=0"]

process = Process(executable = executable, cmd = [options.cmd]
                  + options.options.split(), drivers = [gpu_driver], env = env)

for cpu in cpu_list:
    cpu.createThreads()
    cpu.workload = process

for cp in cp_list:
    cp.workload = host_cpu.workload

if fast_forward:
    for i in range(len(future_cpu_list)):
        future_cpu_list[i].workload = cpu_list[i].workload
        future_cpu_list[i].createThreads()

########################## Create the overall system ########################
# List of CPUs that must be switched when moving between KVM and simulation
if fast_forward:
    switch_cpu_list = \
        [(cpu_list[i], future_cpu_list[i]) for i in range(options.num_cpus)]

# Full list of processing cores in the system.
cpu_list = cpu_list + [shader] + cp_list

# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = mem_mode)
if fast_forward:
    system.future_cpu = future_cpu_list
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)

if fast_forward:
    have_kvm_support = 'BaseKvmCPU' in globals()
    if have_kvm_support and buildEnv['TARGET_ISA'] == "x86":
        system.vm = KvmVM()
        for i in range(len(host_cpu.workload)):
            host_cpu.workload[i].useArchPT = True
            host_cpu.workload[i].kvmInSE = True
    else:
        fatal("KvmCPU can only be used in SE mode with x86")

# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)

# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
dma_list = [gpu_hsapp, gpu_cmd_proc]
Ruby.create_system(options, None, system, None, dma_list, None)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                        voltage_domain = system.voltage_domain)
gpu_cmd_proc.pio = system.piobus.master
gpu_hsapp.pio = system.piobus.master

for i, dma_device in enumerate(dma_list):
    getattr(system, 'dma_cntrl%d' % i).clk_domain = system.ruby.clk_domain

# attach the CPU ports to Ruby
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]

    # Create interrupt controller
    system.cpu[i].createInterruptController()

    # Connect cache ports to Ruby
    system.cpu[i].icache_port = ruby_port.slave
    system.cpu[i].dcache_port = ruby_port.slave

    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        system.cpu[i].interrupts[0].pio = system.piobus.master
        system.cpu[i].interrupts[0].int_master = system.piobus.slave
        system.cpu[i].interrupts[0].int_slave = system.piobus.master
        if fast_forward:
            system.cpu[i].itb.walker.port = ruby_port.slave
            system.cpu[i].dtb.walker.port = ruby_port.slave

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc \
               - options.num_scalar_cache
gpu_port_idx = gpu_port_idx - options.num_cp * 2
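# Worked example (assumed values): 1 CPU contributing 1 sequencer, 4 CUs,
# 1 SQC, 1 scalar cache, and 0 CPs give len(_cpu_ports) = 7, so
# gpu_port_idx = 7 - 4 - 1 - 1 - 0 = 1, i.e. the GPU ports start right
# after the CPU's port.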

wavefront_size = options.wf_size
for i in range(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in range(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
            system.ruby._cpu_ports[gpu_port_idx].slave[j]
    gpu_port_idx += 1
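# With the default --wf-size of 64, each CU therefore exposes 64 memory
# ports, one per lane.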

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_sqc:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_scalar_cache:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].scalar_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

# attach CP ports to Ruby
for i in range(options.num_cp):
    system.cpu[cp_idx].createInterruptController()
    system.cpu[cp_idx].dcache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    system.cpu[cp_idx].icache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
    system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
    system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1

################# Connect the CPU and GPU via GPU Dispatcher ##################
# The CPU rings the GPU doorbell to notify it of a pending task through
# this interface, and the GPU uses it to notify the CPU of task completion.
# The communication happens through the emulated driver.

# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
if fast_forward:
    shader.cpu_pointer = future_cpu_list[0]
else:
    shader.cpu_pointer = host_cpu

########################## Start simulation ########################

redirect_paths = [RedirectPath(app_path = "/proc",
                               host_paths =
                               ["%s/fs/proc" % m5.options.outdir]),
                  RedirectPath(app_path = "/sys",
                               host_paths =
                               ["%s/fs/sys" % m5.options.outdir]),
                  RedirectPath(app_path = "/tmp",
                               host_paths =
                               ["%s/fs/tmp" % m5.options.outdir])]

system.redirect_paths = redirect_paths

root = Root(system=system, full_system=False)

hsaTopology.createHsaTopology(options)
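# createHsaTopology populates the redirected /sys tree above with the kfd
# topology nodes the ROCm user-level stack reads to discover the GPU.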

m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
    maxtick = options.abs_max_tick
else:
    maxtick = m5.MaxTick

# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)

# Checkpointing is not supported by the APU model
if (options.checkpoint_dir != None or
    options.checkpoint_restore != None):
    fatal("Checkpointing not supported by apu model")

checkpoint_dir = None
m5.instantiate(checkpoint_dir)

# Map workload to this address space
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)

if options.fast_forward:
    print("Switch at instruction count: %d" % cpu_list[0].max_insts_any_thread)

exit_event = m5.simulate(maxtick)

if options.fast_forward:
    if exit_event.getCause() == "a thread reached the max instruction count":
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        exit_event = m5.simulate(maxtick - m5.curTick())
elif options.fast_forward_pseudo_op:
    while exit_event.getCause() == "switchcpu":
        # If we are switching *to* kvm, then the current stats are meaningful
        # Note that we don't do any warmup by default
        if type(switch_cpu_list[0][0]) == FutureCpuClass:
            print("Dumping stats...")
            m5.stats.dump()
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        # This lets us switch back and forth without keeping a counter
        switch_cpu_list = [(x[1], x[0]) for x in switch_cpu_list]
        exit_event = m5.simulate(maxtick - m5.curTick())

print("Ticks:", m5.curTick())
print('Exiting because ', exit_event.getCause())

sys.exit(exit_event.getCode())