mem: Make MemCtrl a ClockedObject
[gem5.git] / configs / example / apu_se.py
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from __future__ import print_function
from __future__ import absolute_import

import math
import optparse
import os
import sys

import m5
from m5.defines import buildEnv
from m5.objects import *
from m5.util import addToPath, fatal

addToPath('../')

from ruby import Ruby

from common import Options
from common import Simulation
from common import GPUTLBOptions, GPUTLBConfig

import hsaTopology
from common import FileSystemConfig

########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # set the value
    setattr(parser.values, opt.dest, value)

def getOption(parser, opt_str):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # get the value
    return getattr(parser.values, opt.dest)

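# A quick usage sketch of the helpers above (illustrative only; the real
# call appears after option parsing below):
#   setOption(parser, "--access-backing-store")  # same as passing the flag
#   wf_size = getOption(parser, "--wf-size")     # 64 unless overridden
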
# Adding script options
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)

parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="APU mode. Used to take care of problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("-u", "--num-compute-units", type="int", default=4,
                  help="number of GPU compute units")
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")

# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="number of CUs sharing an SQC (icache, and thus "
                       "icache TLB)")
parser.add_option('--cu-per-scalar-cache', type='int', default=4,
                  help='Number of CUs sharing a scalar cache')
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
parser.add_option('--cu-per-sa', type='int', default=4,
                  help='Number of CUs per shader array. This must be a '
                       'multiple of options.cu-per-sqc and '
                       'options.cu-per-scalar-cache')
parser.add_option('--sa-per-complex', type='int', default=1,
                  help='Number of shader arrays per complex')
parser.add_option('--num-gpu-complexes', type='int', default=1,
                  help='Number of GPU complexes')
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size (in workitems)")
parser.add_option("--sp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Double Precision ops")
# issue period per SIMD unit: number of cycles before issuing another vector
parser.add_option("--issue-period", type="int", default=4,
                  help="Number of cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32,
                  help="VGPR to Coalescer (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32,
                  help="Coalescer to VGPR (Global Memory) data bus width "
                       "in bytes")
# Currently we only support 1 local memory pipe
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Shared Memory pipelines per CU")
# Currently we only support 1 global memory pipe
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10,
                  help="Number of WF slots per SIMD")

parser.add_option("--registerManagerPolicy", type="string", default="static",
                  help="Register manager policy")
parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--vreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the VRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--sreg-file-size", type="int", default=2048,
                  help="number of physical scalar registers per SIMD")
parser.add_option("--sreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the SRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scaling factor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--gpu-clock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="CPU voltage domain")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="GPU voltage domain")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int",
                  help="prefetch depth for TLBs")
parser.add_option("--pf-type", type="string",
                  help="type of prefetch: PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")
parser.add_option("--lds-size", type="int", default=65536,
                  help="Size of the LDS in bytes")
parser.add_option('--fast-forward-pseudo-op', action='store_true',
                  help='fast forward using kvm until the m5_switchcpu'
                       ' pseudo-op is encountered, then switch cpus. '
                       'subsequent m5_switchcpu pseudo-ops will toggle back '
                       'and forth')
parser.add_option("--num-hw-queues", type="int", default=10,
                  help="number of hw queues in packet processor")

Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']

########################## Sanity Check ########################

# Currently the GPU model requires Ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the GPU model works only with the timing or detailed CPUs
if not (options.cpu_type == "TimingSimpleCPU" or
        options.cpu_type == "DerivO3CPU"):
    fatal("GPU model requires TimingSimpleCPU or DerivO3CPU")

# This file can support multiple compute units
assert(options.num_compute_units >= 1)

# Currently, the SQC (the GPU's I-cache) is shared by multiple compute
# units (CUs). The protocol works just fine even if the SQC is not
# shared. Overriding this option here so that the user need not set it
# explicitly (assuming a shared SQC is the common usage).
n_cu = options.num_compute_units
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby
num_scalar_cache = int(math.ceil(float(n_cu) / options.cu_per_scalar_cache))
options.num_scalar_cache = num_scalar_cache

print('Num SQC = ', num_sqc, 'Num scalar caches = ', num_scalar_cache,
      'Num CU = ', n_cu)
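
# Worked example with the defaults (for illustration): 4 CUs with
# --cu-per-sqc=4 and --cu-per-scalar-cache=4 yield ceil(4/4) = 1 SQC and
# 1 scalar cache, while 9 CUs would need ceil(9/4) = 3 of each.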

########################## Creating the GPU system ########################
# shader is the GPU
shader = Shader(n_wf = options.wfs_per_simd,
                clk_domain = SrcClockDomain(
                    clock = options.gpu_clock,
                    voltage_domain = VoltageDomain(
                        voltage = options.gpu_voltage)))

# GPU_RfO (Read For Ownership) implements an SC/TSO memory model.
# Other GPU protocols implement release consistency at the GPU side.
# So, all GPU protocols other than GPU_RfO should make their writes
# visible to the global memory and should read from global memory at
# kernel boundaries. The pipeline initiates (or does not initiate) the
# acquire/release operation depending on the impl_kern_launch_acq and
# impl_kern_end_rel flags; flag=true means the pipeline initiates an
# acquire/release operation at kernel launch/end.
# VIPER protocols (GPU_VIPER, GPU_VIPER_Region and GPU_VIPER_Baseline)
# are write-through based, and thus only impl_kern_launch_acq needs to
# be set.
if buildEnv['PROTOCOL'] == 'GPU_RfO':
    shader.impl_kern_launch_acq = False
    shader.impl_kern_end_rel = False
elif (buildEnv['PROTOCOL'] == 'GPU_VIPER' or
      buildEnv['PROTOCOL'] == 'GPU_VIPER_Region' or
      buildEnv['PROTOCOL'] == 'GPU_VIPER_Baseline'):
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = False
else:
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = True
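
# For reference, the flag settings chosen above:
#   protocol          impl_kern_launch_acq  impl_kern_end_rel
#   GPU_RfO           False                 False
#   GPU_VIPER*        True                  False
#   other protocols   True                  True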

# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
    per_lane = True

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in range(n_cu):
    compute_units.append(ComputeUnit(
        cu_id = i, perLaneTLB = per_lane,
        num_SIMDs = options.simds_per_cu,
        wf_size = options.wf_size,
        spbypass_pipe_length = options.sp_bypass_path_length,
        dpbypass_pipe_length = options.dp_bypass_path_length,
        issue_period = options.issue_period,
        coalescer_to_vrf_bus_width = options.glbmem_rd_bus_width,
        vrf_to_coalescer_bus_width = options.glbmem_wr_bus_width,
        num_global_mem_pipes = options.glb_mem_pipes_per_cu,
        num_shared_mem_pipes = options.shr_mem_pipes_per_cu,
        n_wf = options.wfs_per_simd,
        execPolicy = options.CUExecPolicy,
        debugSegFault = options.SegFaultDebug,
        functionalTLB = options.FunctionalTLB,
        localMemBarrier = options.LocalMemBarrier,
        countPages = options.countPages,
        localDataStore = LdsState(
            banks = options.numLdsBanks,
            bankConflictPenalty = options.ldsBankConflictPenalty,
            size = options.lds_size)))
    wavefronts = []
    vrfs = []
    vrf_pool_mgrs = []
    srfs = []
    srf_pool_mgrs = []
    for j in range(options.simds_per_cu):
        for k in range(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                        wf_size = options.wf_size))
        vrf_pool_mgrs.append(
            SimplePoolManager(pool_size = options.vreg_file_size,
                              min_alloc = options.vreg_min_alloc))

        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.vreg_file_size))

        srf_pool_mgrs.append(
            SimplePoolManager(pool_size = options.sreg_file_size,
                              min_alloc = options.sreg_min_alloc))
        srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.sreg_file_size))

    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    compute_units[-1].scalar_register_file = srfs
    compute_units[-1].register_manager = \
        RegisterManager(policy=options.registerManagerPolicy,
                        vrf_pool_managers=vrf_pool_mgrs,
                        srf_pool_managers=srf_pool_mgrs)
    if options.TLB_prefetch:
        compute_units[-1].prefetch_depth = options.TLB_prefetch
        compute_units[-1].prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
    compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort

# Attach compute units to GPU
shader.CUs = compute_units

########################## Creating the CPU system ########################
# The shader core will be whatever is after the CPU cores are accounted for
shader_idx = options.num_cpus

# The command processor will be whatever is after the shader is accounted for
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

CpuClass, mem_mode = Simulation.getCPUClass(options.cpu_type)
if CpuClass == AtomicSimpleCPU:
    fatal("AtomicSimpleCPU is not supported")
if mem_mode != 'timing':
    fatal("Only the timing memory mode is supported")
shader.timing = True

if options.fast_forward and options.fast_forward_pseudo_op:
    fatal("Cannot fast-forward based both on the number of instructions and"
          " on pseudo-ops")
fast_forward = options.fast_forward or options.fast_forward_pseudo_op

if fast_forward:
    FutureCpuClass, future_mem_mode = CpuClass, mem_mode

    CpuClass = X86KvmCPU
    mem_mode = 'atomic_noncaching'
    # Leave shader.timing untouched, because its value only matters at the
    # start of the simulation and because we require switching cpus
    # *before* the first kernel launch.

    future_cpu_list = []

    # Initial CPUs to be used during fast-forwarding.
    for i in range(options.num_cpus):
        cpu = CpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
        cpu_list.append(cpu)

        if options.fast_forward:
            cpu.max_insts_any_thread = int(options.fast_forward)

if fast_forward:
    MainCpuClass = FutureCpuClass
else:
    MainCpuClass = CpuClass

# CPs to be used throughout the simulation.
for i in range(options.num_cp):
    cp = MainCpuClass(cpu_id = options.num_cpus + i,
                      clk_domain = SrcClockDomain(
                          clock = options.CPUClock,
                          voltage_domain = VoltageDomain(
                              voltage = options.cpu_voltage)))
    cp_list.append(cp)

# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified).
for i in range(options.num_cpus):
    cpu = MainCpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
    if fast_forward:
        cpu.switched_out = True
        future_cpu_list.append(cpu)
    else:
        cpu_list.append(cpu)

host_cpu = cpu_list[0]

hsapp_gpu_map_vaddr = 0x200000000
hsapp_gpu_map_size = 0x1000
hsapp_gpu_map_paddr = int(Addr(options.mem_size))
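# For illustration: with the common --mem-size=512MB, int(Addr('512MB')) is
# 0x20000000, so the HSA packet processor's PIO aperture is placed just past
# the end of physical memory.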

# HSA kernel mode driver
gpu_driver = GPUComputeDriver(filename="kfd")

# Creating the GPU kernel launching components: that is the HSA
# packet processor (HSAPP), GPU command processor (CP), and the
# dispatcher.
gpu_hsapp = HSAPacketProcessor(pioAddr=hsapp_gpu_map_paddr,
                               numHWQueues=options.num_hw_queues)
dispatcher = GPUDispatcher()
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp,
                                   dispatcher=dispatcher)
gpu_driver.device = gpu_cmd_proc
shader.dispatcher = dispatcher
shader.gpu_cmd_proc = gpu_cmd_proc

# Create and assign the workload. Check for rel_path in elements of
# base_list using test, returning the first full path that satisfies test.
def find_path(base_list, rel_path, test):
    for base in base_list:
        if not base:
            # base could be None if environment var not set
            continue
        full_path = os.path.join(base, rel_path)
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))

def find_file(base_list, rel_path):
    return find_path(base_list, rel_path, os.path.isfile)
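
# Usage sketch (the paths here are hypothetical):
#   find_file(['.', '/opt/benchmarks'], 'bin/square')
# returns './bin/square' or '/opt/benchmarks/bin/square', whichever exists
# first as a regular file, and calls fatal() if neither does.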

executable = find_path(benchmark_path, options.cmd, os.path.exists)
# It's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
    benchmark_path = [executable]
    executable = find_file(benchmark_path, options.cmd)

if options.env:
    with open(options.env, 'r') as f:
        env = [line.rstrip() for line in f]
else:
    env = ['LD_LIBRARY_PATH=%s' % ':'.join([
               os.getenv('ROCM_PATH', '/opt/rocm') + '/lib',
               os.getenv('HCC_HOME', '/opt/rocm/hcc') + '/lib',
               os.getenv('HSA_PATH', '/opt/rocm/hsa') + '/lib',
               os.getenv('HIP_PATH', '/opt/rocm/hip') + '/lib',
               os.getenv('ROCM_PATH', '/opt/rocm') + '/libhsakmt/lib',
               os.getenv('ROCM_PATH', '/opt/rocm') + '/miopen/lib',
               os.getenv('ROCM_PATH', '/opt/rocm') + '/miopengemm/lib',
               os.getenv('ROCM_PATH', '/opt/rocm') + '/hipblas/lib',
               os.getenv('ROCM_PATH', '/opt/rocm') + '/rocblas/lib',
               "/usr/lib/x86_64-linux-gnu"
           ]),
           'HOME=%s' % os.getenv('HOME', '/'),
           "HSA_ENABLE_INTERRUPT=0"]

process = Process(executable = executable, cmd = [options.cmd]
                  + options.options.split(), drivers = [gpu_driver], env = env)

for cpu in cpu_list:
    cpu.createThreads()
    cpu.workload = process

for cp in cp_list:
    cp.workload = host_cpu.workload

if fast_forward:
    for i in range(len(future_cpu_list)):
        future_cpu_list[i].workload = cpu_list[i].workload
        future_cpu_list[i].createThreads()

########################## Create the overall system ########################
# List of CPUs that must be switched when moving between KVM and simulation
if fast_forward:
    switch_cpu_list = \
        [(cpu_list[i], future_cpu_list[i]) for i in range(options.num_cpus)]

# Full list of processing cores in the system.
cpu_list = cpu_list + [shader] + cp_list

# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = mem_mode)
if fast_forward:
    system.future_cpu = future_cpu_list
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)

if fast_forward:
    have_kvm_support = 'BaseKvmCPU' in globals()
    if have_kvm_support and buildEnv['TARGET_ISA'] == "x86":
        system.vm = KvmVM()
        for i in range(len(host_cpu.workload)):
            host_cpu.workload[i].useArchPT = True
            host_cpu.workload[i].kvmInSE = True
    else:
        fatal("KvmCPU can only be used in SE mode with x86")

# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)

# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
dma_list = [gpu_hsapp, gpu_cmd_proc]
Ruby.create_system(options, None, system, None, dma_list, None)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                        voltage_domain = system.voltage_domain)
gpu_cmd_proc.pio = system.piobus.master
gpu_hsapp.pio = system.piobus.master

for i, dma_device in enumerate(dma_list):
    exec('system.dma_cntrl%d.clk_domain = system.ruby.clk_domain' % i)
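# For the two DMA devices in dma_list, the exec above expands to:
#   system.dma_cntrl0.clk_domain = system.ruby.clk_domain
#   system.dma_cntrl1.clk_domain = system.ruby.clk_domain
# (assuming Ruby.create_system named the DMA controllers dma_cntrl0,
# dma_cntrl1, ... as this script expects)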

# attach the CPU ports to Ruby
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]

    # Create interrupt controller
    system.cpu[i].createInterruptController()

    # Connect cache ports to Ruby
    system.cpu[i].icache_port = ruby_port.slave
    system.cpu[i].dcache_port = ruby_port.slave

    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        system.cpu[i].interrupts[0].pio = system.piobus.master
        system.cpu[i].interrupts[0].int_master = system.piobus.slave
        system.cpu[i].interrupts[0].int_slave = system.piobus.master
        if fast_forward:
            system.cpu[i].itb.walker.port = ruby_port.slave
            system.cpu[i].dtb.walker.port = ruby_port.slave

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc \
               - options.num_scalar_cache
gpu_port_idx = gpu_port_idx - options.num_cp * 2
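# Worked example (for illustration, assuming one sequencer per CPU core):
# 1 CPU, 4 CUs, 1 SQC, 1 scalar cache, and 0 CPs give
# len(_cpu_ports) = 1 + 4 + 1 + 1 = 7, so
# gpu_port_idx = 7 - 4 - 1 - 1 - 0 = 1, i.e. the first port after the CPU's.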

wavefront_size = options.wf_size
for i in range(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in range(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
            system.ruby._cpu_ports[gpu_port_idx].slave[j]
    system.cpu[shader_idx].CUs[i].gmTokenPort = \
        system.ruby._cpu_ports[gpu_port_idx].gmTokenPort
    gpu_port_idx += 1

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_sqc:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_scalar_cache:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].scalar_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

# attach CP ports to Ruby
for i in range(options.num_cp):
    system.cpu[cp_idx].createInterruptController()
    system.cpu[cp_idx].dcache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    system.cpu[cp_idx].icache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
    system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
    system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1

################# Connect the CPU and GPU via GPU Dispatcher ##################
# The CPU rings the GPU doorbell to notify it of a pending task using
# this interface, and the GPU uses it to notify the CPU of task
# completion. The communication happens through the emulated driver.

# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
if fast_forward:
    shader.cpu_pointer = future_cpu_list[0]
else:
    shader.cpu_pointer = host_cpu

########################## Start simulation ########################

redirect_paths = [RedirectPath(app_path = "/proc",
                               host_paths =
                               ["%s/fs/proc" % m5.options.outdir]),
                  RedirectPath(app_path = "/sys",
                               host_paths =
                               ["%s/fs/sys" % m5.options.outdir]),
                  RedirectPath(app_path = "/tmp",
                               host_paths =
                               ["%s/fs/tmp" % m5.options.outdir])]

system.redirect_paths = redirect_paths

root = Root(system=system, full_system=False)

hsaTopology.createHsaTopology(options)

m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
    maxtick = options.abs_max_tick
else:
    maxtick = m5.MaxTick

# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)

# Checkpointing is not supported by the APU model
if (options.checkpoint_dir != None or
    options.checkpoint_restore != None):
    fatal("Checkpointing not supported by apu model")

checkpoint_dir = None
m5.instantiate(checkpoint_dir)

# Map workload to this address space
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)
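# A sketch of the semantics (assuming the Process.map(vaddr, paddr, size)
# signature): this maps one 4 KiB page at virtual address 0x10000000 to
# physical address 0x200000000.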

if options.fast_forward:
    print("Switch at instruction count: %d" % cpu_list[0].max_insts_any_thread)

exit_event = m5.simulate(maxtick)

if options.fast_forward:
    if exit_event.getCause() == "a thread reached the max instruction count":
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        exit_event = m5.simulate(maxtick - m5.curTick())
elif options.fast_forward_pseudo_op:
    while exit_event.getCause() == "switchcpu":
        # If we are switching *to* kvm, then the current stats are meaningful
        # Note that we don't do any warmup by default
        if type(switch_cpu_list[0][0]) == FutureCpuClass:
            print("Dumping stats...")
            m5.stats.dump()
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        # This lets us switch back and forth without keeping a counter
        switch_cpu_list = [(x[1], x[0]) for x in switch_cpu_list]
        exit_event = m5.simulate(maxtick - m5.curTick())

print("Ticks:", m5.curTick())
print('Exiting because ', exit_event.getCause())

sys.exit(exit_event.getCode())