4e9c75fd6cea509e81587a7f5073af04936ee9e4
[gem5.git] / configs / example / apu_se.py
1 # Copyright (c) 2015 Advanced Micro Devices, Inc.
2 # All rights reserved.
3 #
4 # For use for simulation and test purposes only
5 #
6 # Redistribution and use in source and binary forms, with or without
7 # modification, are permitted provided that the following conditions are met:
8 #
9 # 1. Redistributions of source code must retain the above copyright notice,
10 # this list of conditions and the following disclaimer.
11 #
12 # 2. Redistributions in binary form must reproduce the above copyright notice,
13 # this list of conditions and the following disclaimer in the documentation
14 # and/or other materials provided with the distribution.
15 #
16 # 3. Neither the name of the copyright holder nor the names of its
17 # contributors may be used to endorse or promote products derived from this
18 # software without specific prior written permission.
19 #
20 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
24 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 # POSSIBILITY OF SUCH DAMAGE.
31
from __future__ import print_function
from __future__ import absolute_import

import getpass
import glob
import inspect
import math
import optparse
import os
import re
import sys

import m5
from m5.objects import *
from m5.util import addToPath

addToPath('../')

from ruby import Ruby

from common import Options
from common import Simulation
from common import GPUTLBOptions, GPUTLBConfig

import hsaTopology
from common import FileSystemConfig
54
55 ########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
    """Force the destination of an already-registered option to *value*.

    Raises an Exception if *opt_str* was never added to *parser*, which
    catches typos in option names early.  Must be called after
    parser.parse_args(), when parser.values exists.
    """
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # setattr replaces the old exec() of a generated assignment, which
    # interpolated the value as source code (it broke on string values
    # and could execute arbitrary code).
    setattr(parser.values, opt.dest, value)
64
def getOption(parser, opt_str):
    """Return the current value of an already-registered option.

    Raises an Exception if *opt_str* was never added to *parser*.  Must
    be called after parser.parse_args(), when parser.values exists.
    """
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # getattr replaces the old exec("return_value = ...") trick, which
    # cannot rebind a local variable in Python 3 and raised NameError.
    return getattr(parser.values, opt.dest)
74
# Adding script options.  Common gem5 and SE-mode options first, then the
# APU/GPU-specific knobs defined below.
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)

parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="APU mode. Used to take care of problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("-u", "--num-compute-units", type="int", default=4,
                  help="number of GPU compute units"),
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")

# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs"
                  "sharing an SQC (icache, and thus icache TLB)")
parser.add_option('--cu-per-scalar-cache', type='int', default=4,
                  help='Number of CUs sharing a scalar cache')
parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units"
                  "per CU")
parser.add_option('--cu-per-sa', type='int', default=4,
                  help='Number of CUs per shader array. This must be a '
                       'multiple of options.cu-per-sqc and options.cu-per-scalar')
parser.add_option('--sa-per-complex', type='int', default=1,
                  help='Number of shader arrays per complex')
parser.add_option('--num-gpu-complexes', type='int', default=1,
                  help='Number of GPU complexes')
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size(in workitems)")
parser.add_option("--sp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Double Precision ops")
# issue period per SIMD unit: number of cycles before issuing another vector
parser.add_option("--issue-period", type="int", default=4,
                  help="Number of cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32,
                  help="VGPR to Coalescer (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32,
                  help="Coalescer to VGPR (Global Memory) data bus width in "
                       "bytes")
# Currently we only support 1 local memory pipe
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Shared Memory pipelines per CU")
# Currently we only support 1 global memory pipe
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of "
                  "WF slots per SIMD")

# Register file / register allocation knobs
parser.add_option("--registerManagerPolicy", type="string", default="static",
                  help="Register manager policy")
parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--vreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the VRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--sreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--sreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the SRF. The total number of registers will be "
                       "aligned to this value.")

# Clocking / voltage / execution-policy knobs
parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scalor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--gpu-clock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help = """CPU voltage domain""")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help = """CPU voltage domain""")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--SegFaultDebug",action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB",action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier",action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int", help = "prefetch depth for"
                  "TLBs")
parser.add_option("--pf-type", type="string", help="type of prefetch: "
                  "PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")
parser.add_option('--fast-forward-pseudo-op', action='store_true',
                  help = 'fast forward using kvm until the m5_switchcpu'
                         ' pseudo-op is encountered, then switch cpus. subsequent'
                         ' m5_switchcpu pseudo-ops will toggle back and forth')
parser.add_option("--num-hw-queues", type="int", default=10,
                  help="number of hw queues in packet processor")

Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']
200
########################## Sanity Check ########################

# Currently the gpu model requires ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the gpu model requires only timing or detailed CPU
if not (options.cpu_type == "TimingSimpleCPU" or
   options.cpu_type == "DerivO3CPU"):
    fatal("GPU model requires TimingSimpleCPU or DerivO3CPU")

# This file can support multiple compute units
assert(options.num_compute_units >= 1)

# Currently, the sqc (I-Cache of GPU) is shared by
# multiple compute units(CUs). The protocol works just fine
# even if sqc is not shared. Overriding this option here
# so that the user need not explicitly set this (assuming
# sharing sqc is the common usage)
n_cu = options.num_compute_units
# One SQC per --cu-per-sqc CUs, rounded up so a partial group still
# gets an SQC; same computation for the shared scalar caches.
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby
num_scalar_cache = int(math.ceil(float(n_cu) / options.cu_per_scalar_cache))
options.num_scalar_cache = num_scalar_cache

print('Num SQC = ', num_sqc, 'Num scalar caches = ', num_scalar_cache,
      'Num CU = ', n_cu)
228
########################## Creating the GPU system ########################
# shader is the GPU: build its voltage and clock domains first, then the
# Shader object itself, configured with one WF-slot count per SIMD.
gpu_voltage_domain = VoltageDomain(voltage = options.gpu_voltage)
gpu_clk_domain = SrcClockDomain(clock = options.gpu_clock,
                                voltage_domain = gpu_voltage_domain)
shader = Shader(n_wf = options.wfs_per_simd, clk_domain = gpu_clk_domain)
236
# GPU_RfO(Read For Ownership) implements SC/TSO memory model.
# Other GPU protocols implement release consistency at GPU side.
# So, all GPU protocols other than GPU_RfO should make their writes
# visible to the global memory and should read from global memory
# during kernal boundary. The pipeline initiates(or do not initiate)
# the acquire/release operation depending on these impl_kern_launch_rel
# and impl_kern_end_rel flags. The flag=true means pipeline initiates
# a acquire/release operation at kernel launch/end.
# VIPER protocols (GPU_VIPER, GPU_VIPER_Region and GPU_VIPER_Baseline)
# are write-through based, and thus only imple_kern_launch_acq needs to
# set.
if buildEnv['PROTOCOL'] == 'GPU_RfO':
    shader.impl_kern_launch_acq = False
    shader.impl_kern_end_rel = False
elif buildEnv['PROTOCOL'] in ('GPU_VIPER', 'GPU_VIPER_Region',
                              'GPU_VIPER_Baseline'):
    # BUGFIX: this used to be an or-chain of '!=' tests, which is true
    # for every protocol and made the else branch below unreachable.
    # Membership test implements the intent described in the comment
    # above: VIPER protocols only need the kernel-launch acquire.
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = False
else:
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = True
259
# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
    per_lane = True

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in range(n_cu):
    compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
                                     num_SIMDs = options.simds_per_cu,
                                     wf_size = options.wf_size,
                                     spbypass_pipe_length = \
                                     options.sp_bypass_path_length,
                                     dpbypass_pipe_length = \
                                     options.dp_bypass_path_length,
                                     issue_period = options.issue_period,
                                     coalescer_to_vrf_bus_width = \
                                     options.glbmem_rd_bus_width,
                                     vrf_to_coalescer_bus_width = \
                                     options.glbmem_wr_bus_width,
                                     num_global_mem_pipes = \
                                     options.glb_mem_pipes_per_cu,
                                     num_shared_mem_pipes = \
                                     options.shr_mem_pipes_per_cu,
                                     n_wf = options.wfs_per_simd,
                                     execPolicy = options.CUExecPolicy,
                                     debugSegFault = options.SegFaultDebug,
                                     functionalTLB = options.FunctionalTLB,
                                     localMemBarrier = options.LocalMemBarrier,
                                     countPages = options.countPages,
                                     localDataStore = \
                                     LdsState(banks = options.numLdsBanks,
                                              bankConflictPenalty = \
                                              options.ldsBankConflictPenalty)))
    # Per-CU child objects: wavefront slots plus one vector/scalar
    # register file (and its pool manager) per SIMD unit.
    wavefronts = []
    vrfs = []
    vrf_pool_mgrs = []
    srfs = []
    srf_pool_mgrs = []
    # NOTE: these loops used Python-2-only xrange(); range() matches the
    # rest of this file and works on both Python 2 and 3.
    for j in range(options.simds_per_cu):
        for k in range(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                        wf_size = options.wf_size))
        vrf_pool_mgrs.append(SimplePoolManager(pool_size = \
                                               options.vreg_file_size,
                                               min_alloc = \
                                               options.vreg_min_alloc))

        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.vreg_file_size))

        # BUGFIX: the SRF pool manager previously used vreg_min_alloc,
        # leaving --sreg-min-alloc defined but never honored.
        srf_pool_mgrs.append(SimplePoolManager(pool_size = \
                                               options.sreg_file_size,
                                               min_alloc = \
                                               options.sreg_min_alloc))
        srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.sreg_file_size))

    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    compute_units[-1].scalar_register_file = srfs
    compute_units[-1].register_manager = \
        RegisterManager(policy=options.registerManagerPolicy,
                        vrf_pool_managers=vrf_pool_mgrs,
                        srf_pool_managers=srf_pool_mgrs)
    if options.TLB_prefetch:
        compute_units[-1].prefetch_depth = options.TLB_prefetch
        compute_units[-1].prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
    compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort

# Attach compute units to GPU
shader.CUs = compute_units
335
########################## Creating the CPU system ########################

# The shader core will be whatever is after the CPU cores are accounted for
# (removed a no-op self-assignment of options.num_cpus that was here).
shader_idx = options.num_cpus

# The command processor will be whatever is after the shader is accounted for
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

# Resolve the requested CPU model; the GPU model only supports timing
# memory mode (checked above for the CPU type, re-checked here).
CpuClass, mem_mode = Simulation.getCPUClass(options.cpu_type)
if CpuClass == AtomicSimpleCPU:
    fatal("AtomicSimpleCPU is not supported")
if mem_mode != 'timing':
    fatal("Only the timing memory mode is supported")
shader.timing = True

# The two fast-forward mechanisms are mutually exclusive.
if options.fast_forward and options.fast_forward_pseudo_op:
    fatal("Cannot fast-forward based both on the number of instructions and"
          " on pseudo-ops")
fast_forward = options.fast_forward or options.fast_forward_pseudo_op
360
# When fast-forwarding, the warm-up phase runs on KVM CPUs; the configured
# CPU model is preserved as FutureCpuClass for the post-switch phase.
if fast_forward:
    FutureCpuClass, future_mem_mode = CpuClass, mem_mode

    CpuClass = X86KvmCPU
    mem_mode = 'atomic_noncaching'
    # Leave shader.timing untouched, because its value only matters at the
    # start of the simulation and because we require switching cpus
    # *before* the first kernel launch.

    future_cpu_list = []

    # Initial CPUs to be used during fast-forwarding.
    for i in range(options.num_cpus):
        cpu = CpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
        cpu_list.append(cpu)

        # Instruction-count fast-forward: stop the KVM phase after this
        # many instructions on any thread.
        if options.fast_forward:
            cpu.max_insts_any_thread = int(options.fast_forward)

if fast_forward:
    MainCpuClass = FutureCpuClass
else:
    MainCpuClass = CpuClass

# CPs to be used throughout the simulation.
for i in range(options.num_cp):
    cp = MainCpuClass(cpu_id = options.num_cpus + i,
                      clk_domain = SrcClockDomain(
                          clock = options.CPUClock,
                          voltage_domain = VoltageDomain(
                              voltage = options.cpu_voltage)))
    cp_list.append(cp)

# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified).
for i in range(options.num_cpus):
    cpu = MainCpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
    if fast_forward:
        # Main CPUs start switched out and take over after the switch.
        cpu.switched_out = True
        future_cpu_list.append(cpu)
    else:
        cpu_list.append(cpu)

# First CPU hosts the workload whose address space the GPU shares.
host_cpu = cpu_list[0]
412
# Doorbell mapping for the HSA packet processor: virtual address and size
# of the mapping, backed by a physical address placed just past the
# configured memory size.
hsapp_gpu_map_vaddr = 0x200000000
hsapp_gpu_map_size = 0x1000
hsapp_gpu_map_paddr = int(Addr(options.mem_size))

# HSA kernel mode driver (exposed to the workload as device file "kfd")
gpu_driver = GPUComputeDriver(filename="kfd")

# Creating the GPU kernel launching components: that is the HSA
# packet processor (HSAPP), GPU command processor (CP), and the
# dispatcher.
gpu_hsapp = HSAPacketProcessor(pioAddr=hsapp_gpu_map_paddr,
                               numHWQueues=options.num_hw_queues)
dispatcher = GPUDispatcher()
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp,
                                   dispatcher=dispatcher)
gpu_driver.device = gpu_cmd_proc
shader.dispatcher = dispatcher
shader.gpu_cmd_proc = gpu_cmd_proc
431
# Create and assign the workload.  Check for rel_path in elements of
# base_list using test, returning the first full path that satisfies test
def find_path(base_list, rel_path, test):
    """Return the first <base>/<rel_path> for which test() holds.

    Falsy bases (e.g. None from an unset environment variable) are
    skipped; if nothing matches, the run is aborted via fatal().
    """
    candidates = (os.path.join(base, rel_path)
                  for base in base_list if base)
    for full_path in candidates:
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))
443
def find_file(base_list, rel_path):
    """Locate rel_path under base_list as a regular file."""
    # Delegate to find_path with an "is a regular file" predicate.
    predicate = os.path.isfile
    return find_path(base_list, rel_path, predicate)
446
# Resolve the benchmark executable along the search path.
executable = find_path(benchmark_path, options.cmd, os.path.exists)
# It's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
    benchmark_path = [executable]
    executable = find_file(benchmark_path, options.cmd)

# Environment for the simulated process: either read one KEY=VALUE pair
# per line from --env, or fall back to a canned ROCm 1.6 environment.
if options.env:
    with open(options.env, 'r') as f:
        env = [line.rstrip() for line in f]
else:
    env = ['LD_LIBRARY_PATH=%s' % ':'.join([
               "/proj/radl_tools/rocm-1.6/lib",
               "/proj/radl_tools/rocm-1.6/hcc/lib64",
               "/tool/pandora64/.package/libunwind-1.1/lib",
               "/tool/pandora64/.package/gcc-6.4.0/lib64"
           ]),
           "HSA_ENABLE_INTERRUPT=0"]

# The simulated process, with the emulated GPU driver attached.
process = Process(executable = executable, cmd = [options.cmd]
                  + options.options.split(), drivers = [gpu_driver], env = env)

# All CPUs share the same workload (threads of one process).
for cpu in cpu_list:
    cpu.createThreads()
    cpu.workload = process

for cp in cp_list:
    cp.workload = host_cpu.workload

# The post-switch CPUs mirror the workloads of their KVM counterparts.
if fast_forward:
    for i in range(len(future_cpu_list)):
        future_cpu_list[i].workload = cpu_list[i].workload
        future_cpu_list[i].createThreads()
480
########################## Create the overall system ########################
# List of CPUs that must be switched when moving between KVM and simulation
if fast_forward:
    switch_cpu_list = \
        [(cpu_list[i], future_cpu_list[i]) for i in range(options.num_cpus)]

# Full list of processing cores in the system.
cpu_list = cpu_list + [shader] + cp_list

# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = mem_mode)
if fast_forward:
    system.future_cpu = future_cpu_list
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)

# Fast-forwarding needs KVM support compiled into this gem5 build and an
# x86 target; the workload then uses architectural page tables in SE mode.
if fast_forward:
    have_kvm_support = 'BaseKvmCPU' in globals()
    if have_kvm_support and buildEnv['TARGET_ISA'] == "x86":
        system.vm = KvmVM()
        for i in range(len(host_cpu.workload)):
            host_cpu.workload[i].useArchPT = True
            host_cpu.workload[i].kvmInSE = True
    else:
        fatal("KvmCPU can only be used in SE mode with x86")
511
# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)

# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
dma_list = [gpu_hsapp, gpu_cmd_proc]
Ruby.create_system(options, None, system, None, dma_list, None)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                        voltage_domain = system.voltage_domain)
gpu_cmd_proc.pio = system.piobus.master
gpu_hsapp.pio = system.piobus.master

# Ruby created one DMA controller per dma_list entry, named
# system.dma_cntrl0, dma_cntrl1, ... in order.  Look each one up with
# getattr instead of exec()-ing a generated statement.
for i in range(len(dma_list)):
    getattr(system, 'dma_cntrl%d' % i).clk_domain = system.ruby.clk_domain

# attach the CPU ports to Ruby
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]

    # Create interrupt controller
    system.cpu[i].createInterruptController()

    # Connect cache port's to ruby
    system.cpu[i].icache_port = ruby_port.slave
    system.cpu[i].dcache_port = ruby_port.slave

    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        system.cpu[i].interrupts[0].pio = system.piobus.master
        system.cpu[i].interrupts[0].int_master = system.piobus.slave
        system.cpu[i].interrupts[0].int_slave = system.piobus.master
        if fast_forward:
            system.cpu[i].itb.walker.port = ruby_port.slave
            system.cpu[i].dtb.walker.port = ruby_port.slave

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc \
               - options.num_scalar_cache
gpu_port_idx = gpu_port_idx - options.num_cp * 2
559
wavefront_size = options.wf_size
for i in range(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in range(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
            system.ruby._cpu_ports[gpu_port_idx].slave[j]
    gpu_port_idx += 1

# One SQC sequencer is shared by every --cu-per-sqc compute units.
for i in range(n_cu):
    if i > 0 and not i % options.cu_per_sqc:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx += 1

# One scalar-cache sequencer is shared by every --cu-per-scalar-cache CUs.
# BUGFIX: this loop used Python-2-only xrange(), a NameError on Python 3;
# range() matches the sibling loops above.
for i in range(n_cu):
    if i > 0 and not i % options.cu_per_scalar_cache:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].scalar_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx += 1
584
# attach CP ports to Ruby
for i in range(options.num_cp):
    system.cpu[cp_idx].createInterruptController()
    # Each CP consumes two sequencer ports: dcache first, then icache.
    system.cpu[cp_idx].dcache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    system.cpu[cp_idx].icache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
    system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
    system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1

################# Connect the CPU and GPU via GPU Dispatcher ##################
# CPU rings the GPU doorbell to notify a pending task
# using this interface.
# And GPU uses this interface to notify the CPU of task completion
# The communication happens through emulated driver.

# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
if fast_forward:
    shader.cpu_pointer = future_cpu_list[0]
else:
    shader.cpu_pointer = host_cpu
609
########################## Start simulation ########################

# Redirect host pseudo-filesystems into the output directory so the
# simulated process cannot touch the real /proc, /sys, /tmp or /dev/shm.
chroot = os.path.expanduser(options.chroot)
redirect_paths = [RedirectPath(src = "/proc",
                               dests = ["%s/fs/proc" % m5.options.outdir]),
                  RedirectPath(src = "/sys",
                               dests = ["%s/fs/sys" % m5.options.outdir]),
                  RedirectPath(src = "/tmp",
                               dests = ["%s/fs/tmp" % m5.options.outdir]),
                  RedirectPath(src = "/dev/shm",
                               dests = ["/dev/shm/%s/gem5_%s" %
                                        (getpass.getuser(), os.getpid())])]

system.redirect_paths = redirect_paths

root = Root(system=system, full_system=False)

# Emit the HSA topology files the ROCm runtime expects to discover.
hsaTopology.createHsaTopology(options)

m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
    maxtick = options.abs_max_tick
else:
    maxtick = m5.MaxTick

# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)

# Checkpointing is not supported by APU model ('is not None' instead of
# the former '!= None' identity-unsafe comparison)
if (options.checkpoint_dir is not None or
    options.checkpoint_restore is not None):
    fatal("Checkpointing not supported by apu model")

checkpoint_dir = None
m5.instantiate(checkpoint_dir)

# Map workload to this address space (doorbell region, see
# hsapp_gpu_map_vaddr above)
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)

if options.fast_forward:
    print("Switch at instruction count: %d" % cpu_list[0].max_insts_any_thread)

exit_event = m5.simulate(maxtick)

if options.fast_forward:
    # Instruction-count fast-forward: one switch from KVM to the main CPUs.
    if exit_event.getCause() == "a thread reached the max instruction count":
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        exit_event = m5.simulate(maxtick - m5.curTick())
elif options.fast_forward_pseudo_op:
    # Pseudo-op fast-forward: every m5_switchcpu toggles between the two
    # CPU sets until the workload stops emitting switchcpu exits.
    while exit_event.getCause() == "switchcpu":
        # If we are switching *to* kvm, then the current stats are meaningful
        # Note that we don't do any warmup by default
        if type(switch_cpu_list[0][0]) is FutureCpuClass:
            print("Dumping stats...")
            m5.stats.dump()
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        # This lets us switch back and forth without keeping a counter
        switch_cpu_list = [(x[1], x[0]) for x in switch_cpu_list]
        exit_event = m5.simulate(maxtick - m5.curTick())

print("Ticks:", m5.curTick())
print('Exiting because ', exit_event.getCause())

FileSystemConfig.cleanup_filesystem(options)

# 'sys' is provided by the import block at the top of this file.
sys.exit(exit_event.getCode())