2 # Copyright (c) 2015 Advanced Micro Devices, Inc.
5 # For use for simulation and test purposes only
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are met:
10 # 1. Redistributions of source code must retain the above copyright notice,
11 # this list of conditions and the following disclaimer.
13 # 2. Redistributions in binary form must reproduce the above copyright notice,
14 # this list of conditions and the following disclaimer in the documentation
15 # and/or other materials provided with the distribution.
17 # 3. Neither the name of the copyright holder nor the names of its contributors
18 # may be used to endorse or promote products derived from this software
19 # without specific prior written permission.
21 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 # POSSIBILITY OF SUCH DAMAGE.
33 # Author: Steve Reinhardt
36 from ClockedObject
import ClockedObject
37 from Device
import DmaDevice
38 from m5
.defines
import buildEnv
39 from m5
.params
import *
40 from m5
.proxy
import *
41 from m5
.SimObject
import SimObject
42 from MemObject
import MemObject
43 from Process
import EmulatedDriver
44 from Bridge
import Bridge
45 from LdsState
import LdsState
47 class PrefetchType(Enum
): vals
= [
class VectorRegisterFile(SimObject):
    # Python-side declaration of the vector register file (VRF) SimObject.
    # Each attribute below is either a binding keyword (type / cxx_*) or a
    # parameter written as Param.<Type>(default, description).

    # Binding to the C++ implementation.
    type = "VectorRegisterFile"
    cxx_class = "VectorRegisterFile"
    cxx_header = "gpu-compute/vector_register_file.hh"

    # Configurable parameters; every one of them carries a default value.
    simd_id = Param.Int(
        0, "SIMD ID associated with this VRF")
    num_regs_per_simd = Param.Int(
        2048, "number of vector registers per SIMD")
    wfSize = Param.Int(
        64, "Wavefront size (in work items)")
    min_alloc = Param.Int(
        4, "min number of VGPRs allocated per WF")
class Wavefront(SimObject):
    # Python-side declaration of the Wavefront SimObject.
    #
    # `type` is declared explicitly, like the other SimObjects in this file
    # that set cxx_class/cxx_header, so that parameter declarations naming
    # this class (e.g. VectorParam.Wavefront) can refer to it.
    type = 'Wavefront'
    cxx_class = 'Wavefront'
    cxx_header = 'gpu-compute/wavefront.hh'

    # simdId and wf_slot_id are declared without a default value; only a
    # description string is passed. The ranges quoted in the descriptions
    # come from the strings themselves.
    simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
    wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')

    # Defaults to 64 work items.
    wfSize = Param.Int(64, 'Wavefront size (in work items)')
74 class ComputeUnit(MemObject
):
76 cxx_class
= 'ComputeUnit'
77 cxx_header
= 'gpu-compute/compute_unit.hh'
79 wavefronts
= VectorParam
.Wavefront('Number of wavefronts')
80 wfSize
= Param
.Int(64, 'Wavefront size (in work items)')
81 num_SIMDs
= Param
.Int(4, 'number of SIMD units per CU')
83 spbypass_pipe_length
= Param
.Int(4, 'vector ALU Single Precision bypass '\
86 dpbypass_pipe_length
= Param
.Int(8, 'vector ALU Double Precision bypass '\
89 issue_period
= Param
.Int(4, 'number of cycles per issue period')
90 num_global_mem_pipes
= Param
.Int(1,'number of global memory pipes per CU')
91 num_shared_mem_pipes
= Param
.Int(1,'number of shared memory pipes per CU')
92 n_wf
= Param
.Int(1, 'Number of wavefront slots per SIMD')
93 mem_req_latency
= Param
.Int(9, "Latency for request from the cu to ruby. "\
94 "Represents the pipeline to reach the TCP and "\
95 "specified in GPU clock cycles")
96 mem_resp_latency
= Param
.Int(9, "Latency for responses from ruby to the "\
97 "cu. Represents the pipeline between the TCP "\
98 "and cu as well as TCP data array access. "\
99 "Specified in GPU clock cycles")
100 system
= Param
.System(Parent
.any
, "system object")
101 cu_id
= Param
.Int('CU id')
102 vrf_to_coalescer_bus_width
= Param
.Int(32, "VRF->Coalescer data bus width "\
104 coalescer_to_vrf_bus_width
= Param
.Int(32, "Coalescer->VRF data bus width "\
107 memory_port
= VectorMasterPort("Port to the memory system")
108 translation_port
= VectorMasterPort('Port to the TLB hierarchy')
109 sqc_port
= MasterPort("Port to the SQC (I-cache")
110 sqc_tlb_port
= MasterPort("Port to the TLB for the SQC (I-cache)")
111 perLaneTLB
= Param
.Bool(False, "enable per-lane TLB")
112 prefetch_depth
= Param
.Int(0, "Number of prefetches triggered at a time"\
113 "(0 turns off prefetching)")
114 prefetch_stride
= Param
.Int(1, "Fixed Prefetch Stride (1 means next-page)")
115 prefetch_prev_type
= Param
.PrefetchType('PF_PHASE', "Prefetch the stride "\
116 "from last mem req in lane of "\
117 "CU|Phase|Wavefront")
118 execPolicy
= Param
.String("OLDEST-FIRST", "WF execution selection policy");
119 xactCasMode
= Param
.Bool(False, "Behavior of xact_cas_load magic instr.");
120 debugSegFault
= Param
.Bool(False, "enable debugging GPU seg faults")
121 functionalTLB
= Param
.Bool(False, "Assume TLB causes no delay")
123 localMemBarrier
= Param
.Bool(False, "Assume Barriers do not wait on "\
126 countPages
= Param
.Bool(False, "Generate per-CU file of all pages touched "\
127 "and how many times")
128 global_mem_queue_size
= Param
.Int(256, "Number of entries in the global "
129 "memory pipeline's queues")
130 local_mem_queue_size
= Param
.Int(256, "Number of entries in the local "
131 "memory pipeline's queues")
132 ldsBus
= Bridge() # the bridge between the CU and its LDS
133 ldsPort
= MasterPort("The port that goes to the LDS")
134 localDataStore
= Param
.LdsState("the LDS for this CU")
136 vector_register_file
= VectorParam
.VectorRegisterFile("Vector register "\
class Shader(ClockedObject):
    # Python-side declaration of the Shader SimObject.
    #
    # `type` is declared explicitly so that parameter declarations naming
    # this class (e.g. Param.Shader) can refer to it.
    type = 'Shader'
    cxx_header = 'gpu-compute/shader.hh'

    # Compute units owned by this shader and the per-SIMD wavefront slots.
    CUs = VectorParam.ComputeUnit('Number of compute units')
    n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')

    # NOTE(review): the two descriptions below are triple-quoted, so the
    # embedded newline and the continuation-line indentation are part of the
    # description text. Confirm the indentation against the repository copy.
    impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
                                                  ruby at kernel boundaries""")
    separate_acquire_release = Param.Bool(False,
        """Do ld_acquire/st_release generate separate requests for the
        acquire and release?""")

    globalmem = Param.MemorySize('64kB', 'Memory size')
    timing = Param.Bool(False, 'timing memory accesses')

    # Defaults to NULL, i.e. no CPU is attached unless one is assigned.
    cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
    # The trailing ';' the original carried on this statement was dropped;
    # it was an empty statement with no effect.
    translation = Param.Bool(False, "address translation")
class ClDriver(EmulatedDriver):
    # Python-side declaration of the ClDriver SimObject, derived from
    # EmulatedDriver.
    #
    # `type` is declared explicitly so that parameter declarations naming
    # this class (e.g. Param.ClDriver) can refer to it.
    type = 'ClDriver'
    cxx_header = 'gpu-compute/cl_driver.hh'

    # Vector parameter with no default; one string per code file.
    codefile = VectorParam.String('code file name(s)')
class GpuDispatcher(DmaDevice):
    # Python-side declaration of the GpuDispatcher SimObject, derived from
    # DmaDevice.
    type = "GpuDispatcher"
    cxx_header = "gpu-compute/dispatcher.hh"

    # Programmed-I/O address and latency. The default address 0x200000000
    # equals 8 GiB; the placement is provisional ("for now").
    pio_addr = Param.Addr(
        0x200000000, "Device Address")
    pio_latency = Param.Latency(
        "1ns", "Programmed IO latency")

    # Shader this dispatcher points to (no default).
    shader_pointer = Param.Shader("pointer to shader")

    # Master port described as going to the dispatcher TLB.
    translation_port = MasterPort("Port to the dispatcher TLB")

    # CPU and driver references (no defaults).
    cpu = Param.BaseCPU("CPU to wake up on kernel completion")
    cl_driver = Param.ClDriver("pointer to driver")
174 class OpType(Enum
): vals
= [
200 'OT_READONLY_ATOMIC',
211 # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version
213 'OT_SHARED_MEMFENCE',
214 'OT_GLOBAL_MEMFENCE',
224 class MemType(Enum
): vals
= [
238 class MemOpType(Enum
): vals
= [
279 class StorageClassType(Enum
): vals
= [
289 class RegisterType(Enum
): vals
= [
297 class GenericMemoryOrder(Enum
): vals
= [
299 'MEMORY_ORDER_RELAXED',
300 'MEMORY_ORDER_SC_ACQUIRE',
301 'MEMORY_ORDER_SC_RELEASE',
302 'MEMORY_ORDER_SC_ACQUIRE_RELEASE',
305 class GenericMemoryScope(Enum
): vals
= [
307 'MEMORY_SCOPE_WORKITEM',
308 'MEMORY_SCOPE_WAVEFRONT',
309 'MEMORY_SCOPE_WORKGROUP',
310 'MEMORY_SCOPE_DEVICE',
311 'MEMORY_SCOPE_SYSTEM',