src/gpu-compute/GPU.py

   1 #
   2 #  Copyright (c) 2015 Advanced Micro Devices, Inc.
   3 #  All rights reserved.
   4 #
   5 #  For use for simulation and test purposes only
   6 #
   7 #  Redistribution and use in source and binary forms, with or without
   8 #  modification, are permitted provided that the following conditions are met:
   9 #
  10 #  1. Redistributions of source code must retain the above copyright notice,
  11 #  this list of conditions and the following disclaimer.
  12 #
  13 #  2. Redistributions in binary form must reproduce the above copyright notice,
  14 #  this list of conditions and the following disclaimer in the documentation
  15 #  and/or other materials provided with the distribution.
  16 #
  17 #  3. Neither the name of the copyright holder nor the names of its contributors
  18 #  may be used to endorse or promote products derived from this software
  19 #  without specific prior written permission.
  20 #
  21 #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22 #  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23 #  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24 #  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25 #  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26 #  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27 #  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28 #  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29 #  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30 #  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31 #  POSSIBILITY OF SUCH DAMAGE.
  32 #
  33 #  Author: Steve Reinhardt
  34 #
  35
  36 from ClockedObject import ClockedObject
  37 from Device import DmaDevice
  38 from m5.defines import buildEnv
  39 from m5.params import *
  40 from m5.proxy import *
  41 from m5.SimObject import SimObject
  42 from MemObject import MemObject
  43 from Process import EmulatedDriver
  44 from Bridge import Bridge
  45 from LdsState import LdsState
  46
  47 class PrefetchType(Enum): vals = [
  48     'PF_CU',
  49     'PF_PHASE',
  50     'PF_WF',
  51     'PF_STRIDE',
  52     'PF_END',
  53     ]
  54
  55 class VectorRegisterFile(SimObject):
  56     type = 'VectorRegisterFile'
  57     cxx_class = 'VectorRegisterFile'
  58     cxx_header = 'gpu-compute/vector_register_file.hh'
  59
  60     simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
  61     num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
  62     wfSize = Param.Int(64, 'Wavefront size (in work items)')
  63     min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
  64
  65 class Wavefront(SimObject):
  66     type = 'Wavefront'
  67     cxx_class = 'Wavefront'
  68     cxx_header = 'gpu-compute/wavefront.hh'
  69
  70     simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
  71     wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
  72     wfSize = Param.Int(64, 'Wavefront size (in work items)')
  73
  74 class ComputeUnit(MemObject):
  75     type = 'ComputeUnit'
  76     cxx_class = 'ComputeUnit'
  77     cxx_header = 'gpu-compute/compute_unit.hh'
  78
  79     wavefronts = VectorParam.Wavefront('Number of wavefronts')
  80     wfSize = Param.Int(64, 'Wavefront size (in work items)')
  81     num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
  82
  83     spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
  84                                         'latency')
  85
  86     dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
  87                                         'latency')
  88
  89     issue_period = Param.Int(4, 'number of cycles per issue period')
  90     num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU')
  91     num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU')
  92     n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
  93     mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\
  94                                 "Represents the pipeline to reach the TCP and "\
  95                                 "specified in GPU clock cycles")
  96     mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
  97                                  "cu. Represents the pipeline between the TCP "\
  98                                  "and cu as well as TCP data array access. "\
  99                                  "Specified in GPU clock cycles")
 100     system = Param.System(Parent.any, "system object")
 101     cu_id = Param.Int('CU id')
 102     vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
 103                                            "in bytes")
 104     coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
 105                                            "in bytes")
 106
 107     memory_port = VectorMasterPort("Port to the memory system")
 108     translation_port = VectorMasterPort('Port to the TLB hierarchy')
 109     sqc_port = MasterPort("Port to the SQC (I-cache")
 110     sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
 111     perLaneTLB = Param.Bool(False, "enable per-lane TLB")
 112     prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\
 113                                "(0 turns off prefetching)")
 114     prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
 115     prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\
 116                                             "from last mem req in lane of "\
 117                                             "CU|Phase|Wavefront")
 118     execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy");
 119     xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.");
 120     debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
 121     functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
 122
 123     localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
 124                                         "kernel end")
 125
 126     countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\
 127                                    "and how many times")
 128     global_mem_queue_size = Param.Int(256, "Number of entries in the global "
 129                                       "memory pipeline's queues")
 130     local_mem_queue_size = Param.Int(256, "Number of entries in the local "
 131                                       "memory pipeline's queues")
 132     ldsBus = Bridge() # the bridge between the CU and its LDS
 133     ldsPort = MasterPort("The port that goes to the LDS")
 134     localDataStore = Param.LdsState("the LDS for this CU")
 135
 136     vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
 137                                                           "file")
 138
 139 class Shader(ClockedObject):
 140     type = 'Shader'
 141     cxx_class = 'Shader'
 142     cxx_header = 'gpu-compute/shader.hh'
 143
 144     CUs = VectorParam.ComputeUnit('Number of compute units')
 145     n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
 146     impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
 147                                                   ruby at kernel boundaries""")
 148     separate_acquire_release = Param.Bool(False,
 149         """Do ld_acquire/st_release generate separate requests for the
 150         acquire and release?""")
 151     globalmem = Param.MemorySize('64kB', 'Memory size')
 152     timing = Param.Bool(False, 'timing memory accesses')
 153
 154     cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
 155     translation = Param.Bool(False, "address translation");
 156
 157 class ClDriver(EmulatedDriver):
 158     type = 'ClDriver'
 159     cxx_header = 'gpu-compute/cl_driver.hh'
 160     codefile = VectorParam.String('code file name(s)')
 161
 162 class GpuDispatcher(DmaDevice):
 163     type = 'GpuDispatcher'
 164     cxx_header = 'gpu-compute/dispatcher.hh'
 165     # put at 8GB line for now
 166     pio_addr = Param.Addr(0x200000000, "Device Address")
 167     pio_latency = Param.Latency('1ns', "Programmed IO latency")
 168     shader_pointer = Param.Shader('pointer to shader')
 169     translation_port = MasterPort('Port to the dispatcher TLB')
 170     cpu = Param.BaseCPU("CPU to wake up on kernel completion")
 171
 172     cl_driver = Param.ClDriver('pointer to driver')
 173
 174 class OpType(Enum): vals = [
 175     'OT_NULL',
 176     'OT_ALU',
 177     'OT_SPECIAL',
 178     'OT_GLOBAL_READ',
 179     'OT_GLOBAL_WRITE',
 180     'OT_GLOBAL_ATOMIC',
 181     'OT_GLOBAL_HIST',
 182     'OT_GLOBAL_LDAS',
 183     'OT_SHARED_READ',
 184     'OT_SHARED_WRITE',
 185     'OT_SHARED_ATOMIC',
 186     'OT_SHARED_HIST',
 187     'OT_SHARED_LDAS',
 188     'OT_PRIVATE_READ',
 189     'OT_PRIVATE_WRITE',
 190     'OT_PRIVATE_ATOMIC',
 191     'OT_PRIVATE_HIST',
 192     'OT_PRIVATE_LDAS',
 193     'OT_SPILL_READ',
 194     'OT_SPILL_WRITE',
 195     'OT_SPILL_ATOMIC',
 196     'OT_SPILL_HIST',
 197     'OT_SPILL_LDAS',
 198     'OT_READONLY_READ',
 199     'OT_READONLY_WRITE',
 200     'OT_READONLY_ATOMIC',
 201     'OT_READONLY_HIST',
 202     'OT_READONLY_LDAS',
 203     'OT_FLAT_READ',
 204     'OT_FLAT_WRITE',
 205     'OT_FLAT_ATOMIC',
 206     'OT_FLAT_HIST',
 207     'OT_FLAT_LDAS',
 208     'OT_KERN_READ',
 209     'OT_BRANCH',
 210
 211     # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version
 212     #       of the compiler.
 213     'OT_SHARED_MEMFENCE',
 214     'OT_GLOBAL_MEMFENCE',
 215     'OT_BOTH_MEMFENCE',
 216
 217     'OT_BARRIER',
 218     'OT_PRINT',
 219     'OT_RET',
 220     'OT_NOP',
 221     'OT_ARG'
 222     ]
 223
 224 class MemType(Enum): vals = [
 225     'M_U8',
 226     'M_U16',
 227     'M_U32',
 228     'M_U64',
 229     'M_S8',
 230     'M_S16',
 231     'M_S32',
 232     'M_S64',
 233     'M_F16',
 234     'M_F32',
 235     'M_F64',
 236     ]
 237
 238 class MemOpType(Enum): vals = [
 239     'MO_LD',
 240     'MO_ST',
 241     'MO_LDAS',
 242     'MO_LDA',
 243     'MO_AAND',
 244     'MO_AOR',
 245     'MO_AXOR',
 246     'MO_ACAS',
 247     'MO_AEXCH',
 248     'MO_AADD',
 249     'MO_ASUB',
 250     'MO_AINC',
 251     'MO_ADEC',
 252     'MO_AMAX',
 253     'MO_AMIN',
 254     'MO_ANRAND',
 255     'MO_ANROR',
 256     'MO_ANRXOR',
 257     'MO_ANRCAS',
 258     'MO_ANREXCH',
 259     'MO_ANRADD',
 260     'MO_ANRSUB',
 261     'MO_ANRINC',
 262     'MO_ANRDEC',
 263     'MO_ANRMAX',
 264     'MO_ANRMIN',
 265     'MO_HAND',
 266     'MO_HOR',
 267     'MO_HXOR',
 268     'MO_HCAS',
 269     'MO_HEXCH',
 270     'MO_HADD',
 271     'MO_HSUB',
 272     'MO_HINC',
 273     'MO_HDEC',
 274     'MO_HMAX',
 275     'MO_HMIN',
 276     'MO_UNDEF'
 277     ]
 278
 279 class StorageClassType(Enum): vals = [
 280     'SC_SPILL',
 281     'SC_GLOBAL',
 282     'SC_SHARED',
 283     'SC_PRIVATE',
 284     'SC_READONLY',
 285     'SC_KERNARG',
 286     'SC_NONE',
 287     ]
 288
 289 class RegisterType(Enum): vals = [
 290     'RT_VECTOR',
 291     'RT_SCALAR',
 292     'RT_CONDITION',
 293     'RT_HARDWARE',
 294     'RT_NONE',
 295     ]
 296
 297 class GenericMemoryOrder(Enum): vals = [
 298     'MEMORY_ORDER_NONE',
 299     'MEMORY_ORDER_RELAXED',
 300     'MEMORY_ORDER_SC_ACQUIRE',
 301     'MEMORY_ORDER_SC_RELEASE',
 302     'MEMORY_ORDER_SC_ACQUIRE_RELEASE',
 303     ]
 304
 305 class GenericMemoryScope(Enum): vals = [
 306     'MEMORY_SCOPE_NONE',
 307     'MEMORY_SCOPE_WORKITEM',
 308     'MEMORY_SCOPE_WAVEFRONT',
 309     'MEMORY_SCOPE_WORKGROUP',
 310     'MEMORY_SCOPE_DEVICE',
 311     'MEMORY_SCOPE_SYSTEM',
 312     ]