# gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model
# [gem5.git] src/gpu-compute/GPU.py
# Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Steve Reinhardt
34 from m5.defines import buildEnv
35 from m5.params import *
36 from m5.proxy import *
37 from m5.SimObject import SimObject
38
39 from m5.objects.Bridge import Bridge
40 from m5.objects.ClockedObject import ClockedObject
41 from m5.objects.Device import DmaDevice
42 from m5.objects.HSADevice import HSADevice
43 from m5.objects.HSADriver import HSADriver
44 from m5.objects.LdsState import LdsState
45 from m5.objects.Process import EmulatedDriver
46
# Prefetch-mode selector consumed by ComputeUnit.prefetch_prev_type below
# (stride taken per-CU, per-phase, per-wavefront, or fixed stride).
# PF_END looks like an end-of-list sentinel, not a usable mode — TODO confirm
# against the C++ side.
class PrefetchType(Enum): vals = [
    'PF_CU',
    'PF_PHASE',
    'PF_WF',
    'PF_STRIDE',
    'PF_END',
    ]
54
# Abstract base for register-pool managers; concrete subclasses (e.g.
# SimplePoolManager below) decide how register ranges are handed to
# wavefronts. Referenced by RegisterManager's pool-manager vectors.
class PoolManager(SimObject):
    type = 'PoolManager'
    abstract = True   # must instantiate a concrete subclass
    cxx_header = "gpu-compute/pool_manager.hh"

    # Pool sizing: smallest per-wavefront allocation and total pool size.
    min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
    pool_size = Param.Int(2048, 'number of vector registers per SIMD')
62
# The simple pool manager only allows one workgroup to
# be executing on a CU at any given time.
class SimplePoolManager(PoolManager):
    type = 'SimplePoolManager'
    cxx_class = 'SimplePoolManager'
    cxx_header = "gpu-compute/simple_pool_manager.hh"
69
# Base register-file model; specialized below as ScalarRegisterFile and
# VectorRegisterFile. One instance is associated with a single SIMD unit.
class RegisterFile(SimObject):
    type = 'RegisterFile'
    cxx_class = 'RegisterFile'
    cxx_header = 'gpu-compute/register_file.hh'

    # -1 means "not yet assigned"; configs set the real SIMD id.
    simd_id = Param.Int(-1, 'SIMD ID associated with this Register File')
    num_regs = Param.Int(2048, 'number of registers in this RF')
    wf_size = Param.Int(64, 'Wavefront size (in work items)')
78
# Scalar (SGPR) register file; inherits all sizing params from RegisterFile.
class ScalarRegisterFile(RegisterFile):
    type = 'ScalarRegisterFile'
    cxx_class = 'ScalarRegisterFile'
    cxx_header = 'gpu-compute/scalar_register_file.hh'
83
# Vector (VGPR) register file; inherits all sizing params from RegisterFile.
class VectorRegisterFile(RegisterFile):
    type = 'VectorRegisterFile'
    cxx_class = 'VectorRegisterFile'
    cxx_header = 'gpu-compute/vector_register_file.hh'
88
# Owns the pool managers that allocate vector and scalar registers to
# wavefronts, under a configurable allocation policy.
class RegisterManager(SimObject):
    type = 'RegisterManager'
    cxx_class = 'RegisterManager'
    cxx_header = 'gpu-compute/register_manager.hh'

    # "static" is the default policy string; other values are interpreted
    # by the C++ RegisterManager — TODO confirm the accepted set.
    policy = Param.String("static", "Register Manager Policy")
    vrf_pool_managers = VectorParam.PoolManager('VRF Pool Managers')
    srf_pool_managers = VectorParam.PoolManager('SRF Pool Managers')
97
# A single wavefront (SIMT thread group) slot; identified by its SIMD unit
# and its slot index within that SIMD.
class Wavefront(SimObject):
    type = 'Wavefront'
    cxx_class = 'Wavefront'
    cxx_header = 'gpu-compute/wavefront.hh'

    simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
    wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
    wf_size = Param.Int(64, 'Wavefront size (in work items)')
    max_ib_size = Param.Int(13, 'Maximum size (in number of insts) of the '\
                            'instruction buffer (IB).')
108
# Most of the default values here are obtained from the
# AMD Graphics Core Next (GCN) Architecture whitepaper.
#
# One GPU compute unit: its SIMD/scalar execution resources, register
# files, memory pipelines, TLB/cache ports, and prefetch knobs.
class ComputeUnit(ClockedObject):
    type = 'ComputeUnit'
    cxx_class = 'ComputeUnit'
    cxx_header = 'gpu-compute/compute_unit.hh'

    wavefronts = VectorParam.Wavefront('Number of wavefronts')
    # Wavefront size is 64. This is configurable, however changing
    # this value to anything other than 64 will likely cause errors.
    wf_size = Param.Int(64, 'Wavefront size (in work items)')
    num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
    num_scalar_cores = Param.Int(1, 'number of Scalar cores per CU')
    num_scalar_mem_pipes = Param.Int(1, 'number of Scalar memory pipelines '
                                     'per CU')
    simd_width = Param.Int(16, 'width (number of lanes) per SIMD unit')

    operand_network_length = Param.Int(1, 'number of pipe stages of operand '
                                       'network')

    spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '
                                     'latency')

    dpbypass_pipe_length = Param.Int(4, 'vector ALU Double Precision bypass '
                                     'latency')
    scalar_pipe_length = Param.Int(1, 'number of pipe stages per scalar ALU')
    issue_period = Param.Int(4, 'number of cycles per issue period')

    # Bus-occupancy latencies between register files and the memory units.
    vrf_gm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '
                                   'GM bus')
    srf_scm_bus_latency = Param.Int(1, 'number of cycles per use of SRF '
                                    'to Scalar Mem bus')
    vrf_lm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '
                                   'LM bus')

    num_global_mem_pipes = Param.Int(1, 'number of global memory pipes per CU')
    num_shared_mem_pipes = Param.Int(1, 'number of shared memory pipes per CU')
    n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
    mem_req_latency = Param.Int(50, "Latency for request from the cu to ruby. "
                                "Represents the pipeline to reach the TCP "
                                "and specified in GPU clock cycles")
    mem_resp_latency = Param.Int(50, "Latency for responses from ruby to the "
                                 "cu. Represents the pipeline between the "
                                 "TCP and cu as well as TCP data array "
                                 "access. Specified in GPU clock cycles")
    system = Param.System(Parent.any, "system object")
    cu_id = Param.Int('CU id')
    vrf_to_coalescer_bus_width = Param.Int(64, "VRF->Coalescer data bus "
                                           "width in bytes")
    coalescer_to_vrf_bus_width = Param.Int(64, "Coalescer->VRF data bus "
                                           "width in bytes")

    memory_port = VectorMasterPort("Port to the memory system")
    translation_port = VectorMasterPort('Port to the TLB hierarchy')
    # Fixed: description previously read "(I-cache" with no closing paren.
    sqc_port = MasterPort("Port to the SQC (I-cache)")
    sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
    scalar_port = MasterPort("Port to the scalar data cache")
    scalar_tlb_port = MasterPort("Port to the TLB for the scalar data cache")
    perLaneTLB = Param.Bool(False, "enable per-lane TLB")
    # Fixed: added the missing space so the adjacent literals concatenate
    # to "... at a time (0 turns off prefetching)" instead of "time(0 ...".
    prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time "
                               "(0 turns off prefetching)")
    prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
    prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "
                                            "from last mem req in lane of "
                                            "CU|Phase|Wavefront")
    # Fixed: dropped a stray trailing semicolon (un-Pythonic, no effect).
    execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy")
    debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
    functionalTLB = Param.Bool(False, "Assume TLB causes no delay")

    localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "
                                 "kernel end")

    countPages = Param.Bool(False, "Generate per-CU file of all pages "
                            "touched and how many times")
    scalar_mem_queue_size = Param.Int(32, "Number of entries in scalar "
                                      "memory pipeline's queues")
    global_mem_queue_size = Param.Int(256, "Number of entries in the global "
                                      "memory pipeline's queues")
    local_mem_queue_size = Param.Int(256, "Number of entries in the local "
                                     "memory pipeline's queues")
    max_wave_requests = Param.Int(64, "number of pending vector memory "
                                  "requests per wavefront")
    max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"
                              " of instructions that can be sent to coalescer")
    ldsBus = Bridge()  # the bridge between the CU and its LDS
    ldsPort = MasterPort("The port that goes to the LDS")
    localDataStore = Param.LdsState("the LDS for this CU")

    vector_register_file = VectorParam.VectorRegisterFile("Vector register "
                                                          "file")

    scalar_register_file = VectorParam.ScalarRegisterFile("Scalar register "
                                                          "file")
    out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery"
                                            " in the GM pipeline")
    register_manager = Param.RegisterManager("Register Manager")
    fetch_depth = Param.Int(2, 'number of i-cache lines that may be '
                            'buffered in the fetch unit.')
207
# Top-level shader object: aggregates the compute units and references the
# GPU command processor and workgroup dispatcher that feed them.
class Shader(ClockedObject):
    type = 'Shader'
    cxx_class = 'Shader'
    cxx_header = 'gpu-compute/shader.hh'
    CUs = VectorParam.ComputeUnit('Number of compute units')
    gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU')
    dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher')
    n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
    impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
                                                  ruby at kernel boundaries""")
    globalmem = Param.MemorySize('64kB', 'Memory size')
    timing = Param.Bool(False, 'timing memory accesses')

    # NULL means no CPU is associated with this shader.
    cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
    # Fixed: dropped a stray trailing semicolon (un-Pythonic, no effect).
    translation = Param.Bool(False, "address translation")
    timer_period = Param.Clock('10us', "system timer period")
    # 0 disables the idle-CU watchdog.
    idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold")
    # 0 means no limit on executed vALU instructions.
    max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting")
226
# Emulated driver for the GPU compute device; specializes HSADriver.
class GPUComputeDriver(HSADriver):
    type = 'GPUComputeDriver'
    cxx_header = 'gpu-compute/gpu_compute_driver.hh'
230
# Workgroup dispatcher; referenced by both Shader and GPUCommandProcessor.
class GPUDispatcher(SimObject):
    type = 'GPUDispatcher'
    cxx_header = 'gpu-compute/dispatcher.hh'
234
# GPU command processor (an HSADevice); forwards work to the dispatcher.
class GPUCommandProcessor(HSADevice):
    type = 'GPUCommandProcessor'
    cxx_header = 'gpu-compute/gpu_command_processor.hh'
    dispatcher = Param.GPUDispatcher('workgroup dispatcher for the GPU')
239
# Storage-class (memory segment) categories for GPU memory accesses:
# spill, global, group (LDS), private, read-only, kernarg, arg, or none.
# Interpretation of each value lives on the C++ side.
class StorageClassType(Enum): vals = [
    'SC_SPILL',
    'SC_GLOBAL',
    'SC_GROUP',
    'SC_PRIVATE',
    'SC_READONLY',
    'SC_KERNARG',
    'SC_ARG',
    'SC_NONE',
    ]