gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

[gem5.git] / src / gpu-compute / GPU.py
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py

index 7eaf65fec16e7437e5e3abec936547fa41956907..6b033f40366e059fe4cfb52d5f789f28c87d94a8 100644 (file)
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -1,48 +1,48 @@
+# Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
+# All rights reserved.
  #
-#  Copyright (c) 2015 Advanced Micro Devices, Inc.
-#  All rights reserved.
+# For use for simulation and test purposes only
  #
-#  For use for simulation and test purposes only
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
  #
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions are met:
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
  #
-#  1. Redistributions of source code must retain the above copyright notice,
-#  this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
  #
-#  2. Redistributions in binary form must reproduce the above copyright notice,
-#  this list of conditions and the following disclaimer in the documentation
-#  and/or other materials provided with the distribution.
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
  #
-#  3. Neither the name of the copyright holder nor the names of its contributors
-#  may be used to endorse or promote products derived from this software
-#  without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-#  POSSIBILITY OF SUCH DAMAGE.
-#
-#  Author: Steve Reinhardt
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
  #
+# Authors: Steve Reinhardt
  
  from m5.defines import buildEnv
  from m5.params import *
  from m5.proxy import *
  from m5.SimObject import SimObject
  
+from m5.objects.Bridge import Bridge
  from m5.objects.ClockedObject import ClockedObject
  from m5.objects.Device import DmaDevice
-from m5.objects.Process import EmulatedDriver
-from m5.objects.Bridge import Bridge
+from m5.objects.HSADevice import HSADevice
+from m5.objects.HSADriver import HSADriver
  from m5.objects.LdsState import LdsState
+from m5.objects.Process import EmulatedDriver
  
  class PrefetchType(Enum): vals = [
      'PF_CU',
@@ -52,15 +52,48 @@ class PrefetchType(Enum): vals = [
      'PF_END',
      ]
  
-class VectorRegisterFile(SimObject):
+class PoolManager(SimObject):
+    type = 'PoolManager'
+    abstract = True
+    cxx_header = "gpu-compute/pool_manager.hh"
+
+    min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
+    pool_size = Param.Int(2048, 'number of vector registers per SIMD')
+
+# The simple pool manager only allows one workgroup to
+# be executing on a CU at any given time.
+class SimplePoolManager(PoolManager):
+    type = 'SimplePoolManager'
+    cxx_class = 'SimplePoolManager'
+    cxx_header = "gpu-compute/simple_pool_manager.hh"
+
+class RegisterFile(SimObject):
+    type = 'RegisterFile'
+    cxx_class = 'RegisterFile'
+    cxx_header = 'gpu-compute/register_file.hh'
+
+    simd_id = Param.Int(-1, 'SIMD ID associated with this Register File')
+    num_regs = Param.Int(2048, 'number of registers in this RF')
+    wf_size = Param.Int(64, 'Wavefront size (in work items)')
+
+class ScalarRegisterFile(RegisterFile):
+    type = 'ScalarRegisterFile'
+    cxx_class = 'ScalarRegisterFile'
+    cxx_header = 'gpu-compute/scalar_register_file.hh'
+
+class VectorRegisterFile(RegisterFile):
      type = 'VectorRegisterFile'
      cxx_class = 'VectorRegisterFile'
      cxx_header = 'gpu-compute/vector_register_file.hh'
  
-    simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
-    num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
-    wfSize = Param.Int(64, 'Wavefront size (in work items)')
-    min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
+class RegisterManager(SimObject):
+    type = 'RegisterManager'
+    cxx_class = 'RegisterManager'
+    cxx_header = 'gpu-compute/register_manager.hh'
+
+    policy = Param.String("static", "Register Manager Policy")
+    vrf_pool_managers = VectorParam.PoolManager('VRF Pool Managers')
+    srf_pool_managers = VectorParam.PoolManager('SRF Pool Managers')
  
  class Wavefront(SimObject):
      type = 'Wavefront'
@@ -69,45 +102,68 @@ class Wavefront(SimObject):
  
      simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
      wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
-    wfSize = Param.Int(64, 'Wavefront size (in work items)')
+    wf_size = Param.Int(64, 'Wavefront size (in work items)')
+    max_ib_size = Param.Int(13, 'Maximum size (in number of insts) of the '
+                            'instruction buffer (IB).')
  
+# Most of the default values here are obtained from the
+# AMD Graphics Core Next (GCN) Architecture whitepaper.
  class ComputeUnit(ClockedObject):
      type = 'ComputeUnit'
      cxx_class = 'ComputeUnit'
      cxx_header = 'gpu-compute/compute_unit.hh'
  
      wavefronts = VectorParam.Wavefront('Number of wavefronts')
-    wfSize = Param.Int(64, 'Wavefront size (in work items)')
+    # Wavefront size is 64. This is configurable; however, changing
+    # this value to anything other than 64 will likely cause errors.
+    wf_size = Param.Int(64, 'Wavefront size (in work items)')
      num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
+    num_scalar_cores = Param.Int(1, 'number of Scalar cores per CU')
+    num_scalar_mem_pipes = Param.Int(1, 'number of Scalar memory pipelines '\
+                                     'per CU')
+    simd_width = Param.Int(16, 'width (number of lanes) per SIMD unit')
+
+    operand_network_length = Param.Int(1, 'number of pipe stages of operand '\
+                                          'network')
  
      spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
                                          'latency')
  
-    dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
+    dpbypass_pipe_length = Param.Int(4, 'vector ALU Double Precision bypass '\
                                          'latency')
-
+    scalar_pipe_length = Param.Int(1, 'number of pipe stages per scalar ALU')
      issue_period = Param.Int(4, 'number of cycles per issue period')
+
+    vrf_gm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\
+                                      'GM bus')
+    srf_scm_bus_latency = Param.Int(1, 'number of cycles per use of SRF '\
+                                       'to Scalar Mem bus')
+    vrf_lm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\
+                                      'LM bus')
+
      num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU')
      num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU')
-    n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
-    mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\
-                                "Represents the pipeline to reach the TCP and "\
-                                "specified in GPU clock cycles")
-    mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
-                                 "cu. Represents the pipeline between the TCP "\
-                                 "and cu as well as TCP data array access. "\
-                                 "Specified in GPU clock cycles")
+    n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
+    mem_req_latency = Param.Int(50, "Latency for request from the cu to ruby. "\
+                                "Represents the pipeline to reach the TCP "\
+                                "and specified in GPU clock cycles")
+    mem_resp_latency = Param.Int(50, "Latency for responses from ruby to the "\
+                                 "cu. Represents the pipeline between the "\
+                                 "TCP and cu as well as TCP data array "\
+                                 "access. Specified in GPU clock cycles")
      system = Param.System(Parent.any, "system object")
      cu_id = Param.Int('CU id')
-    vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
-                                           "in bytes")
-    coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
-                                           "in bytes")
+    vrf_to_coalescer_bus_width = Param.Int(64, "VRF->Coalescer data bus "\
+                                           "width in bytes")
+    coalescer_to_vrf_bus_width = Param.Int(64, "Coalescer->VRF data bus "\
+                                           "width  in bytes")
  
      memory_port = VectorMasterPort("Port to the memory system")
      translation_port = VectorMasterPort('Port to the TLB hierarchy')
      sqc_port = MasterPort("Port to the SQC (I-cache")
      sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
+    scalar_port = MasterPort("Port to the scalar data cache")
+    scalar_tlb_port = MasterPort("Port to the TLB for the scalar data cache")
      perLaneTLB = Param.Bool(False, "enable per-lane TLB")
      prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\
                                 "(0 turns off prefetching)")
@@ -116,19 +172,22 @@ class ComputeUnit(ClockedObject):
                                              "from last mem req in lane of "\
                                              "CU|Phase|Wavefront")
      execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy");
-    xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.");
      debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
      functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
  
      localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
                                          "kernel end")
  
-    countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\
-                                   "and how many times")
+    countPages = Param.Bool(False, "Generate per-CU file of all pages "\
+                            "touched and how many times")
+    scalar_mem_queue_size = Param.Int(32, "Number of entries in scalar "\
+                                      "memory pipeline's queues")
      global_mem_queue_size = Param.Int(256, "Number of entries in the global "
                                        "memory pipeline's queues")
      local_mem_queue_size = Param.Int(256, "Number of entries in the local "
                                        "memory pipeline's queues")
+    max_wave_requests = Param.Int(64, "number of pending vector memory "\
+                                      "requests per wavefront")
      max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"\
                              " of instructions that can be sent to coalescer")
      ldsBus = Bridge() # the bridge between the CU and its LDS
@@ -137,72 +196,54 @@ class ComputeUnit(ClockedObject):
  
      vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
                                                            "file")
+
+    scalar_register_file = VectorParam.ScalarRegisterFile("Scalar register "\
+                                                          "file")
      out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery"
                                              " in the GM pipeline")
+    register_manager = Param.RegisterManager("Register Manager")
+    fetch_depth = Param.Int(2, 'number of i-cache lines that may be '
+                            'buffered in the fetch unit.')
  
  class Shader(ClockedObject):
      type = 'Shader'
      cxx_class = 'Shader'
      cxx_header = 'gpu-compute/shader.hh'
-
      CUs = VectorParam.ComputeUnit('Number of compute units')
-    n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
+    gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU')
+    dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher')
+    n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
      impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
-                                                  ruby at kernel boundaries""")
-    separate_acquire_release = Param.Bool(False,
-        """Do ld_acquire/st_release generate separate requests for the
-        acquire and release?""")
+                                         ruby at kernel boundaries""")
      globalmem = Param.MemorySize('64kB', 'Memory size')
      timing = Param.Bool(False, 'timing memory accesses')
  
      cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
      translation = Param.Bool(False, "address translation");
+    timer_period = Param.Clock('10us', "system timer period")
+    idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold")
+    max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting")
  
-class ClDriver(EmulatedDriver):
-    type = 'ClDriver'
-    cxx_header = 'gpu-compute/cl_driver.hh'
-    codefile = VectorParam.String('code file name(s)')
+class GPUComputeDriver(HSADriver):
+    type = 'GPUComputeDriver'
+    cxx_header = 'gpu-compute/gpu_compute_driver.hh'
  
-class GpuDispatcher(DmaDevice):
-    type = 'GpuDispatcher'
+class GPUDispatcher(SimObject):
+    type = 'GPUDispatcher'
      cxx_header = 'gpu-compute/dispatcher.hh'
-    # put at 8GB line for now
-    pio_addr = Param.Addr(0x200000000, "Device Address")
-    pio_latency = Param.Latency('1ns', "Programmed IO latency")
-    shader_pointer = Param.Shader('pointer to shader')
-    translation_port = MasterPort('Port to the dispatcher TLB')
-    cpu = Param.BaseCPU("CPU to wake up on kernel completion")
-
-    cl_driver = Param.ClDriver('pointer to driver')
-
-class MemType(Enum): vals = [
-    'M_U8',
-    'M_U16',
-    'M_U32',
-    'M_U64',
-    'M_S8',
-    'M_S16',
-    'M_S32',
-    'M_S64',
-    'M_F16',
-    'M_F32',
-    'M_F64',
-    ]
+
+class GPUCommandProcessor(HSADevice):
+    type = 'GPUCommandProcessor'
+    cxx_header = 'gpu-compute/gpu_command_processor.hh'
+    dispatcher = Param.GPUDispatcher('workgroup dispatcher for the GPU')
  
  class StorageClassType(Enum): vals = [
      'SC_SPILL',
      'SC_GLOBAL',
-    'SC_SHARED',
+    'SC_GROUP',
      'SC_PRIVATE',
      'SC_READONLY',
      'SC_KERNARG',
+    'SC_ARG',
      'SC_NONE',
      ]
-
-class RegisterType(Enum): vals = [
-    'RT_VECTOR',
-    'RT_SCALAR',
-    'RT_CONDITION',
-    'RT_HARDWARE',
-    'RT_NONE',
-    ]