configs, mem: Support running VIPER with GCN3
[gem5.git] / configs / ruby / GPU_VIPER.py
index eeed637d4bd6a5d4f78e987a35b470adf74e08b2..92dcf5e52fb84882da7b58c544c530c9b336ae0e 100644 (file)
@@ -1,47 +1,50 @@
+# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
 #
-#  Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
-#  All rights reserved.
+# For use for simulation and test purposes only
 #
-#  For use for simulation and test purposes only
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions are met:
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
 #
-#  1. Redistributions of source code must retain the above copyright notice,
-#  this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-#  2. Redistributions in binary form must reproduce the above copyright notice,
-#  this list of conditions and the following disclaimer in the documentation
-#  and/or other materials provided with the distribution.
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
 #
-#  3. Neither the name of the copyright holder nor the names of its contributors
-#  may be used to endorse or promote products derived from this software
-#  without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-#  POSSIBILITY OF SUCH DAMAGE.
-#
-#  Author: Lisa Hsu
-#
-
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import six
 import math
 import m5
 from m5.objects import *
 from m5.defines import buildEnv
-from Ruby import create_topology
-from Ruby import send_evicts
+from m5.util import addToPath
+from .Ruby import create_topology
+from .Ruby import send_evicts
+
+addToPath('../')
+
+from topologies.Cluster import Cluster
+from topologies.Crossbar import Crossbar
+from common import FileSystemConfig
 
-from Cluster import Cluster
-from Crossbar import Crossbar
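+# Python 3 removed the 'long' type; alias it so legacy size calculations
+# (e.g. in TCC.create below) keep working under both interpreters.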
+if six.PY3:
+    long = int
 
 class CntrlBase:
     _seqs = 0
@@ -73,7 +76,7 @@ class L1Cache(RubyCache):
     def create(self, size, assoc, options):
         self.size = MemorySize(size)
         self.assoc = assoc
-        self.replacement_policy = PseudoLRUReplacementPolicy()
+        self.replacement_policy = TreePLRURP()
 
 class L2Cache(RubyCache):
     resourceStalls = False
@@ -83,7 +86,7 @@ class L2Cache(RubyCache):
     def create(self, size, assoc, options):
         self.size = MemorySize(size)
         self.assoc = assoc
-        self.replacement_policy = PseudoLRUReplacementPolicy()
+        self.replacement_policy = TreePLRURP()
 
 class CPCntrl(CorePair_Controller, CntrlBase):
 
@@ -134,7 +137,7 @@ class TCPCache(RubyCache):
         self.size = MemorySize(options.tcp_size)
         self.assoc = options.tcp_assoc
         self.resourceStalls = options.no_tcc_resource_stalls
-        self.replacement_policy = PseudoLRUReplacementPolicy()
+        self.replacement_policy = TreePLRURP()
 
 class TCPCntrl(TCP_Controller, CntrlBase):
 
@@ -154,6 +157,11 @@ class TCPCntrl(TCP_Controller, CntrlBase):
         self.coalescer.ruby_system = ruby_system
         self.coalescer.support_inst_reqs = False
         self.coalescer.is_cpu_sequencer = False
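+        # Optionally override the coalescer's deadlock-detection threshold
+        # from the command line.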
+        if options.tcp_deadlock_threshold:
+            self.coalescer.deadlock_threshold = \
+                options.tcp_deadlock_threshold
+        self.coalescer.max_coalesces_per_cycle = \
+            options.max_coalesces_per_cycle
 
         self.sequencer = RubySequencer()
         self.sequencer.version = self.seqCount()
@@ -209,7 +217,7 @@ class SQCCache(RubyCache):
     def create(self, options):
         self.size = MemorySize(options.sqc_size)
         self.assoc = options.sqc_assoc
-        self.replacement_policy = PseudoLRUReplacementPolicy()
+        self.replacement_policy = TreePLRURP()
 
 class SQCCntrl(SQC_Controller, CntrlBase):
 
@@ -228,6 +236,9 @@ class SQCCntrl(SQC_Controller, CntrlBase):
         self.sequencer.ruby_system = ruby_system
         self.sequencer.support_data_reqs = False
         self.sequencer.is_cpu_sequencer = False
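+        # Optionally override the sequencer's deadlock-detection threshold
+        # from the command line.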
+        if options.sqc_deadlock_threshold:
+            self.sequencer.deadlock_threshold = \
+                options.sqc_deadlock_threshold
 
         self.ruby_system = ruby_system
 
@@ -258,7 +269,7 @@ class TCC(RubyCache):
             self.size.value = long(128 * self.assoc)
         self.start_index_bit = math.log(options.cacheline_size, 2) + \
                                math.log(options.num_tccs, 2)
-        self.replacement_policy = PseudoLRUReplacementPolicy()
+        self.replacement_policy = TreePLRURP()
 
 
 class TCCCntrl(TCC_Controller, CntrlBase):
@@ -288,7 +299,7 @@ class L3Cache(RubyCache):
         self.dataAccessLatency = options.l3_data_latency
         self.tagAccessLatency = options.l3_tag_latency
         self.resourceStalls = False
-        self.replacement_policy = PseudoLRUReplacementPolicy()
+        self.replacement_policy = TreePLRURP()
 
 class L3Cntrl(L3Cache_Controller, CntrlBase):
     def create(self, options, ruby_system, system):
@@ -371,6 +382,9 @@ def define_options(parser):
                       help = "SQC cache size")
     parser.add_option("--sqc-assoc", type = 'int', default = 8,
                       help = "SQC cache assoc")
+    parser.add_option("--sqc-deadlock-threshold", type='int',
+                      help="Set the SQC deadlock threshold to some value")
+
     parser.add_option("--WB_L1", action = "store_true", default = False,
                       help = "writeback L1")
     parser.add_option("--WB_L2", action = "store_true", default = False,
@@ -387,10 +401,16 @@ def define_options(parser):
                       help = "tcp size")
     parser.add_option("--tcp-assoc", type = 'int', default = 16,
                       help = "tcp assoc")
+    parser.add_option("--tcp-deadlock-threshold", type='int',
+                      help="Set the TCP deadlock threshold to some value")
+    parser.add_option("--max-coalesces-per-cycle", type="int", default=1,
+                      help="Maximum insts that may coalesce in a cycle");
+
     parser.add_option("--noL1", action = "store_true", default = False,
                       help = "bypassL1")
 
-def create_system(options, full_system, system, dma_devices, ruby_system):
+def create_system(options, full_system, system, dma_devices, bootmem,
+                  ruby_system):
     if buildEnv['PROTOCOL'] != 'GPU_VIPER':
         panic("This script requires the GPU_VIPER protocol to be built.")
 
@@ -427,7 +447,7 @@ def create_system(options, full_system, system, dma_devices, ruby_system):
         mainCluster = Cluster(intBW=crossbar_bw)
     else:
         mainCluster = Cluster(intBW=8) # 16 GB/s
-    for i in xrange(options.num_dirs):
+    for i in range(options.num_dirs):
 
         dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
         dir_cntrl.create(options, ruby_system, system)
@@ -453,6 +473,16 @@ def create_system(options, full_system, system, dma_devices, ruby_system):
 
         dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
         dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
+
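+        # DMA ports on the directory: requests come in from the network's
+        # master port; responses go back out through its slave port.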
+        dir_cntrl.requestFromDMA = MessageBuffer(ordered=True)
+        dir_cntrl.requestFromDMA.slave = ruby_system.network.master
+
+        dir_cntrl.responseToDMA = MessageBuffer()
+        dir_cntrl.responseToDMA.master = ruby_system.network.slave
+
+        dir_cntrl.requestToMemory = MessageBuffer()
         dir_cntrl.responseFromMemory = MessageBuffer()
 
         exec("ruby_system.dir_cntrl%d = dir_cntrl" % i)
@@ -465,7 +495,7 @@ def create_system(options, full_system, system, dma_devices, ruby_system):
         cpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
     else:
         cpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s
-    for i in xrange((options.num_cpus + 1) / 2):
+    for i in range((options.num_cpus + 1) // 2):
 
         cp_cntrl = CPCntrl()
         cp_cntrl.create(options, ruby_system, system)
@@ -497,12 +527,64 @@ def create_system(options, full_system, system, dma_devices, ruby_system):
 
         cpuCluster.add(cp_cntrl)
 
+    # Register CPUs and caches for each CorePair and directory (SE mode only)
+    if not full_system:
+        for i in range((options.num_cpus + 1) // 2):
+            FileSystemConfig.register_cpu(physical_package_id = 0,
+                                          core_siblings = \
+                                            range(options.num_cpus),
+                                          core_id = i*2,
+                                          thread_siblings = [])
+
+            FileSystemConfig.register_cpu(physical_package_id = 0,
+                                          core_siblings = \
+                                            range(options.num_cpus),
+                                          core_id = i*2+1,
+                                          thread_siblings = [])
+
+            FileSystemConfig.register_cache(level = 0,
+                                            idu_type = 'Instruction',
+                                            size = options.l1i_size,
+                                            line_size = options.cacheline_size,
+                                            assoc = options.l1i_assoc,
+                                            cpus = [i*2, i*2+1])
+
+            FileSystemConfig.register_cache(level = 0,
+                                            idu_type = 'Data',
+                                            size = options.l1d_size,
+                                            line_size = options.cacheline_size,
+                                            assoc = options.l1d_assoc,
+                                            cpus = [i*2])
+
+            FileSystemConfig.register_cache(level = 0,
+                                            idu_type = 'Data',
+                                            size = options.l1d_size,
+                                            line_size = options.cacheline_size,
+                                            assoc = options.l1d_assoc,
+                                            cpus = [i*2+1])
+
+            FileSystemConfig.register_cache(level = 1,
+                                            idu_type = 'Unified',
+                                            size = options.l2_size,
+                                            line_size = options.cacheline_size,
+                                            assoc = options.l2_assoc,
+                                            cpus = [i*2, i*2+1])
+
+        for i in range(options.num_dirs):
+            FileSystemConfig.register_cache(level = 2,
+                                            idu_type = 'Unified',
+                                            size = options.l3_size,
+                                            line_size = options.cacheline_size,
+                                            assoc = options.l3_assoc,
+                                            cpus = [n for n in
+                                                range(options.num_cpus)])
+
     gpuCluster = None
     if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
       gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
     else:
       gpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s
-    for i in xrange(options.num_compute_units):
+    for i in range(options.num_compute_units):
 
         tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                              issue_latency = 1,
@@ -541,7 +623,7 @@ def create_system(options, full_system, system, dma_devices, ruby_system):
 
         gpuCluster.add(tcp_cntrl)
 
-    for i in xrange(options.num_sqc):
+    for i in range(options.num_sqc):
 
         sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
         sqc_cntrl.create(options, ruby_system, system)
@@ -567,6 +649,28 @@ def create_system(options, full_system, system, dma_devices, ruby_system):
         # SQC also in GPU cluster
         gpuCluster.add(sqc_cntrl)
 
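+    # GCN3 scalar caches reuse the SQC controller and protocol machinery,
+    # so they are wired to the network the same way as the SQCs above.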
+    for i in range(options.num_scalar_cache):
+        scalar_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
+        scalar_cntrl.create(options, ruby_system, system)
+
+        exec('ruby_system.scalar_cntrl%d = scalar_cntrl' % i)
+
+        cpu_sequencers.append(scalar_cntrl.sequencer)
+
+        scalar_cntrl.requestFromSQC = MessageBuffer(ordered = True)
+        scalar_cntrl.requestFromSQC.master = ruby_system.network.slave
+
+        scalar_cntrl.probeToSQC = MessageBuffer(ordered = True)
+        scalar_cntrl.probeToSQC.slave = ruby_system.network.master
+
+        scalar_cntrl.responseToSQC = MessageBuffer(ordered = True)
+        scalar_cntrl.responseToSQC.slave = ruby_system.network.master
+
+        scalar_cntrl.mandatoryQueue = \
+            MessageBuffer(buffer_size=options.buffers_size)
+
+        gpuCluster.add(scalar_cntrl)
+
-    for i in xrange(options.num_cp):
+    for i in range(options.num_cp):
 
         tcp_ID = options.num_compute_units + i
@@ -621,7 +725,7 @@ def create_system(options, full_system, system, dma_devices, ruby_system):
         # SQC also in GPU cluster
         gpuCluster.add(sqc_cntrl)
 
-    for i in xrange(options.num_tccs):
+    for i in range(options.num_tccs):
 
         tcc_cntrl = TCCCntrl(l2_response_latency = options.TCC_latency)
         tcc_cntrl.create(options, ruby_system, system)
@@ -662,13 +766,27 @@ def create_system(options, full_system, system, dma_devices, ruby_system):
         # TCC cntrls added to the GPU cluster
         gpuCluster.add(tcc_cntrl)
 
-    # Assuming no DMA devices
-    assert(len(dma_devices) == 0)
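+    # Create a DMA sequencer and controller for each DMA device and hook
+    # them into the network; the controllers live in the GPU cluster.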
+    for i, dma_device in enumerate(dma_devices):
+        dma_seq = DMASequencer(version=i, ruby_system=ruby_system)
+        dma_cntrl = DMA_Controller(version=i, dma_sequencer=dma_seq,
+                                   ruby_system=ruby_system)
+        exec('system.dma_cntrl%d = dma_cntrl' % i)
+        if dma_device.type == 'MemTest':
+            exec('system.dma_cntrl%d.dma_sequencer.slave = dma_device.test'
+                 % i)
+        else:
+            exec('system.dma_cntrl%d.dma_sequencer.slave = dma_device.dma' % i)
+        dma_cntrl.requestToDir = MessageBuffer(buffer_size=0)
+        dma_cntrl.requestToDir.master = ruby_system.network.slave
+        dma_cntrl.responseFromDir = MessageBuffer(buffer_size=0)
+        dma_cntrl.responseFromDir.slave = ruby_system.network.master
+        dma_cntrl.mandatoryQueue = MessageBuffer(buffer_size=0)
+        gpuCluster.add(dma_cntrl)
 
     # Add cpu/gpu clusters to main cluster
     mainCluster.add(cpuCluster)
     mainCluster.add(gpuCluster)
 
-    ruby_system.network.number_of_virtual_networks = 10
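+    # One extra virtual network accommodates the new DMA traffic.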
+    ruby_system.network.number_of_virtual_networks = 11
 
     return (cpu_sequencers, dir_cntrl_nodes, mainCluster)