configs/example/ruby_gpu_random_test.py

   1 # Copyright (c) 2018-2020 Advanced Micro Devices, Inc.
   2 # All rights reserved.
   3 #
   4 # For use for simulation and test purposes only
   5 #
   6 # Redistribution and use in source and binary forms, with or without
   7 # modification, are permitted provided that the following conditions are met:
   8 #
   9 # 1. Redistributions of source code must retain the above copyright notice,
  10 # this list of conditions and the following disclaimer.
  11 #
  12 # 2. Redistributions in binary form must reproduce the above copyright notice,
  13 # this list of conditions and the following disclaimer in the documentation
  14 # and/or other materials provided with the distribution.
  15 #
  16 # 3. Neither the name of the copyright holder nor the names of its
  17 # contributors may be used to endorse or promote products derived from this
  18 # software without specific prior written permission.
  19 #
  20 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  21 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  24 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30 # POSSIBILITY OF SUCH DAMAGE.
  31
  32 import m5
  33 from m5.objects import *
  34 from m5.defines import buildEnv
  35 from m5.util import addToPath
  36 import os, optparse, sys
  37
  38 addToPath('../')
  39
  40 from common import Options
  41 from ruby import Ruby
  42
  43 #
  44 # Add the ruby specific and protocol specific options
  45 #
  46 parser = optparse.OptionParser()
  47 Options.addNoISAOptions(parser)
  48 Ruby.define_options(parser)
  49
  50 # GPU Ruby tester options
  51 parser.add_option("--cache-size", type="choice", default="small",
  52                   choices=["small", "large"],
  53                   help="Cache sizes to use. Small encourages races between \
  54                         requests and writebacks. Large stresses write-through \
  55                         and/or write-back GPU caches.")
  56 parser.add_option("--system-size", type="choice", default="small",
  57                   choices=["small", "medium", "large"],
  58                   help="This option defines how many CUs, CPUs and cache \
  59                         components in the test system.")
  60 parser.add_option("--address-range", type="choice", default="small",
  61                   choices=["small", "large"],
  62                   help="This option defines the number of atomic \
  63                         locations that affects the working set's size. \
  64                         A small number of atomic locations encourage more \
  65                         races among threads. The large option stresses cache \
  66                         resources.")
  67 parser.add_option("--episode-length", type="choice", default="short",
  68                   choices=["short", "medium", "long"],
  69                   help="This option defines the number of LDs and \
  70                         STs in an episode. The small option encourages races \
  71                         between the start and end of an episode. The long \
  72                         option encourages races between LDs and STs in the \
  73                         same episode.")
  74 parser.add_option("--test-length", type="int", default=1,
  75                   help="The number of episodes to be executed by each \
  76                         wavefront. This determines the maximum number, i.e., \
  77                         val X #WFs, of episodes to be executed in the test.")
  78 parser.add_option("--debug-tester", action='store_true',
  79                   help="This option will turn on DRF checker")
  80 parser.add_option("--random-seed", type="int", default=0,
  81                   help="Random seed number. Default value (i.e., 0) means \
  82                         using runtime-specific value")
  83 parser.add_option("--log-file", type="string", default="gpu-ruby-test.log")
  84
  85 (options, args) = parser.parse_args()
  86
  87 if args:
  88      print("Error: script doesn't take any positional arguments")
  89      sys.exit(1)
  90
  91 #
  92 # Set up cache size - 2 options
  93 #   0: small cache
  94 #   1: large cache
  95 #
  96 if (options.cache_size == "small"):
  97     options.tcp_size="256B"
  98     options.tcp_assoc=2
  99     options.tcc_size="1kB"
 100     options.tcc_assoc=2
 101 elif (options.cache_size == "large"):
 102     options.tcp_size="256kB"
 103     options.tcp_assoc=16
 104     options.tcc_size="1024kB"
 105     options.tcc_assoc=16
 106
 107 #
 108 # Set up system size - 3 options
 109 #
 110 if (options.system_size == "small"):
 111     # 1 CU, 1 CPU, 1 SQC, 1 Scalar
 112     options.wf_size = 1
 113     options.wavefronts_per_cu = 1
 114     options.num_cpus = 1
 115     options.cu_per_sqc = 1
 116     options.cu_per_scalar_cache = 1
 117     options.num_compute_units = 1
 118 elif (options.system_size == "medium"):
 119     # 4 CUs, 4 CPUs, 1 SQCs, 1 Scalars
 120     options.wf_size = 16
 121     options.wavefronts_per_cu = 4
 122     options.num_cpus = 4
 123     options.cu_per_sqc = 4
 124     options.cu_per_scalar_cache = 4
 125     options.num_compute_units = 4
 126 elif (options.system_size == "large"):
 127     # 8 CUs, 4 CPUs, 1 SQCs, 1 Scalars
 128     options.wf_size = 32
 129     options.wavefronts_per_cu = 4
 130     options.num_cpus = 4
 131     options.cu_per_sqc = 4
 132     options.cu_per_scalar_cache = 4
 133     options.num_compute_units = 8
 134
 135 #
 136 # Set address range - 2 options
 137 #   level 0: small
 138 #   level 1: large
 139 # Each location corresponds to a 4-byte piece of data
 140 #
 141 options.mem_size = '1024MB'
 142 if (options.address_range == "small"):
 143     num_atomic_locs = 10
 144     num_regular_locs_per_atomic_loc = 10000
 145 elif (options.address_range == "large"):
 146     num_atomic_locs = 100
 147     num_regular_locs_per_atomic_loc = 100000
 148
 149 #
 150 # Set episode length (# of actions per episode) - 3 options
 151 #   0: 10 actions
 152 #   1: 100 actions
 153 #   2: 500 actions
 154 #
 155 if (options.episode_length == "short"):
 156     eps_length = 10
 157 elif (options.episode_length == "medium"):
 158     eps_length = 100
 159 elif (options.episode_length == "long"):
 160     eps_length = 500
 161
 162 #
 163 # Set Ruby and tester deadlock thresholds. Ruby's deadlock detection is the
 164 # primary check for deadlocks. The tester's deadlock threshold detection is
 165 # a secondary check for deadlock. If there is a bug in RubyPort that causes
 166 # a packet not to return to the tester properly, the tester will issue a
 167 # deadlock panic. We set cache_deadlock_threshold < tester_deadlock_threshold
 168 # to detect deadlock caused by Ruby protocol first before one caused by the
 169 # coalescer. Both units are in Ticks
 170 #
 171 options.cache_deadlock_threshold = 1e8
 172 tester_deadlock_threshold = 1e9
 173
 174 # For now we're testing only GPU protocol, so we force num_cpus to be 0
 175 options.num_cpus = 0
 176
 177 # Number of CUs
 178 n_CUs = options.num_compute_units
 179
 180 # Set test length, i.e., number of episodes per wavefront * #WFs.
 181 # Test length can be 1x#WFs, 10x#WFs, 100x#WFs, ...
 182 n_WFs = n_CUs * options.wavefronts_per_cu
 183 max_episodes = options.test_length * n_WFs
 184
 185 # Number of SQC and Scalar caches
 186 assert(n_CUs % options.cu_per_sqc == 0)
 187 n_SQCs = n_CUs // options.cu_per_sqc
 188 options.num_sqc = n_SQCs
 189
 190 assert(options.cu_per_scalar_cache != 0)
 191 n_Scalars = n_CUs // options.cu_per_scalar_cache
 192 options.num_scalar_cache = n_Scalars
 193
 194 #
 195 # Create GPU Ruby random tester
 196 #
 197 tester = ProtocolTester(cus_per_sqc = options.cu_per_sqc,
 198                         cus_per_scalar = options.cu_per_scalar_cache,
 199                         wavefronts_per_cu = options.wavefronts_per_cu,
 200                         workitems_per_wavefront = options.wf_size,
 201                         num_atomic_locations = num_atomic_locs,
 202                         num_normal_locs_per_atomic = \
 203                                           num_regular_locs_per_atomic_loc,
 204                         max_num_episodes = max_episodes,
 205                         episode_length = eps_length,
 206                         debug_tester = options.debug_tester,
 207                         random_seed = options.random_seed,
 208                         log_file = options.log_file)
 209
 210 #
 211 # Create a gem5 system. Note that the memory object isn't actually used by the
 212 # tester, but is included to ensure the gem5 memory size == Ruby memory size
 213 # checks. The system doesn't have real CPUs or CUs. It just has a tester that
 214 # has physical ports to be connected to Ruby
 215 #
 216 system = System(cpu = tester,
 217                 mem_ranges = [AddrRange(options.mem_size)],
 218                 cache_line_size = options.cacheline_size,
 219                 mem_mode = 'timing')
 220
 221 system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
 222 system.clk_domain = SrcClockDomain(clock = options.sys_clock,
 223                                    voltage_domain = system.voltage_domain)
 224
 225 #
 226 # Command processor is not needed for the tester since we don't run real
 227 # kernels. Setting it to zero disables the VIPER protocol from creating
 228 # a command processor and its caches.
 229 #
 230 options.num_cp = 0
 231
 232 #
 233 # Create the Ruby system
 234 #
 235 Ruby.create_system(options, False, system)
 236
 237 #
 238 # The tester is most effective when randomization is turned on and
 239 # artifical delay is randomly inserted on messages
 240 #
 241 system.ruby.randomization = True
 242
 243 # Assert that we got the right number of Ruby ports
 244 assert(len(system.ruby._cpu_ports) == n_CUs + n_SQCs + n_Scalars)
 245
 246 #
 247 # Attach Ruby ports to the tester in the order:
 248 #               cpu_sequencers,
 249 #               vector_coalescers,
 250 #               sqc_sequencers,
 251 #               scalar_sequencers
 252 #
 253 # Note that this requires the protocol to create sequencers in this order
 254 #
 255 print("Attaching ruby ports to the tester")
 256 for i, ruby_port in enumerate(system.ruby._cpu_ports):
 257     ruby_port.no_retry_on_stall = True
 258     ruby_port.using_ruby_tester = True
 259
 260     if i < n_CUs:
 261         tester.cu_vector_ports = ruby_port.in_ports
 262         tester.cu_token_ports = ruby_port.gmTokenPort
 263         tester.max_cu_tokens = 4*n_WFs
 264     elif i < (n_CUs + n_SQCs):
 265         tester.cu_sqc_ports = ruby_port.in_ports
 266     else:
 267         tester.cu_scalar_ports = ruby_port.in_ports
 268
 269     i += 1
 270
 271 #
 272 # No CPU threads are needed for GPU tester
 273 #
 274 tester.cpu_threads = []
 275
 276 #
 277 # Create GPU wavefronts
 278 #
 279 thread_clock = SrcClockDomain(clock = '1GHz',
 280                               voltage_domain = system.voltage_domain)
 281 wavefronts = []
 282 g_thread_idx = 0
 283 print("Creating %i WFs attached to %i CUs" % \
 284                 (n_CUs * tester.wavefronts_per_cu, n_CUs))
 285 for cu_idx in range(n_CUs):
 286     for wf_idx in range(tester.wavefronts_per_cu):
 287         wavefronts.append(GpuWavefront(thread_id = g_thread_idx,
 288                                          cu_id = cu_idx,
 289                                          num_lanes = options.wf_size,
 290                                          clk_domain = thread_clock,
 291                                          deadlock_threshold = \
 292                                                 tester_deadlock_threshold))
 293         g_thread_idx += 1
 294 tester.wavefronts = wavefronts
 295
 296 #
 297 # Run simulation
 298 #
 299 root = Root(full_system = False, system = system)
 300
 301 # Not much point in this being higher than the L1 latency
 302 m5.ticks.setGlobalFrequency('1ns')
 303
 304 # Instantiate configuration
 305 m5.instantiate()
 306
 307 # Simulate until tester completes
 308 exit_event = m5.simulate()
 309
 310 print('Exiting tick: ', m5.curTick())
 311 print('Exiting because ', exit_event.getCause())