//
// This file has been modified by Kevin Moore and Dan Nussbaum of the
// Scalable Systems Research Group at Sun Microsystems Laboratories
// (http://research.sun.com/scalable/) to support the Adaptive
// Transactional Memory Test Platform (ATMTP).  For information about
// ATMTP, see the GEMS website: http://www.cs.wisc.edu/gems/.
//
// Please send email to atmtp-interest@sun.com with feedback, questions, or
// to request future announcements about ATMTP.
//
// ----------------------------------------------------------------------
//
// File modification date: 2008-02-23
//
// ----------------------------------------------------------------------
//
// ATMTP is distributed as part of the GEMS software toolset and is
// available for use and modification under the terms of version 2 of the
// GNU General Public License.  The GNU General Public License is contained
// in the file $GEMS/LICENSE.
//
// Multifacet GEMS is free software; you can redistribute it and/or modify
// it under the terms of version 2 of the GNU General Public License as
// published by the Free Software Foundation.
//
// Multifacet GEMS is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with Multifacet GEMS; if not, write to the Free Software Foundation,
// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
//
// ----------------------------------------------------------------------
//

g_RANDOM_SEED: 1

g_DEADLOCK_THRESHOLD: 500000

// determines how many Simics cycles advance for every Ruby cycle
// (does not apply when running Opal)
SIMICS_RUBY_MULTIPLIER: 4

// Ruby cycles between when a sequencer issues a request and it arrives at
// the L1 cache controller
//
// ** important ** this parameter determines the L2 hit latency when
// using the SMP protocols with a combined L1/L2 controller (-cache.sm)
//
SEQUENCER_TO_CONTROLLER_LATENCY: 4

// When set to false, the L1 cache structures are probed for a hit in
// Sequencer.C; if a request hits, it is *not* issued to the cache
// controller.  When set to true, all processor data requests issue to the
// cache controller.
//
// ** important ** this parameter must be set to false for proper L1/L2 hit
// timing for the SMP protocols with combined L1/L2 controllers (-cache.sm)
//
REMOVE_SINGLE_CYCLE_DCACHE_FAST_PATH: false

// When running with Opal in SMT configurations, this indicates the number
// of threads per physical processor
g_NUM_SMT_THREADS: 1

// Maximum number of requests (including SW prefetches) outstanding from
// the sequencer (Note: this also includes items buffered in the store
// buffer)
g_SEQUENCER_OUTSTANDING_REQUESTS: 16

PROTOCOL_DEBUG_TRACE: true
DEBUG_FILTER_STRING: none
DEBUG_VERBOSITY_STRING: none
DEBUG_START_TIME: 0
DEBUG_OUTPUT_FILENAME: none

TRANSACTION_TRACE_ENABLED: false
USER_MODE_DATA_ONLY: false
PROFILE_HOT_LINES: false
PROFILE_ALL_INSTRUCTIONS: false
PRINT_INSTRUCTION_TRACE: false
g_DEBUG_CYCLE: 0
BLOCK_STC: false
PERFECT_MEMORY_SYSTEM: false
PERFECT_MEMORY_SYSTEM_LATENCY: 0
DATA_BLOCK: false

// *********************************************
// CACHE & MEMORY PARAMETERS
// *********************************************

L1_CACHE_ASSOC: 4
L1_CACHE_NUM_SETS_BITS: 8
L2_CACHE_ASSOC: 4
L2_CACHE_NUM_SETS_BITS: 16

// Simulated physical memory size.  1073741824 = 2^30 = 1 GB; use
// 4294967296 for a full 32-bit, 4 GB address space.
g_MEMORY_SIZE_BYTES: 1073741824
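// For reference, a back-of-the-envelope check of the cache defaults above
// (assuming the 64-byte line size configured just below):
//   L1 capacity = 4 ways * 2^8 sets  * 64 B = 64 KB per L1
//   L2 capacity = 4 ways * 2^16 sets * 64 B = 16 MB total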
g_DATA_BLOCK_BYTES: 64
g_PAGE_SIZE_BYTES: 4096
g_REPLACEMENT_POLICY: PSEDUO_LRU  // currently, the only other option is LRU

g_PROCS_PER_CHIP: 1

// set automatically
g_NUM_PROCESSORS: 0
g_NUM_L2_BANKS: 0
g_NUM_MEMORIES: 0

// The following group of parameters are calculated.  They must
// _always_ be left at zero.
g_NUM_CHIPS: 0
g_NUM_CHIP_BITS: 0
g_MEMORY_SIZE_BITS: 0
g_DATA_BLOCK_BITS: 0
g_PAGE_SIZE_BITS: 0
g_NUM_PROCESSORS_BITS: 0
g_PROCS_PER_CHIP_BITS: 0
g_NUM_L2_BANKS_BITS: 0
g_NUM_L2_BANKS_PER_CHIP: 0
g_NUM_L2_BANKS_PER_CHIP_BITS: 0
g_NUM_MEMORIES_BITS: 0
g_NUM_MEMORIES_PER_CHIP: 0
g_MEMORY_MODULE_BITS: 0
g_MEMORY_MODULE_BLOCKS: 0

// For certain CMP protocols, determines whether the lowest bits of a block
// address are used to index into an L2 cache bank or into the sets of a
// single bank:
//          lowest                                                 highest
//   true:  g_DATA_BLOCK_BITS | g_NUM_L2_BANKS_PER_CHIP_BITS | L2_CACHE_NUM_SETS_BITS
//   false: g_DATA_BLOCK_BITS | L2_CACHE_NUM_SETS_BITS | g_NUM_L2_BANKS_PER_CHIP_BITS
MAP_L2BANKS_TO_LOWEST_BITS: false

// TIMING PARAMETERS -- many of these are protocol specific.
// See the SLICC files to determine where they apply.
MEMORY_RESPONSE_LATENCY_MINUS_2: 158  // determines memory response latency
DIRECTORY_CACHE_LATENCY: 6
NULL_LATENCY: 1
ISSUE_LATENCY: 2
CACHE_RESPONSE_LATENCY: 12
L1_RESPONSE_LATENCY: 3
L2_RESPONSE_LATENCY: 6
L2_TAG_LATENCY: 6
DIRECTORY_LATENCY: 80
NETWORK_LINK_LATENCY: 1
COPY_HEAD_LATENCY: 4
ON_CHIP_LINK_LATENCY: 1
RECYCLE_LATENCY: 10
L2_RECYCLE_LATENCY: 5
TIMER_LATENCY: 10000
TBE_RESPONSE_LATENCY: 1
PERIODIC_TIMER_WAKEUPS: true

// constants used by CMP protocols
// cache bank access times
L1_REQUEST_LATENCY: 2
L2_REQUEST_LATENCY: 4

// Number of transitions each controller state machine can complete per
// cycle, i.e. the number of ports to each controller.
// L1cache is the sum of the L1I and L1D cache ports.
L1CACHE_TRANSITIONS_PER_RUBY_CYCLE: 32

// Note: if SINGLE_ACCESS_L2_BANKS is enabled, this will probably enforce
// a much greater constraint on the concurrency of an L2 cache bank.
L2CACHE_TRANSITIONS_PER_RUBY_CYCLE: 32

DIRECTORY_TRANSITIONS_PER_RUBY_CYCLE: 32
DMA_TRANSITIONS_PER_RUBY_CYCLE: 1

// Number of TBEs available for demand misses, ALL prefetches, and
// replacements.
// used by one-level protocols
NUMBER_OF_TBES: 128
// two-level protocols
NUMBER_OF_L1_TBES: 32
NUMBER_OF_L2_TBES: 32

// ** INTERCONNECT PARAMETERS **
//
g_PRINT_TOPOLOGY: true
g_NETWORK_TOPOLOGY: HIERARCHICAL_SWITCH
g_CACHE_DESIGN: NUCA
// specifies the file prefix for the FILE_SPECIFIED topology
FAN_OUT_DEGREE: 4  // for the HIERARCHICAL_SWITCH topology
g_adaptive_routing: true
NUMBER_OF_VIRTUAL_NETWORKS: 6

// The bandwidth unit is 1/1000 byte per cycle; the following parameter is
// multiplied by topology-specific link weights (10000 => 10 bytes per
// cycle per endpoint).
g_endpoint_bandwidth: 10000

// ** finite buffering parameters
//
// Note: finite buffering allows us to simulate a realistic virtual
// cut-through routed network with idealized flow control.  This feature
// is NOT heavily tested.
FINITE_BUFFERING: false
// All message buffers within the network (i.e. the switches' input and
// output buffers) are set to the size specified below by
// FINITE_BUFFER_SIZE.
FINITE_BUFFER_SIZE: 3
// g_SEQUENCER_OUTSTANDING_REQUESTS (above) controls the number of demand
// requests issued by the sequencer; PROCESSOR_BUFFER_SIZE controls the
// number of requests in the mandatory queue.
// This only affects the simulation when FINITE_BUFFERING is enabled.
PROCESSOR_BUFFER_SIZE: 10
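// For illustration only (the numbers are just the defaults above): with
// FINITE_BUFFERING: true, the mandatory queue would hold at most
// PROCESSOR_BUFFER_SIZE (10) requests, even though the sequencer itself
// may have g_SEQUENCER_OUTSTANDING_REQUESTS (16) demand requests in
// flight.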
// The PROTOCOL_BUFFER_SIZE limits the size of all other buffers connecting
// to controllers.  It also controls the number of requests issued by the
// L2 HW prefetcher.
PROTOCOL_BUFFER_SIZE: 32

// ** end finite buffering parameters

// (deprecated)
// Allows only a single access to a multi-cycle L2 bank.  Ensures the cache
// array is accessed only once every L2_REQUEST_LATENCY cycles; the TBE
// table, however, can be accessed in parallel.
SINGLE_ACCESS_L2_BANKS: true

// MOESI_CMP_token parameters (some might be deprecated)
g_FILTERING_ENABLED: false
g_DISTRIBUTED_PERSISTENT_ENABLED: true
g_RETRY_THRESHOLD: 1
g_DYNAMIC_TIMEOUT_ENABLED: true
g_FIXED_TIMEOUT_LATENCY: 300

// tester parameters (overridden by testerconfig.defaults)
//
// injects random message delays to excite protocol races
RANDOMIZATION: false
g_SYNTHETIC_DRIVER: false
g_DETERMINISTIC_DRIVER: false
g_trace_warmup_length: 1000000
g_bash_bandwidth_adaptive_threshold: 0.75

g_tester_length: 0
// # of synthetic locks == 16 * 128
g_synthetic_locks: 2048
g_deterministic_addrs: 1
g_SpecifiedGenerator: DetermInvGenerator
g_callback_counter: 0
g_NUM_COMPLETIONS_BEFORE_PASS: 0

// parameters used by the locking synthetic tester
g_think_time: 5
g_hold_time: 5
g_wait_time: 5

// Princeton Network (Garnet)
g_GARNET_NETWORK: true
g_DETAIL_NETWORK: false
g_NETWORK_TESTING: false
g_FLIT_SIZE: 16
g_NUM_PIPE_STAGES: 4
g_VCS_PER_CLASS: 4
g_BUFFER_SIZE: 4

///////////////////////////////////////////////////////////////////////////////
//
// MemoryControl:

// Basic cycle time of the memory controller.  This defines the period used
// as the memory channel clock period, the address bus bit time, and the
// memory controller cycle time.
// Assuming a 200 MHz memory channel (DDR-400, which transfers data at
// 400 Mbit/s per pin) and a 2 GHz Ruby clock, the multiplier is
// 2 GHz / 200 MHz = 10:
MEM_BUS_CYCLE_MULTIPLIER: 10

// How many internal banks in each DRAM chip:
BANKS_PER_RANK: 8

// How many sets of DRAM chips per DIMM.
RANKS_PER_DIMM: 2

// How many DIMMs per channel.  (Currently the only thing that matters is
// the number of ranks per channel, i.e. the product of this parameter and
// RANKS_PER_DIMM.  But if and when this is expanded to handle FB-DIMMs,
// the distinction between the two will matter.)
DIMMS_PER_CHANNEL: 2

// Which bits to use to find the bank, rank, and DIMM numbers.  You could
// choose to have the bank bits, rank bits, and DIMM bits in any order;
// here they are in that order.
// For these defaults, we assume this format for addresses:
//   Offset within line:  [5:0]
//   Memory controller #: [7:6]
//   Bank:                [10:8]
//   Rank:                [11]
//   DIMM:                [12]
//   Row addr / Col addr: [top:13]
// If you get these bits wrong, some banks won't see any requests; you
// need to check for this in the .stats output.
BANK_BIT_0: 8
RANK_BIT_0: 11
DIMM_BIT_0: 12
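// A quick consistency check of the bit assignments above (illustrative,
// not enforced by the simulator): BANKS_PER_RANK: 8 needs log2(8) = 3
// bank bits, so with BANK_BIT_0: 8 the bank field occupies [10:8];
// RANKS_PER_DIMM: 2 needs one rank bit at [11]; and DIMMS_PER_CHANNEL: 2
// needs one DIMM bit at [12], matching the address format sketched above.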
// Maximum number of entries in each bank queue; set to whatever you want.
// If it is too small, you will see in the .stats file a lot of delay time
// spent in the common input queue.
BANK_QUEUE_SIZE: 12

// Bank cycle time (tRC) measured in memory cycles:
BANK_BUSY_TIME: 11

// How many memory address cycles to delay between reads to different
// ranks of DRAMs, to allow for clock skew:
RANK_RANK_DELAY: 1

// How many memory address cycles to delay between a read and a write.
// This is based on two things: (1) the data bus is used one cycle earlier
// in the operation; (2) a round-trip wire delay from the controller to
// the DIMM that did the reading.
READ_WRITE_DELAY: 2

// Basic address and data bus occupancy.  If you are assuming a
// 16-byte-wide data bus (pairs of DIMMs side-by-side), then the data bus
// occupancy matches the address bus occupancy at two cycles.  But if the
// channel is only 8 bytes wide, you need to increase this bus occupancy
// time to 4 cycles.
BASIC_BUS_BUSY_TIME: 2

// Latency to return a read request or writeback acknowledgement.
// Measured in memory address cycles.
// This equals tRCD + CL + AL + (four bit times)
//                  + (round trip on channel)
//                  + (memory control internal delays)
// It's going to be an approximation, so pick what you like.
// Note: The fact that this latency is a constant, and does not depend on
// the two low-order address bits, implies that our memory controller
// either (a) tells the DRAM to read the critical word first, and sends
// the critical word first back to the CPU, or (b) waits until it has seen
// all four bit times on the data wires before sending anything back.
// Either is plausible.  If (a), remove the "four bit times" term from the
// calculation above.
MEM_CTL_LATENCY: 12

// refresh_period is the number of memory cycles between the refresh of
// row x in bank n and the refresh of row x+1 in bank n.  For DDR-400,
// this is typically 7.8 usec for commercial systems; after 8192 such
// refreshes, the whole chip has been refreshed in 64 msec.  With a 5 nsec
// memory clock, 7800 ns / 5 ns = 1560 cycles.  The memory controller will
// divide this by the total number of banks and kick off a refresh to
// *somebody* every time that amount is counted down to zero.  (There will
// be some rounding error, but it should have minimal effect.)
REFRESH_PERIOD: 1560

// tFAW is a DRAM chip parameter which restricts the number of activates
// that can be done within a certain window of time.  The window is
// specified here in terms of memory controller cycles.  At most four
// activates may be done during any such sliding window.  If this number
// is set to no more than 4 * BASIC_BUS_BUSY_TIME, it will have no effect.
// It is typical in real systems for tFAW to have no effect, but it may be
// useful in throttling power.  Set to zero to ignore.
TFAW: 0

// By default, the memory controller uses round-robin arbitration between
// ready bank queues for use of the address bus.  If you wish to add
// randomness to the system, set this parameter to one instead, and it
// will restart the round-robin pointer at a random bank number each
// cycle.  If you want additional nondeterminism, set the parameter to
// some integer n >= 2, and it will in addition add an n% chance each
// cycle that a ready bank will be delayed an additional cycle.  Note that
// if you are in MEM_FIXED_DELAY mode (see below), MEM_RANDOM_ARBITRATE=1
// will have no effect, but MEM_RANDOM_ARBITRATE=2 or more will.
MEM_RANDOM_ARBITRATE: 0

// The following parameter, if nonzero, disables the memory controller and
// instead gives every request a fixed latency.  The nonzero value
// specified here is measured in memory cycles and is simply added to
// MEM_CTL_LATENCY.  It will also show up in the stats file as a
// contributor to memory_delays_stalled_at_head_of_bank_queue.
MEM_FIXED_DELAY: 0
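// Hypothetical settings, for illustration only: MEM_RANDOM_ARBITRATE: 5
// would restart the round-robin pointer at a random bank each cycle and
// delay a ready bank an extra cycle with 5% probability each cycle, while
// MEM_FIXED_DELAY: 20 would bypass the detailed controller and charge
// every request a flat 20 + MEM_CTL_LATENCY memory cycles.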
// If instead of DDR-400 you wanted DDR-800, the channel gets faster, but
// the basic operation of the DRAM core is unchanged.  Busy times appear
// to double simply because they are measured in smaller clock cycles; the
// performance advantage comes because the bus busy times don't actually
// quite double.  You would use something like these values:
//
// MEM_BUS_CYCLE_MULTIPLIER: 5
// BANK_BUSY_TIME: 22
// RANK_RANK_DELAY: 2
// READ_WRITE_DELAY: 3
// BASIC_BUS_BUSY_TIME: 3
// MEM_CTL_LATENCY: 20
// REFRESH_PERIOD: 3120
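// A rough sanity check of those DDR-800 numbers (derived from the DDR-400
// reasoning above, not taken from the simulator): the channel clock
// doubles to 400 MHz, so MEM_BUS_CYCLE_MULTIPLIER drops from
// 2 GHz / 200 MHz = 10 to 2 GHz / 400 MHz = 5, while quantities fixed in
// nanoseconds double when measured in the shorter 2.5 ns cycles, e.g.
// BANK_BUSY_TIME (tRC) 11 -> 22 and REFRESH_PERIOD 1560 -> 3120
// (7800 ns / 2.5 ns).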