From d228a283c9039f25e2b046ee895950a0eefc28ba Mon Sep 17 00:00:00 2001 From: Wendy Elsasser Date: Tue, 21 May 2019 14:38:53 -0500 Subject: [PATCH] mem: Modify DRAM controller for flexibility and new memories This change includes: 1) Verify available command bandwidth 2) Add support for multi-cycle commands 3) Add new timing parameters 4) Add ability to interleave bursts 5) Add LPDDR5 configurations The DRAM controller historically does not verify contention on the command bus and if there is adequate command bandwidth to issue a new command. As memory technologies evolve, multiple cycles are becoming a requirement for some commands. Depending on the burst length, this can stress the command bandwidth. A check was added to verify command issue does not exceed a maximum value within a defined window. The default window is a burst, with the maximum value defined based on the burst length and media clocking characteristics. When the command bandwidth is exceeded, commands will be shifted to subsequent burst windows. Added support for multi-cycle commands, specifically Activate, which requires a larger address width as capacities grow. Additionally, added support for multi-cycle Read / Write bursts for low power DRAM cases in which additional CLK synchronization may be required to run at higher speeds. To support emerging memories, added the following new timing parameters. 1) tPPD -- Precharge-to-Precharge delay 2) tAAD -- Max delay between Activate-1 and Activate-2 commands I/O data rates are continuing to increase for DRAM but the core frequency is still fairly stagnant for many technologies. As we increase the burst length, either the core prefetch needs to increase (for a seamless burst) or the burst will be transferred with gaps on the data bus. To support the latter case, added the ability to interleave 2 bursts across bank groups. Using the changes above, added an initial set of LPDDR5 configurations. 
Change-Id: I1b14fed221350e6e403f7cbf089fe6c7f033c181 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/26236 Reviewed-by: Matthew Poremba Reviewed-by: Jason Lowe-Power Maintainer: Jason Lowe-Power Tested-by: Gem5 Cloud Project GCB service account <345032938727@cloudbuild.gserviceaccount.com> Tested-by: kokoro --- configs/dram/sweep.py | 13 +- src/mem/DRAMCtrl.py | 325 +++++++++++++++++++++++++++++++++++++++++- src/mem/dram_ctrl.cc | 265 ++++++++++++++++++++++++++++------ src/mem/dram_ctrl.hh | 71 ++++++++- src/mem/drampower.cc | 6 +- 5 files changed, 624 insertions(+), 56 deletions(-) diff --git a/configs/dram/sweep.py b/configs/dram/sweep.py index c2650a72f..d3c86c334 100644 --- a/configs/dram/sweep.py +++ b/configs/dram/sweep.py @@ -1,4 +1,4 @@ -# Copyright (c) 2014-2015, 2018-2019 ARM Limited +# Copyright (c) 2014-2015, 2018-2020 ARM Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -146,7 +146,8 @@ page_size = system.mem_ctrls[0].devices_per_rank.value * \ # match the maximum bandwidth of the memory, the parameter is in seconds # and we need it in ticks (ps) -itt = system.mem_ctrls[0].tBURST.value * 1000000000000 +itt = getattr(system.mem_ctrls[0].tBURST_MIN, 'value', + system.mem_ctrls[0].tBURST.value) * 1000000000000 # assume we start at 0 max_addr = mem_range.end @@ -180,8 +181,8 @@ m5.instantiate() def trace(): addr_map = ObjectList.dram_addr_map_list.get(options.addr_map) generator = dram_generators[options.mode](system.tgen) - for bank in range(1, nbr_banks + 1): - for stride_size in range(burst_size, max_stride + 1, burst_size): + for stride_size in range(burst_size, max_stride + 1, burst_size): + for bank in range(1, nbr_banks + 1): num_seq_pkts = int(math.ceil(float(stride_size) / burst_size)) yield generator(period, 0, max_addr, burst_size, int(itt), int(itt), @@ -194,5 +195,5 @@ system.tgen.start(trace()) m5.simulate() -print("DRAM sweep with burst: %d, banks: %d, max stride: 
%d" % - (burst_size, nbr_banks, max_stride)) +print("DRAM sweep with burst: %d, banks: %d, max stride: %d, request \ + generation period: %d" % (burst_size, nbr_banks, max_stride, itt)) diff --git a/src/mem/DRAMCtrl.py b/src/mem/DRAMCtrl.py index 121d00425..0f70dffec 100644 --- a/src/mem/DRAMCtrl.py +++ b/src/mem/DRAMCtrl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2012-2019 ARM Limited +# Copyright (c) 2012-2020 ARM Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -171,7 +171,17 @@ class DRAMCtrl(QoSMemCtrl): # tBURST is equivalent to the CAS-to-CAS delay (tCCD) # With bank group architectures, tBURST represents the CAS-to-CAS # delay for bursts to different bank groups (tCCD_S) - tBURST = Param.Latency("Burst duration (for DDR burst length / 2 cycles)") + tBURST = Param.Latency("Burst duration " + "(typically burst length / 2 cycles)") + + # tBURST_MAX is the column array cycle delay required before next access, + # which could be greater than tBURST when the memory access time is greater + # than tBURST + tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay") + + # tBURST_MIN is the minimum delay between bursts, which could be less than + # tBURST when interleaving is supported + tBURST_MIN = Param.Latency(Self.tBURST, "Minimim delay between bursts") # CAS-to-CAS delay for bursts to the same bank group # only utilized with bank group architectures; set to 0 for default case @@ -196,6 +206,10 @@ class DRAMCtrl(QoSMemCtrl): # write-to-read, same rank turnaround penalty tWTR = Param.Latency("Write to read, same rank switching time") + # write-to-read, same rank turnaround penalty for same bank group + tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching " + "time, same bank group") + # read-to-write, same rank turnaround penalty tRTW = Param.Latency("Read to write, same rank switching time") @@ -205,6 +219,16 @@ class DRAMCtrl(QoSMemCtrl): # different rank bus delay tCS = 
Param.Latency("Rank to rank switching time") + # minimum precharge to precharge delay time + tPPD = Param.Latency("0ns", "PRE to PRE delay") + + # maximum delay between two-cycle ACT command phases + tAAD = Param.Latency(Self.tCK, + "Maximum delay between two-cycle ACT commands") + + two_cycle_activate = Param.Bool(False, + "Two cycles required to send activate") + # minimum row activate to row activate delay time tRRD = Param.Latency("ACT to ACT delay") @@ -229,6 +253,11 @@ class DRAMCtrl(QoSMemCtrl): # time to exit self-refresh mode with locked DLL tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL") + # number of data beats per clock. with DDR, default is 2, one per edge + beats_per_clock = Param.Unsigned(2, "Data beats per clock") + + data_clock_sync = Param.Bool(False, "Synchronization commands required") + # Currently rolled into other params ###################################################################### @@ -1189,3 +1218,295 @@ class HBM_1000_4H_1x64(HBM_1000_4H_1x128): # self refresh exit time tXS = '65ns' + +# A single LPDDR5 x16 interface (one command/address bus) +# for a single x16 channel with default timings based on +# initial JEDEC specification +# Starting with 5.5Gbps data rates and 8Gbit die +# Configuring for 16-bank mode with bank-group architecture +# burst of 32, which means bursts can be interleaved +class LPDDR5_5500_1x16_BG_BL32(DRAMCtrl): + + # Increase buffer size to account for more bank resources + read_buffer_size = 64 + + # Set page policy to better suit DMC Huxley + page_policy = 'close_adaptive' + + # 16-bit channel interface + device_bus_width = 16 + + # LPDDR5 is a BL16 or BL32 device + # With BG mode, BL16 and BL32 are supported + # Use BL32 for higher command bandwidth + burst_length = 32 + + # size of device in bytes + device_size = '1GB' + + # 2kB page with BG mode + device_rowbuffer_size = '2kB' + + # Use a 1x16 configuration + devices_per_rank = 1 + + # Use a single rank + ranks_per_channel = 1 + + # LPDDR5 
supports configurable bank options + # 8B : BL32, all frequencies + # 16B : BL32 or BL16, <=3.2Gbps + # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps + # Initial configuration will have 16 banks with Bank Group Arch + # to maximim resources and enable higher data rates + banks_per_rank = 16 + bank_groups_per_rank = 4 + + # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK + tCK = '1.455ns' + + # Greater of 2 CK or 18ns + tRCD = '18ns' + + # Base RL is 16 CK @ 687.5 MHz = 23.28ns + tCL = '23.280ns' + + # Greater of 2 CK or 18ns + tRP = '18ns' + + # Greater of 3 CK or 42ns + tRAS = '42ns' + + # Greater of 3 CK or 34ns + tWR = '34ns' + + # active powerdown and precharge powerdown exit time + # Greater of 3 CK or 7ns + tXP = '7ns' + + # self refresh exit time (tRFCab + 7.5ns) + tXS = '217.5ns' + + # Greater of 2 CK or 7.5 ns minus 2 CK + tRTP = '4.59ns' + + # With BG architecture, burst of 32 transferred in two 16-beat + # sub-bursts, with a 16-beat gap in between. + # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz + # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz + tBURST = '8.73ns' + # can interleave a Bstof32 from another bank group at tBURST_MIN + # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz + tBURST_MIN = '2.91ns' + # tBURST_MAX is the maximum burst delay for same bank group timing + # this is 8 CK @ 687.5 MHz + tBURST_MAX = '11.64ns' + + # 8 CK @ 687.5 MHz + tCCD_L = "11.64ns" + + # LPDDR5, 8 Gbit/channel for 280ns tRFCab + tRFC = '210ns' + tREFI = '3.9us' + + # Greater of 4 CK or 6.25 ns + tWTR = '6.25ns' + # Greater of 4 CK or 12 ns + tWTR_L = '12ns' + + # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL + # tWCKDQ0/tCK will be 1 CK for most cases + # For gem5 RL = WL and BL/n is already accounted for with tBURST + # Result is and additional 1 CK is required + tRTW = '1.455ns' + + # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns + tCS = '2.91ns' + + # 2 CK + tPPD = '2.91ns' + + # Greater of 
2 CK or 5 ns + tRRD = '5ns' + tRRD_L = '5ns' + + # With Bank Group Arch mode tFAW is 20 ns + tXAW = '20ns' + activation_limit = 4 + + # at 5Gbps, 4:1 WCK to CK ratio required + # 2 data beats per WCK (DDR) -> 8 per CK + beats_per_clock = 8 + + # 2 cycles required to send activate command + # 2 command phases can be sent back-to-back or + # with a gap up to tAAD = 8 CK + two_cycle_activate = True + tAAD = '11.640ns' + + data_clock_sync = True + +# A single LPDDR5 x16 interface (one command/address bus) +# for a single x16 channel with default timings based on +# initial JEDEC specification +# Starting with 5.5Gbps data rates and 8Gbit die +# Configuring for 16-bank mode with bank-group architecture, burst of 16 +class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32): + + # LPDDR5 is a BL16 or BL32 device + # With BG mode, BL16 and BL32 are supported + # Use BL16 for smaller access granularity + burst_length = 16 + + # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio + tBURST = '2.91ns' + tBURST_MIN = '2.91ns' + # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio + tBURST_MAX = '5.82ns' + + # 4 CK @ 687.5 MHz + tCCD_L = "5.82ns" + + +# A single LPDDR5 x16 interface (one command/address bus) +# for a single x16 channel with default timings based on +# initial JEDEC specification +# Starting with 5.5Gbps data rates and 8Gbit die +# Configuring for 8-bank mode, burst of 32 +class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32): + + # 4kB page with 8B mode + device_rowbuffer_size = '4kB' + + # LPDDR5 supports configurable bank options + # 8B : BL32, all frequencies + # 16B : BL32 or BL16, <=3.2Gbps + # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps + # Select 8B + banks_per_rank = 8 + bank_groups_per_rank = 0 + + # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio + tBURST = '5.82ns' + tBURST_MIN = '5.82ns' + tBURST_MAX = '5.82ns' + + # Greater of 4 CK or 12 ns + tWTR = '12ns' + + # Greater of 2 CK or 10 ns + tRRD 
= '10ns' + + # With 8B mode tFAW is 40 ns + tXAW = '40ns' + activation_limit = 4 + + # Reset BG arch timing for 8B mode + tCCD_L = "0ns" + tRRD_L = "0ns" + tWTR_L = "0ns" + +# A single LPDDR5 x16 interface (one command/address bus) +# for a single x16 channel with default timings based on +# initial JEDEC specification +# 6.4Gbps data rates and 8Gbit die +# Configuring for 16-bank mode with bank-group architecture +# burst of 32, which means bursts can be interleaved +class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32): + + # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK + tCK = '1.25ns' + + # Base RL is 17 CK @ 800 MHz = 21.25ns + tCL = '21.25ns' + + # With BG architecture, burst of 32 transferred in two 16-beat + # sub-bursts, with a 16-beat gap in between. + # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz + # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz + tBURST = '7.5ns' + # can interleave a Bstof32 from another bank group at tBURST_MIN + # 16-beats is 8 WCK @2.3 GHz or 2 CK @ 800 MHz + tBURST_MIN = '2.5ns' + # tBURST_MAX is the maximum burst delay for same bank group timing + # this is 8 CK @ 800 MHz + tBURST_MAX = '10ns' + + # 8 CK @ 800 MHz + tCCD_L = "10ns" + + # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL + # tWCKDQ0/tCK will be 1 CK for most cases + # For gem5 RL = WL and BL/n is already accounted for with tBURST + # Result is and additional 1 CK is required + tRTW = '1.25ns' + + # Default different rank bus delay to 2 CK, @687.5 MHz = 2.5 ns + tCS = '2.5ns' + + # 2 CK + tPPD = '2.5ns' + + # 2 command phases can be sent back-to-back or + # with a gap up to tAAD = 8 CK + tAAD = '10ns' + +# A single LPDDR5 x16 interface (one command/address bus) +# for a single x16 channel with default timings based on initial +# JEDEC specifcation +# 6.4Gbps data rates and 8Gbit die +# Configuring for 16-bank mode with bank-group architecture, burst of 16 +class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32): + + # 
LPDDR5 is a BL16 or BL32 device + # With BG mode, BL16 and BL32 are supported + # Use BL16 for smaller access granularity + burst_length = 16 + + # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio + tBURST = '2.5ns' + tBURST_MIN = '2.5ns' + # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio + tBURST_MAX = '5ns' + + # 4 CK @ 800 MHz + tCCD_L = "5ns" + + +# A single LPDDR5 x16 interface (one command/address bus) +# for a single x16 channel with default timings based on +# initial JEDEC specification +# 6.4Gbps data rates and 8Gbit die +# Configuring for 8-bank mode, burst of 32 +class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32): + + # 4kB page with 8B mode + device_rowbuffer_size = '4kB' + + # LPDDR5 supports configurable bank options + # 8B : BL32, all frequencies + # 16B : BL32 or BL16, <=3.2Gbps + # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps + # Select 8B + banks_per_rank = 8 + bank_groups_per_rank = 0 + + # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio + tBURST = '5ns' + tBURST_MIN = '5ns' + tBURST_MAX = '5ns' + + # Greater of 4 CK or 12 ns + tWTR = '12ns' + + # Greater of 2 CK or 10 ns + tRRD = '10ns' + + # With 8B mode tFAW is 40 ns + tXAW = '40ns' + activation_limit = 4 + + # Reset BG arch timing for 8B mode + tCCD_L = "0ns" + tRRD_L = "0ns" + tWTR_L = "0ns" diff --git a/src/mem/dram_ctrl.cc b/src/mem/dram_ctrl.cc index ed2be4b5f..0a8479eb8 100644 --- a/src/mem/dram_ctrl.cc +++ b/src/mem/dram_ctrl.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2019 ARM Limited + * Copyright (c) 2010-2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -77,12 +77,22 @@ DRAMCtrl::DRAMCtrl(const DRAMCtrlParams* p) : minWritesPerSwitch(p->min_writes_per_switch), writesThisTime(0), readsThisTime(0), tCK(p->tCK), tRTW(p->tRTW), tCS(p->tCS), tBURST(p->tBURST), + tBURST_MIN(p->tBURST_MIN), tCCD_L_WR(p->tCCD_L_WR), tCCD_L(p->tCCD_L), tRCD(p->tRCD), tCL(p->tCL), 
tRP(p->tRP), tRAS(p->tRAS), tWR(p->tWR), tRTP(p->tRTP), tRFC(p->tRFC), tREFI(p->tREFI), tRRD(p->tRRD), - tRRD_L(p->tRRD_L), tXAW(p->tXAW), tXP(p->tXP), tXS(p->tXS), + tRRD_L(p->tRRD_L), tPPD(p->tPPD), tAAD(p->tAAD), tXAW(p->tXAW), + tXP(p->tXP), tXS(p->tXS), + clkResyncDelay(tCL + p->tBURST_MAX), + maxCommandsPerBurst(burstLength / p->beats_per_clock), + dataClockSync(p->data_clock_sync), + twoCycleActivate(p->two_cycle_activate), activationLimit(p->activation_limit), rankToRankDly(tCS + tBURST), wrToRdDly(tCL + tBURST + p->tWTR), rdToWrDly(tRTW + tBURST), + wrToRdDlySameBG(tCL + p->tBURST_MAX + p->tWTR_L), + rdToWrDlySameBG(tRTW + p->tBURST_MAX), + burstInterleave(tBURST != tBURST_MIN), + burstDataCycles(burstInterleave ? p->tBURST_MAX / 2 : tBURST), memSchedPolicy(p->mem_sched_policy), addrMapping(p->addr_mapping), pageMgmt(p->page_policy), maxAccessesPerRow(p->max_accesses_per_row), @@ -104,7 +114,6 @@ DRAMCtrl::DRAMCtrl(const DRAMCtrlParams* p) : readQueue.resize(p->qos_priorities); writeQueue.resize(p->qos_priorities); - for (int i = 0; i < ranksPerChannel; i++) { Rank* rank = new Rank(*this, p, i); ranks.push_back(rank); @@ -668,7 +677,8 @@ DRAMCtrl::processRespondEvent() // track if this is the last packet before idling // and that there are no outstanding commands to this rank if (dram_pkt->rankRef.isQueueEmpty() && - dram_pkt->rankRef.outstandingEvents == 0 && enableDRAMPowerdown) { + dram_pkt->rankRef.outstandingEvents == 0 && + dram_pkt->rankRef.inRefIdleState() && enableDRAMPowerdown) { // verify that there are no events scheduled assert(!dram_pkt->rankRef.activateEvent.scheduled()); assert(!dram_pkt->rankRef.prechargeEvent.scheduled()); @@ -720,6 +730,11 @@ DRAMCtrl::processRespondEvent() DPRINTF(Drain, "DRAM controller done draining\n"); signalDrainDone(); + } else if ((dram_pkt->rankRef.refreshState == REF_PRE) && + !dram_pkt->rankRef.prechargeEvent.scheduled()) { + // kick the refresh event loop into action again if banks already + // closed and 
just waiting for read to complete + schedule(dram_pkt->rankRef.refreshEvent, curTick()); } } @@ -803,8 +818,8 @@ DRAMCtrl::chooseNextFRFCFS(DRAMPacketQueue& queue, Tick extra_col_delay) const Tick col_allowed_at = dram_pkt->isRead() ? bank.rdAllowedAt : bank.wrAllowedAt; - DPRINTF(DRAM, "%s checking packet in bank %d\n", - __func__, dram_pkt->bankRef.bank); + DPRINTF(DRAM, "%s checking packet in bank %d, row %d\n", + __func__, dram_pkt->bankRef.bank, dram_pkt->row); // check if rank is not doing a refresh and thus is available, if not, // jump to the next packet @@ -913,13 +928,127 @@ DRAMCtrl::accessAndRespond(PacketPtr pkt, Tick static_latency) return; } +void +DRAMCtrl::pruneBurstTick() +{ + auto it = burstTicks.begin(); + while (it != burstTicks.end()) { + auto current_it = it++; + if (curTick() > *current_it) { + DPRINTF(DRAM, "Removing burstTick for %d\n", *current_it); + burstTicks.erase(current_it); + } + } +} + +Tick +DRAMCtrl::getBurstWindow(Tick cmd_tick) +{ + // get tick aligned to burst window + Tick burst_offset = cmd_tick % burstDataCycles; + return (cmd_tick - burst_offset); +} + +Tick +DRAMCtrl::verifySingleCmd(Tick cmd_tick) +{ + // start with assumption that there is no contention on command bus + Tick cmd_at = cmd_tick; + + // get tick aligned to burst window + Tick burst_tick = getBurstWindow(cmd_tick); + + // verify that we have command bandwidth to issue the command + // if not, iterate over next window(s) until slot found + while (burstTicks.count(burst_tick) >= maxCommandsPerBurst) { + DPRINTF(DRAM, "Contention found on command bus at %d\n", burst_tick); + burst_tick += burstDataCycles; + cmd_at = burst_tick; + } + + // add command into burst window and return corresponding Tick + burstTicks.insert(burst_tick); + return cmd_at; +} + +Tick +DRAMCtrl::verifyMultiCmd(Tick cmd_tick, Tick max_multi_cmd_split) +{ + // start with assumption that there is no contention on command bus + Tick cmd_at = cmd_tick; + + // get tick aligned to burst window 
+ Tick burst_tick = getBurstWindow(cmd_tick); + + // Command timing requirements are from 2nd command + // Start with assumption that 2nd command will issue at cmd_at and + // find prior slot for 1st command to issue + // Given a maximum latency of max_multi_cmd_split between the commands, + // find the burst at the maximum latency prior to cmd_at + Tick burst_offset = 0; + Tick first_cmd_offset = cmd_tick % burstDataCycles; + while (max_multi_cmd_split > (first_cmd_offset + burst_offset)) { + burst_offset += burstDataCycles; + } + // get the earliest burst aligned address for first command + // ensure that the time does not go negative + Tick first_cmd_tick = burst_tick - std::min(burst_offset, burst_tick); + + // Can required commands issue? + bool first_can_issue = false; + bool second_can_issue = false; + // verify that we have command bandwidth to issue the command(s) + while (!first_can_issue || !second_can_issue) { + bool same_burst = (burst_tick == first_cmd_tick); + auto first_cmd_count = burstTicks.count(first_cmd_tick); + auto second_cmd_count = same_burst ? 
first_cmd_count + 1 : + burstTicks.count(burst_tick); + + first_can_issue = first_cmd_count < maxCommandsPerBurst; + second_can_issue = second_cmd_count < maxCommandsPerBurst; + + if (!second_can_issue) { + DPRINTF(DRAM, "Contention (cmd2) found on command bus at %d\n", + burst_tick); + burst_tick += burstDataCycles; + cmd_at = burst_tick; + } + + // Verify max_multi_cmd_split isn't violated when command 2 is shifted + // If commands initially were issued in same burst, they are + // now in consecutive bursts and can still issue B2B + bool gap_violated = !same_burst && + ((burst_tick - first_cmd_tick) > max_multi_cmd_split); + + if (!first_can_issue || (!second_can_issue && gap_violated)) { + DPRINTF(DRAM, "Contention (cmd1) found on command bus at %d\n", + first_cmd_tick); + first_cmd_tick += burstDataCycles; + } + } + + // Add command to burstTicks + burstTicks.insert(burst_tick); + burstTicks.insert(first_cmd_tick); + + return cmd_at; +} + void DRAMCtrl::activateBank(Rank& rank_ref, Bank& bank_ref, Tick act_tick, uint32_t row) { assert(rank_ref.actTicks.size() == activationLimit); - DPRINTF(DRAM, "Activate at tick %d\n", act_tick); + // verify that we have command bandwidth to issue the activate + // if not, shift to next burst window + Tick act_at; + if (twoCycleActivate) + act_at = verifyMultiCmd(act_tick, tAAD); + else + act_at = verifySingleCmd(act_tick); + + DPRINTF(DRAM, "Activate at tick %d\n", act_at); // update the open row assert(bank_ref.openRow == Bank::NO_ROW); @@ -935,21 +1064,21 @@ DRAMCtrl::activateBank(Rank& rank_ref, Bank& bank_ref, assert(rank_ref.numBanksActive <= banksPerRank); DPRINTF(DRAM, "Activate bank %d, rank %d at tick %lld, now got %d active\n", - bank_ref.bank, rank_ref.rank, act_tick, + bank_ref.bank, rank_ref.rank, act_at, ranks[rank_ref.rank]->numBanksActive); rank_ref.cmdList.push_back(Command(MemCommand::ACT, bank_ref.bank, - act_tick)); + act_at)); - DPRINTF(DRAMPower, "%llu,ACT,%d,%d\n", divCeil(act_tick, tCK) - + 
DPRINTF(DRAMPower, "%llu,ACT,%d,%d\n", divCeil(act_at, tCK) - timeStampOffset, bank_ref.bank, rank_ref.rank); // The next access has to respect tRAS for this bank - bank_ref.preAllowedAt = act_tick + tRAS; + bank_ref.preAllowedAt = act_at + tRAS; // Respect the row-to-column command delay for both read and write cmds - bank_ref.rdAllowedAt = std::max(act_tick + tRCD, bank_ref.rdAllowedAt); - bank_ref.wrAllowedAt = std::max(act_tick + tRCD, bank_ref.wrAllowedAt); + bank_ref.rdAllowedAt = std::max(act_at + tRCD, bank_ref.rdAllowedAt); + bank_ref.wrAllowedAt = std::max(act_at + tRCD, bank_ref.wrAllowedAt); // start by enforcing tRRD for (int i = 0; i < banksPerRank; i++) { @@ -959,13 +1088,13 @@ DRAMCtrl::activateBank(Rank& rank_ref, Bank& bank_ref, // bank group architecture requires longer delays between // ACT commands within the same bank group. Use tRRD_L // in this case - rank_ref.banks[i].actAllowedAt = std::max(act_tick + tRRD_L, + rank_ref.banks[i].actAllowedAt = std::max(act_at + tRRD_L, rank_ref.banks[i].actAllowedAt); } else { // use shorter tRRD value when either // 1) bank group architecture is not supportted // 2) bank is in a different bank group - rank_ref.banks[i].actAllowedAt = std::max(act_tick + tRRD, + rank_ref.banks[i].actAllowedAt = std::max(act_at + tRRD, rank_ref.banks[i].actAllowedAt); } } @@ -975,10 +1104,10 @@ DRAMCtrl::activateBank(Rank& rank_ref, Bank& bank_ref, if (!rank_ref.actTicks.empty()) { // sanity check if (rank_ref.actTicks.back() && - (act_tick - rank_ref.actTicks.back()) < tXAW) { + (act_at - rank_ref.actTicks.back()) < tXAW) { panic("Got %d activates in window %d (%llu - %llu) which " - "is smaller than %llu\n", activationLimit, act_tick - - rank_ref.actTicks.back(), act_tick, + "is smaller than %llu\n", activationLimit, act_at - + rank_ref.actTicks.back(), act_at, rank_ref.actTicks.back(), tXAW); } @@ -987,13 +1116,13 @@ DRAMCtrl::activateBank(Rank& rank_ref, Bank& bank_ref, rank_ref.actTicks.pop_back(); // record an new 
activation (in the future) - rank_ref.actTicks.push_front(act_tick); + rank_ref.actTicks.push_front(act_at); // cannot activate more than X times in time window tXAW, push the // next one (the X + 1'st activate) to be tXAW away from the // oldest in our window of X if (rank_ref.actTicks.back() && - (act_tick - rank_ref.actTicks.back()) < tXAW) { + (act_at - rank_ref.actTicks.back()) < tXAW) { DPRINTF(DRAM, "Enforcing tXAW with X = %d, next activate " "no earlier than %llu\n", activationLimit, rank_ref.actTicks.back() + tXAW); @@ -1008,14 +1137,15 @@ DRAMCtrl::activateBank(Rank& rank_ref, Bank& bank_ref, // at the point when this activate takes place, make sure we // transition to the active power state if (!rank_ref.activateEvent.scheduled()) - schedule(rank_ref.activateEvent, act_tick); - else if (rank_ref.activateEvent.when() > act_tick) + schedule(rank_ref.activateEvent, act_at); + else if (rank_ref.activateEvent.when() > act_at) // move it sooner in time - reschedule(rank_ref.activateEvent, act_tick); + reschedule(rank_ref.activateEvent, act_at); } void -DRAMCtrl::prechargeBank(Rank& rank_ref, Bank& bank, Tick pre_at, bool trace) +DRAMCtrl::prechargeBank(Rank& rank_ref, Bank& bank, Tick pre_tick, + bool auto_or_preall, bool trace) { // make sure the bank has an open row assert(bank.openRow != Bank::NO_ROW); @@ -1026,8 +1156,21 @@ DRAMCtrl::prechargeBank(Rank& rank_ref, Bank& bank, Tick pre_at, bool trace) bank.openRow = Bank::NO_ROW; - // no precharge allowed before this one - bank.preAllowedAt = pre_at; + Tick pre_at = pre_tick; + if (auto_or_preall) { + // no precharge allowed before this one + bank.preAllowedAt = pre_at; + } else { + // Issuing an explicit PRE command + // Verify that we have command bandwidth to issue the precharge + // if not, shift to next burst window + pre_at = verifySingleCmd(pre_tick); + // enforce tPPD + for (int i = 0; i < banksPerRank; i++) { + rank_ref.banks[i].preAllowedAt = std::max(pre_at + tPPD, + 
rank_ref.banks[i].preAllowedAt); + } + } Tick pre_done_at = pre_at + tRP; @@ -1047,6 +1190,7 @@ DRAMCtrl::prechargeBank(Rank& rank_ref, Bank& bank, Tick pre_at, bool trace) DPRINTF(DRAMPower, "%llu,PRE,%d,%d\n", divCeil(pre_at, tCK) - timeStampOffset, bank.bank, rank_ref.rank); } + // if we look at the current number of active banks we might be // tempted to think the DRAM is now idle, however this can be // undone by an activate that is scheduled to happen before we @@ -1068,6 +1212,10 @@ DRAMCtrl::doDRAMAccess(DRAMPacket* dram_pkt) DPRINTF(DRAM, "Timing access to addr %lld, rank/bank/row %d %d %d\n", dram_pkt->addr, dram_pkt->rank, dram_pkt->bank, dram_pkt->row); + // first clean up the burstTick set, removing old entries + // before adding new entries for next burst + pruneBurstTick(); + // get the rank Rank& rank = dram_pkt->rankRef; @@ -1113,9 +1261,36 @@ DRAMCtrl::doDRAMAccess(DRAMPacket* dram_pkt) // the command; need minimum of tBURST between commands Tick cmd_at = std::max({col_allowed_at, nextBurstAt, curTick()}); + // verify that we have command bandwidth to issue the burst + // if not, shift to next burst window + if (dataClockSync && ((cmd_at - rank.lastBurstTick) > clkResyncDelay)) + cmd_at = verifyMultiCmd(cmd_at, tCK); + else + cmd_at = verifySingleCmd(cmd_at); + + // if we are interleaving bursts, ensure that + // 1) we don't double interleave on next burst issue + // 2) we are at an interleave boundary; if not, shift to next boundary + Tick burst_gap = tBURST_MIN; + if (burstInterleave) { + if (cmd_at == (rank.lastBurstTick + tBURST_MIN)) { + // already interleaving, push next command to end of full burst + burst_gap = tBURST; + } else if (cmd_at < (rank.lastBurstTick + tBURST)) { + // not at an interleave boundary after bandwidth check + // Shift command to tBURST boundary to avoid data contention + // Command will remain in the same burstTicks window given that + // tBURST is less than tBURST_MAX + cmd_at = rank.lastBurstTick + tBURST; + } + } + 
DPRINTF(DRAM, "Schedule RD/WR burst at tick %d\n", cmd_at); + // update the packet ready time dram_pkt->readyTime = cmd_at + tCL + tBURST; + rank.lastBurstTick = cmd_at; + // update the time for the next read/write burst for each // bank (add a max with tCCD/tCCD_L/tCCD_L_WR here) Tick dly_to_rd_cmd; @@ -1134,14 +1309,15 @@ DRAMCtrl::doDRAMAccess(DRAMPacket* dram_pkt) // tCCD_L_WR is required for write-to-write // Need to also take bus turnaround delays into account dly_to_rd_cmd = dram_pkt->isRead() ? - tCCD_L : std::max(tCCD_L, wrToRdDly); + tCCD_L : std::max(tCCD_L, wrToRdDlySameBG); dly_to_wr_cmd = dram_pkt->isRead() ? - std::max(tCCD_L, rdToWrDly) : tCCD_L_WR; + std::max(tCCD_L, rdToWrDlySameBG) : + tCCD_L_WR; } else { // tBURST is default requirement for diff BG timing // Need to also take bus turnaround delays into account - dly_to_rd_cmd = dram_pkt->isRead() ? tBURST : wrToRdDly; - dly_to_wr_cmd = dram_pkt->isRead() ? rdToWrDly : tBURST; + dly_to_rd_cmd = dram_pkt->isRead() ? burst_gap : wrToRdDly; + dly_to_wr_cmd = dram_pkt->isRead() ? 
rdToWrDly : burst_gap; } } else { // different rank is by default in a different bank group and @@ -1236,8 +1412,7 @@ DRAMCtrl::doDRAMAccess(DRAMPacket* dram_pkt) MemCommand::WR; // Update bus state to reflect when previous command was issued - nextBurstAt = cmd_at + tBURST; - + nextBurstAt = cmd_at + burst_gap; DPRINTF(DRAM, "Access to %lld, ready at %lld next burst at %lld.\n", dram_pkt->addr, dram_pkt->readyTime, nextBurstAt); @@ -1252,7 +1427,8 @@ DRAMCtrl::doDRAMAccess(DRAMPacket* dram_pkt) if (auto_precharge) { // if auto-precharge push a PRE command at the correct tick to the // list used by DRAMPower library to calculate power - prechargeBank(rank, bank, std::max(curTick(), bank.preAllowedAt)); + prechargeBank(rank, bank, std::max(curTick(), bank.preAllowedAt), + true); DPRINTF(DRAM, "Auto-precharged bank: %d\n", dram_pkt->bankId); } @@ -1689,7 +1865,7 @@ DRAMCtrl::Rank::Rank(DRAMCtrl& _memory, const DRAMCtrlParams* _p, int rank) refreshState(REF_IDLE), inLowPowerState(false), rank(rank), readEntries(0), writeEntries(0), outstandingEvents(0), wakeUpAllowedAt(0), power(_p, false), banks(_p->banks_per_rank), - numBanksActive(0), actTicks(_p->activation_limit, 0), + numBanksActive(0), actTicks(_p->activation_limit, 0), lastBurstTick(0), writeDoneEvent([this]{ processWriteDoneEvent(); }, name()), activateEvent([this]{ processActivateEvent(); }, name()), prechargeEvent([this]{ processPrechargeEvent(); }, name()), @@ -1922,7 +2098,7 @@ DRAMCtrl::Rank::processRefreshEvent() for (auto &b : banks) { if (b.openRow != Bank::NO_ROW) { - memory.prechargeBank(*this, b, pre_at, false); + memory.prechargeBank(*this, b, pre_at, true, false); } else { b.actAllowedAt = std::max(b.actAllowedAt, act_allowed_at); b.preAllowedAt = std::max(b.preAllowedAt, pre_at); @@ -1946,17 +2122,21 @@ DRAMCtrl::Rank::processRefreshEvent() } else { // banks state is closed but haven't transitioned pwrState to IDLE // or have outstanding ACT,RD/WR,Auto-PRE sequence scheduled - // should have 
outstanding precharge event in this case - assert(prechargeEvent.scheduled()); + // should have outstanding precharge or read response event + assert(prechargeEvent.scheduled() || + memory.respondEvent.scheduled()); // will start refresh when pwrState transitions to IDLE } assert(numBanksActive == 0); - // wait for all banks to be precharged, at which point the - // power state machine will transition to the idle state, and - // automatically move to a refresh, at that point it will also - // call this method to get the refresh event loop going again + // wait for all banks to be precharged or read to complete + // When precharge commands are done, power state machine will + // transition to the idle state, and automatically move to a + // refresh, at that point it will also call this method to get + // the refresh event loop going again + // Similarly, when read response completes, if all banks are + // precharged, will call this method to get loop re-started return; } @@ -2612,7 +2792,8 @@ DRAMCtrl::DRAMStats::regStats() avgWrBW = (bytesWritten / 1000000) / simSeconds; avgRdBWSys = (bytesReadSys / 1000000) / simSeconds; avgWrBWSys = (bytesWrittenSys / 1000000) / simSeconds; - peakBW = (SimClock::Frequency / dram.tBURST) * dram.burstSize / 1000000; + peakBW = (SimClock::Frequency / dram.burstDataCycles) * + dram.burstSize / 1000000; busUtil = (avgRdBW + avgWrBW) / peakBW * 100; diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh index 8e026f527..0fe78da4e 100644 --- a/src/mem/dram_ctrl.hh +++ b/src/mem/dram_ctrl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2019 ARM Limited + * Copyright (c) 2012-2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -446,6 +446,11 @@ class DRAMCtrl : public QoS::MemCtrl /** List to keep track of activate ticks */ std::deque actTicks; + /** + * Track when we issued the last read/write burst + */ + Tick lastBurstTick; + Rank(DRAMCtrl& _memory, const DRAMCtrlParams* 
_p, int rank); const std::string name() const @@ -862,6 +867,46 @@ class DRAMCtrl : public QoS::MemCtrl std::pair, bool> minBankPrep(const DRAMPacketQueue& queue, Tick min_col_at) const; + /** + * Remove commands that have already issued from burstTicks + */ + void pruneBurstTick(); + + /** + * Calculate burst window aligned tick + * + * @param cmd_tick Initial tick of command + * @return burst window aligned tick + */ + Tick getBurstWindow(Tick cmd_tick); + + /** + * Check for command bus contention for single cycle command. + * If there is contention, shift command to next burst. + * Check verifies that the commands issued per burst is less + * than a defined max number, maxCommandsPerBurst. + * Therefore, contention per cycle is not verified and instead + * is done based on a burst window. + * + * @param cmd_tick Initial tick of command, to be verified + * @return tick for command issue without contention + */ + Tick verifySingleCmd(Tick cmd_tick); + + /** + * Check for command bus contention for multi-cycle (2 currently) + * command. If there is contention, shift command(s) to next burst. + * Check verifies that the commands issued per burst is less + * than a defined max number, maxCommandsPerBurst. + * Therefore, contention per cycle is not verified and instead + * is done based on a burst window. + * + * @param cmd_tick Initial tick of command, to be verified + * @param max_multi_cmd_split Maximum delay between commands + * @return tick for command issue without contention + */ + Tick verifyMultiCmd(Tick cmd_tick, Tick max_multi_cmd_split = 0); + /** * Keep track of when row activations happen, in order to enforce * the maximum number of activations in the activation window. 
The @@ -883,11 +928,13 @@ class DRAMCtrl : public QoS::MemCtrl * * @param rank_ref The rank to precharge * @param bank_ref The bank to precharge - * @param pre_at Time when the precharge takes place + * @param pre_tick Time when the precharge takes place + * @param auto_or_preall Is this an auto-precharge or precharge all command * @param trace Is this an auto precharge then do not add to trace */ void prechargeBank(Rank& rank_ref, Bank& bank_ref, - Tick pre_at, bool trace = true); + Tick pre_tick, bool auto_or_preall = false, + bool trace = true); /** * Used for debugging to observe the contents of the queues. @@ -928,6 +975,13 @@ class DRAMCtrl : public QoS::MemCtrl */ std::deque respQueue; + /** + * Holds count of commands issued in burst window starting at + * defined Tick. This is used to ensure that the command bandwidth + * does not exceed the allowable media constraints. + */ + std::unordered_multiset burstTicks; + /** * Vector of ranks */ @@ -969,6 +1023,7 @@ class DRAMCtrl : public QoS::MemCtrl const Tick tRTW; const Tick tCS; const Tick tBURST; + const Tick tBURST_MIN; const Tick tCCD_L_WR; const Tick tCCD_L; const Tick tRCD; @@ -981,13 +1036,23 @@ class DRAMCtrl : public QoS::MemCtrl const Tick tREFI; const Tick tRRD; const Tick tRRD_L; + const Tick tPPD; + const Tick tAAD; const Tick tXAW; const Tick tXP; const Tick tXS; + const Tick clkResyncDelay; + unsigned int maxCommandsPerBurst; + const bool dataClockSync; + const uint8_t twoCycleActivate; const uint32_t activationLimit; const Tick rankToRankDly; const Tick wrToRdDly; const Tick rdToWrDly; + const Tick wrToRdDlySameBG; + const Tick rdToWrDlySameBG; + const bool burstInterleave; + const Tick burstDataCycles; /** * Memory controller configuration initialized based on parameter diff --git a/src/mem/drampower.cc b/src/mem/drampower.cc index 05107919c..f5069282b 100644 --- a/src/mem/drampower.cc +++ b/src/mem/drampower.cc @@ -150,10 +150,10 @@ DRAMPower::hasTwoVDD(const DRAMCtrlParams* p) uint8_t 
DRAMPower::getDataRate(const DRAMCtrlParams* p) { - uint32_t burst_cycles = divCeil(p->tBURST, p->tCK); + uint32_t burst_cycles = divCeil(p->tBURST_MAX, p->tCK); uint8_t data_rate = p->burst_length / burst_cycles; // 4 for GDDR5 - if (data_rate != 1 && data_rate != 2 && data_rate != 4) - fatal("Got unexpected data rate %d, should be 1 or 2 or 4\n"); + if (data_rate != 1 && data_rate != 2 && data_rate != 4 && data_rate != 8) + fatal("Got unexpected data rate %d, should be 1 or 2 or 4 or 8\n", data_rate); return data_rate; } -- 2.30.2