# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Erfan Azarkhish
+# Abdul Mutaal Ahmad
# A Simplified model of a complete HMC device. Based on:
# [1] http://www.hybridmemorycube.org/specification-download/
# (G. Kim et. al)
# [5] Near Data Processing, Are we there yet? (M. Gokhale)
# http://www.cs.utah.edu/wondp/gokhale.pdf
+# [6] openHMC - A Configurable Open-Source Hybrid Memory Cube Controller
+# (J. Schmidt)
+# [7] Hybrid Memory Cube performance characterization on data-centric
+# workloads (M. Gokhale)
#
# This script builds a complete HMC device composed of vault controllers,
# serial links, the main internal crossbar, and an external hmc controller.
# This component is simply an instance of the NoncoherentXBar class, and its
# parameters are tuned to [2].
#
-# - SERIAL LINKS:
+# - SERIAL LINKS CONTROLLER:
# SerialLink is a simple variation of the Bridge class, with the ability to
-# account for the latency of packet serialization. We assume that the
-# serializer component at the transmitter side does not need to receive the
-# whole packet to start the serialization. But the deserializer waits for
-# the complete packet to check its integrity first.
+# account for the latency of packet serialization and controller latency. We
+# assume that the serializer component at the transmitter side does not need
+# to receive the whole packet to start the serialization. But the
+# deserializer waits for the complete packet to check its integrity first.
+#
# * Bandwidth of the serial links is not modeled in the SerialLink component
-# itself. Instead bandwidth/port of the HMCController has been adjusted to
-# reflect the bandwidth delivered by 1 serial link.
+# itself.
+#
+# * Latency of the serial link controller is composed of SerDes latency +
+# link controller latency
#
-# - HMC CONTROLLER:
-# Contains a large buffer (modeled with Bridge) to hide the access latency
-# of the memory cube. Plus it simply forwards the packets to the serial
-# links in a round-robin fashion to balance load among them.
# * It is inferred from the standard [1] and the literature [3] that serial
# links share the same address range and packets can travel over any of
# them so a load distribution mechanism is required among them.
+#
+# -----------------------------------------
+# | Host/HMC Controller |
+# | ---------------------- |
+# | | Link Aggregator | opt |
+# | ---------------------- |
+# | ---------------------- |
+# | | Serial Link + Ser | * 4 |
+# | ---------------------- |
+# |---------------------------------------
+# -----------------------------------------
+# | Device
+# | ---------------------- |
+# | | Xbar | * 4 |
+# | ---------------------- |
+# | ---------------------- |
+# | | Vault Controller | * 16 |
+# | ---------------------- |
+# | ---------------------- |
+# | | Memory | |
+# | ---------------------- |
+# |---------------------------------------|
+#
+# This version presents 3 different HMC architectures along with their
+# corresponding test scripts.
+#
+# same: It has 4 crossbars in HMC memory. All the crossbars are connected
+# to each other, providing complete memory range. This architecture also
+# covers the added latency for sending a request to a non-local vault (bridge
+# between crossbars). All 4 serial links can access the complete memory, so
+# each link can be connected to a separate processor.
+#
+# distributed: It has 4 crossbars inside the HMC. The crossbars are not
+# connected to each other; through each crossbar only its local vaults can be
+# accessed. To support this architecture we need a crossbar between the
+# serial links and the processor.
+#
+# mixed: This is a hybrid architecture. It has 4 crossbars inside the HMC.
+# 2 crossbars are connected only to their local vaults. From the other 2
+# crossbars, a request can be forwarded to any other vault.
import optparse
# FIFOs at the input and output of the inteconnect)
xbar_response_latency = Param.Cycles(2, "Response latency of the XBar")
- #*****************************SERIAL LINK PARAMETERS**********************
- # Number of serial links [1]
- num_serial_links = Param.Unsigned(4, "Number of serial links")
+ # Number of crossbars which connect the 16 vaults to the serial links [7]
+ number_mem_crossbar = Param.Unsigned(4, "Number of crossbar in HMC"
+ )
+
+ #*****************************SERIAL LINK PARAMETERS***********************
+ # Number of serial links controllers [1]
+ num_links_controllers = Param.Unsigned(4, "Number of serial links")
# Number of packets (not flits) to store at the request side of the serial
# link. This number should be adjusted to achive required bandwidth
- link_buffer_size_req = Param.Unsigned(16, "Number of packets to buffer "
+ link_buffer_size_req = Param.Unsigned(10, "Number of packets to buffer "
"at the request side of the serial link")
# Number of packets (not flits) to store at the response side of the serial
# link. This number should be adjusted to achive required bandwidth
- link_buffer_size_rsp = Param.Unsigned(16, "Number of packets to buffer "
+ link_buffer_size_rsp = Param.Unsigned(10, "Number of packets to buffer "
"at the response side of the serial link")
# Latency of the serial link composed by SER/DES latency (1.6ns [4]) plus
# the PCB trace latency (3ns Estimated based on [5])
link_latency = Param.Latency('4.6ns', "Latency of the serial links")
- # Header overhead of the serial links: Header size is 128bits in HMC [1],
- # and we have 16 lanes, so the overhead is 8 cycles
- link_overhead = Param.Cycles(8, "The number of cycles required to"
- " transmit the packet header over the serial link")
-
- # Clock frequency of the serial links [1]
+ # Clock frequency of each serial link (SerDes) [1]
link_frequency = Param.Frequency('10GHz', "Clock Frequency of the serial"
"links")
- # Number of parallel lanes in each serial link [1]
- num_lanes_per_link = Param.Unsigned(16, "Number of lanes per each link")
+ # Clock frequency of the serial link controller [6]
+ # clk_hmc [MHz] = num_lanes_per_link * lane_speed [Gbit/s] * 10^3 /
+ #                 data_path_width [bit]
+ # clk_hmc [MHz] = 16 * 10 * 10^3 / 256 = 625 MHz
+ link_controller_frequency = Param.Frequency('625MHz',
+ "Clock Frequency of the link controller")
- # Number of serial links [1]
- num_serial_links = Param.Unsigned(4, "Number of serial links")
+ # Latency of the serial link controller to process the packets[1][6]
+ # (ClockDomain = 625 Mhz )
+ # used here for calculations only
+ link_ctrl_latency = Param.Cycles(4, "The number of cycles required for the"
+ "controller to process the packet")
- #*****************************HMC CONTROLLER PARAMETERS*******************
- # Number of packets (not flits) to store at the HMC controller. This
- # number should be high enough to be able to hide the high latency of HMC
- ctrl_buffer_size_req = Param.Unsigned(256, "Number of packets to buffer "
- "at the HMC controller (request side)")
+ # total_ctrl_latency = link_ctrl_latency + link_latency
+ # total_ctrl_latency = 4(Cycles) * 1.6 ns + 4.6 ns
+ total_ctrl_latency = Param.Latency('11ns', "The latency experienced by"
+ "every packet regardless of size of packet")
- # Number of packets (not flits) to store at the response side of the HMC
- # controller.
- ctrl_buffer_size_rsp = Param.Unsigned(256, "Number of packets to buffer "
- "at the HMC controller (response side)")
+ # Number of parallel lanes in each serial link [1]
+ num_lanes_per_link = Param.Unsigned( 16, "Number of lanes per each link")
- # Latency of the HMC controller to process the packets
- # (ClockDomain = Host clock domain)
- ctrl_latency = Param.Cycles(4, "The number of cycles required for the "
- " controller to process the packet")
+ # Number of serial links [1]
+ num_serial_links = Param.Unsigned(4, "Number of serial links")
- # Wiring latency from the SoC crossbar to the HMC controller
- ctrl_static_latency = Param.Latency('500ps', "Static latency of the HMC"
- "controller")
+ # speed of each lane of serial link - SerDes serial interface 10 Gb/s
+ serial_link_speed = Param.UInt64(10, "Gbs/s speed of each lane of"
+ "serial link")
- #*****************************PERFORMANCE MONITORING**********************
+ #*****************************PERFORMANCE MONITORING************************
# The main monitor behind the HMC Controller
- enable_global_monitor = Param.Bool(True, "The main monitor behind the "
+ enable_global_monitor = Param.Bool(False, "The main monitor behind the "
"HMC Controller")
# The link performance monitors
- enable_link_monitor = Param.Bool(True, "The link monitors")
+ enable_link_monitor = Param.Bool(False, "The link monitors" )
+
+ # Link aggregator enable - put a crossbar between buffers & links
+ enable_link_aggr = Param.Bool(False, "The crossbar between port and "
+ "Link Controller")
+
+ enable_buff_div = Param.Bool(True, "Memory Range of Buffer is"
+ "divided between total range")
+
+ #*****************************HMC ARCHITECTURE ************************
+ # Memory chunk for 16 vaults - number of vaults / number of crossbars
+ mem_chunk = Param.Unsigned(4, "Chunk of memory range for each cross bar "
+ "in arch 0")
+
+ # size of req buffer within crossbar, used for modelling extra latency
+ # when the request goes to a non-local vault
+ xbar_buffer_size_req = Param.Unsigned(10, "Number of packets to buffer "
+ "at the request side of the crossbar")
+
+ # size of response buffer within crossbar, used for modelling extra latency
+ # when the response is received from a non-local vault
+ xbar_buffer_size_resp = Param.Unsigned(10, "Number of packets to buffer "
+ "at the response side of the crossbar")
+
+# Return the slave port that the traffic source of serial link i has to be
+# bound to. With the global monitor enabled this is the slave side of the
+# link's CommMonitor, whose master side is bound to the serial link here;
+# otherwise it is the slave port of the serial link controller itself.
+def _hmc_link_input(hmc_host, i):
+    if hmc_host.enable_global_monitor:
+        hmc_host.lmonitor[i].master = hmc_host.seriallink[i].slave
+        return hmc_host.lmonitor[i].slave
+    return hmc_host.seriallink[i].slave
+
+
+# Configure the host system with serial links.
+#
+# Creates system.hmc_host (an HMCSystem), instantiates one serial link
+# controller per serial link, optionally places a CommMonitor in front of
+# each of them, and binds them to the membus and/or the traffic generators
+# according to options.arch.
+#
+# options: parsed command-line options. Reads options.ser_ranges (indexed by
+#          serial link) and options.arch ("distributed", "mixed" or "same").
+#          options.enable_global_monitor and options.enable_link_monitor
+#          override the HMCSystem defaults when the parser defines them.
+# system:  system under construction; must provide membus and/or tgen as
+#          required by the selected architecture.
+#
+# Returns the system that was passed in.
+def config_host_hmc(options, system):
+
+    system.hmc_host = HMCSystem()
+    host = system.hmc_host
+
+    # The monitor switches are optional on the command line: keep the
+    # HMCSystem defaults when the option parser does not define them, but do
+    # not hide any other error.
+    if hasattr(options, "enable_global_monitor"):
+        host.enable_global_monitor = options.enable_global_monitor
+    if hasattr(options, "enable_link_monitor"):
+        host.enable_link_monitor = options.enable_link_monitor
+
+    # Serial link Controller with 16 SerDes links at 10 Gbps
+    # with serial link ranges w.r.t to architecture
+    host.seriallink = [SerialLink(ranges=options.ser_ranges[i],
+                                  req_size=host.link_buffer_size_req,
+                                  resp_size=host.link_buffer_size_rsp,
+                                  num_lanes=host.num_lanes_per_link,
+                                  link_speed=host.serial_link_speed,
+                                  delay=host.total_ctrl_latency)
+                       for i in xrange(host.num_serial_links)]
+
+    # One monitor per serial link, created only when the global monitor is
+    # enabled
+    if host.enable_global_monitor:
+        host.lmonitor = [CommMonitor()
+                         for i in xrange(host.num_serial_links)]
+
+    # Set the clock frequency for the serial links
+    for i in xrange(host.num_serial_links):
+        host.seriallink[i].clk_domain = SrcClockDomain(
+            clock=host.link_controller_frequency,
+            voltage_domain=VoltageDomain(voltage='1V'))
+
+    # Connect membus/traffic gen to Serial Link Controller for different HMC
+    # architectures. The loops run over num_serial_links, the same count that
+    # sized the seriallink and lmonitor lists above, so every index is valid
+    # and no link is left unbound.
+    if options.arch == "distributed":
+        # Every serial link is fed from the membus
+        for i in xrange(host.num_serial_links):
+            system.membus.master = _hmc_link_input(host, i)
+    elif options.arch == "mixed":
+        # Links 0 and 1 are fed from the membus, links 2 and 3 each from
+        # their own traffic generator
+        system.membus.master = _hmc_link_input(host, 0)
+        system.membus.master = _hmc_link_input(host, 1)
+        system.tgen[2].port = _hmc_link_input(host, 2)
+        system.tgen[3].port = _hmc_link_input(host, 3)
+    elif options.arch == "same":
+        # Every serial link is fed from its own traffic generator
+        for i in xrange(host.num_serial_links):
+            system.tgen[i].port = _hmc_link_input(host, i)
+
+    return system
# Create an HMC device and attach it to the current system
+#
+# Creates system.hmc_dev, its optional link monitors and its crossbars, binds
+# the serial links of system.hmc_host to the crossbars and, for the "same"
+# and "mixed" architectures, bridges the crossbars to each other.
+#
+# options:  parsed command-line options; options.arch is compared against
+#           "same" and "mixed" below, the monitor switches are optional.
+# system:   system that already owns system.hmc_host (with its seriallink
+#           list) and system.mem_ranges.
+# hmc_host: NOTE(review): never referenced in the visible body, which reads
+#           system.hmc_host instead - confirm whether callers rely on it.
-def config_hmc(options, system):
+def config_hmc(options, system, hmc_host):
- system.hmc = HMCSystem()
+ # Create HMC device
+ system.hmc_dev = HMCSystem()
- system.buffer = Bridge(ranges=system.mem_ranges,
- req_size=system.hmc.ctrl_buffer_size_req,
- resp_size=system.hmc.ctrl_buffer_size_rsp,
- delay=system.hmc.ctrl_static_latency)
+ # Monitor switches: copied from the command-line options when possible.
+ # Any exception raised by the assignment is silently ignored, which leaves
+ # the HMCSystem default in place.
try:
- system.hmc.enable_global_monitor = options.enable_global_monitor
+ system.hmc_dev.enable_global_monitor = options.enable_global_monitor
except:
pass;
try:
- system.hmc.enable_link_monitor = options.enable_link_monitor
+ system.hmc_dev.enable_link_monitor = options.enable_link_monitor
except:
pass;
- system.membus.master = system.buffer.slave
-
- # The HMC controller (Clock domain is the same as the host)
- system.hmccontroller = HMCController(width=(system.hmc.num_lanes_per_link.
- value * system.hmc.num_serial_links/8),
- frontend_latency=system.hmc.ctrl_latency,
- forward_latency=system.hmc.link_overhead,
- response_latency=system.hmc.link_overhead)
-
- system.hmccontroller.clk_domain = SrcClockDomain(clock=system.hmc.
- link_frequency, voltage_domain = VoltageDomain(voltage = '1V'))
-
- # Serial Links
- system.hmc.seriallink =[ SerialLink(ranges = system.mem_ranges,
- req_size=system.hmc.link_buffer_size_req,
- resp_size=system.hmc.link_buffer_size_rsp,
- num_lanes=system.hmc.num_lanes_per_link,
- delay=system.hmc.link_latency)
- for i in xrange(system.hmc.num_serial_links)]
-
- if system.hmc.enable_link_monitor:
- system.hmc.lmonitor = [ CommMonitor()
- for i in xrange(system.hmc.num_serial_links)]
-
- # The HMC Crossbar located in its logic-base (LoB)
- system.hmc.xbar = NoncoherentXBar(width = system.hmc.xbar_width,
- frontend_latency=system.hmc.xbar_frontend_latency,
- forward_latency=system.hmc.xbar_forward_latency,
- response_latency=system.hmc.xbar_response_latency )
- system.hmc.xbar.clk_domain = SrcClockDomain(clock =
- system.hmc.xbar_frequency, voltage_domain =
- VoltageDomain(voltage = '1V'))
-
- if system.hmc.enable_global_monitor:
- system.gmonitor = CommMonitor()
- system.buffer.master = system.gmonitor.slave
- system.gmonitor.master = system.hmccontroller.slave
- else:
- system.hmccontroller.slave = system.buffer.master
-
- for i in xrange(system.hmc.num_serial_links):
- system.hmccontroller.master = system.hmc.seriallink[i].slave
- system.hmc.seriallink[i].clk_domain = system.hmccontroller.clk_domain;
- if system.hmc.enable_link_monitor:
- system.hmc.seriallink[i].master = system.hmc.lmonitor[i].slave
- system.hmc.lmonitor[i].master = system.hmc.xbar.slave
+
+ # Link monitors on the device side, one per link controller.
+ # NOTE(review): this list is sized by num_links_controllers but indexed
+ # further down by a loop over num_serial_links - the two counts have to
+ # match; confirm.
+ if system.hmc_dev.enable_link_monitor:
+ system.hmc_dev.lmonitor = [ CommMonitor()
+ for i in xrange(system.hmc_dev.num_links_controllers)]
+
+ # 4 HMC Crossbars located in its logic-base (LoB)
+ # NOTE(review): the number of crossbars created here is read from
+ # system.hmc_host, while the clock-domain loop right after it uses
+ # system.hmc_dev.number_mem_crossbar - the two only agree while both
+ # objects keep the same value; confirm which one is intended.
+ system.hmc_dev.xbar = [ NoncoherentXBar(width=system.hmc_dev.xbar_width,
+ frontend_latency=system.hmc_dev.xbar_frontend_latency,
+ forward_latency=system.hmc_dev.xbar_forward_latency,
+ response_latency=system.hmc_dev.xbar_response_latency )
+ for i in xrange(system.hmc_host.number_mem_crossbar)]
+
+ # Give every crossbar its own clock domain running at xbar_frequency
+ for i in xrange(system.hmc_dev.number_mem_crossbar):
+ system.hmc_dev.xbar[i].clk_domain = SrcClockDomain(
+ clock=system.hmc_dev.xbar_frequency,voltage_domain=
+ VoltageDomain(voltage='1V'))
+
+ # Attach serial link i of the host to crossbar i, going through link
+ # monitor i when the link monitors are enabled
+ for i in xrange(system.hmc_dev.num_serial_links):
+ if system.hmc_dev.enable_link_monitor:
+ system.hmc_host.seriallink[i].master = \
+ system.hmc_dev.lmonitor[i].slave
+ system.hmc_dev.lmonitor[i].master = system.hmc_dev.xbar[i].slave
else:
- system.hmc.seriallink[i].master = system.hmc.xbar.slave
+ system.hmc_host.seriallink[i].master = system.hmc_dev.xbar[i].slave
+
+ # Connect the xbars with each other so that a request arriving at the
+ # wrong xbar is forwarded to the correct one. A Bridge is used for every
+ # xbar-to-xbar connection.
+ if options.arch == "same":
+ numx = len(system.hmc_dev.xbar)
+
+ # create a list of buffers
+ # NOTE(review): the loop below consumes one buffer per ordered pair of
+ # distinct xbars, i.e. numx * (numx - 1), whereas this list holds
+ # numx * (mem_chunk - 1) entries - the counts only match when mem_chunk
+ # equals numx; confirm.
+ system.hmc_dev.buffers = [ Bridge(
+ req_size=system.hmc_dev.xbar_buffer_size_req,
+ resp_size=system.hmc_dev.xbar_buffer_size_resp)
+ for i in xrange(numx * (system.hmc_dev.mem_chunk - 1))]
+
+ # Buffer iterator: hands out the next unused index into the buffers list
+ it = iter(range(len(system.hmc_dev.buffers)))
+
+ # necessary to attach system_port to one of the xbars
+ system.system_port = system.hmc_dev.xbar[3].slave
+
+ # iterate over all the crossbars and connect them as required
+ for i in range(numx):
+ for j in range(numx):
+ # connect xbar to all other xbars except itself
+ if i != j:
+ # get the next index of buffer
+ index = it.next()
+
+ # Change the default values for ranges of bridge: the bridge
+ # towards xbar j forwards the mem_chunk ranges starting at
+ # j * mem_chunk
+ system.hmc_dev.buffers[index].ranges = system.mem_ranges[
+ j * int(system.hmc_dev.mem_chunk):
+ (j + 1) * int(system.hmc_dev.mem_chunk)]
+
+ # Connect the bridge between crossbars (xbar i -> bridge -> xbar j)
+ system.hmc_dev.xbar[i].master = system.hmc_dev.buffers[
+ index].slave
+ system.hmc_dev.buffers[
+ index].master = system.hmc_dev.xbar[j].slave
+ else:
+ # Don't connect the xbar to itself
+ pass
+
+ # Two crossbars (2 and 3) are bridged to all other crossbars; the other
+ # two (0 and 1) get no outgoing bridge and can only direct traffic to
+ # their local vaults
+ if options.arch == "mixed":
+
+ # xbar 3 -> xbar 0, forwarding mem_ranges[0:4]
+ system.hmc_dev.buffer30 = Bridge(ranges=system.mem_ranges[0:4])
+ system.hmc_dev.xbar[3].master = system.hmc_dev.buffer30.slave
+ system.hmc_dev.buffer30.master = system.hmc_dev.xbar[0].slave
+
+ # xbar 3 -> xbar 1, forwarding mem_ranges[4:8]
+ system.hmc_dev.buffer31 = Bridge(ranges=system.mem_ranges[4:8])
+ system.hmc_dev.xbar[3].master = system.hmc_dev.buffer31.slave
+ system.hmc_dev.buffer31.master = system.hmc_dev.xbar[1].slave
+
+ # xbar 3 -> xbar 2, forwarding mem_ranges[8:12]
+ system.hmc_dev.buffer32 = Bridge(ranges=system.mem_ranges[8:12])
+ system.hmc_dev.xbar[3].master = system.hmc_dev.buffer32.slave
+ system.hmc_dev.buffer32.master = system.hmc_dev.xbar[2].slave
+
+
+ # xbar 2 -> xbar 0, forwarding mem_ranges[0:4]
+ system.hmc_dev.buffer20 = Bridge(ranges=system.mem_ranges[0:4])
+ system.hmc_dev.xbar[2].master = system.hmc_dev.buffer20.slave
+ system.hmc_dev.buffer20.master = system.hmc_dev.xbar[0].slave
+
+ # xbar 2 -> xbar 1, forwarding mem_ranges[4:8]
+ system.hmc_dev.buffer21 = Bridge(ranges=system.mem_ranges[4:8])
+ system.hmc_dev.xbar[2].master = system.hmc_dev.buffer21.slave
+ system.hmc_dev.buffer21.master = system.hmc_dev.xbar[1].slave
+
+ # xbar 2 -> xbar 3, forwarding mem_ranges[12:16]
+ system.hmc_dev.buffer23 = Bridge(ranges=system.mem_ranges[12:16])
+ system.hmc_dev.xbar[2].master = system.hmc_dev.buffer23.slave
+ system.hmc_dev.buffer23.master = system.hmc_dev.xbar[3].slave
+