1 # Copyright (c) 2012-2013 ARM Limited
4 # The license below extends only to copyright in the software and shall
5 # not be construed as granting a license to any other intellectual
6 # property including but not limited to intellectual property relating
7 # to a hardware implementation of the functionality of the software
8 # licensed hereunder. You may use the software subject to the license
9 # terms below provided that you ensure that this notice is replicated
10 # unmodified and in its entirety in all distributions of the software,
11 # modified or unmodified, in source code or in binary form.
13 # Copyright (c) 2015 The University of Bologna
14 # All rights reserved.
16 # Redistribution and use in source and binary forms, with or without
17 # modification, are permitted provided that the following conditions are
18 # met: redistributions of source code must retain the above copyright
19 # notice, this list of conditions and the following disclaimer;
20 # redistributions in binary form must reproduce the above copyright
21 # notice, this list of conditions and the following disclaimer in the
22 # documentation and/or other materials provided with the distribution;
23 # neither the name of the copyright holders nor the names of its
24 # contributors may be used to endorse or promote products derived from
25 # this software without specific prior written permission.
27 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39 # Authors: Erfan Azarkhish
42 # A Simplified model of a complete HMC device. Based on:
43 # [1] http://www.hybridmemorycube.org/specification-download/
44 # [2] High performance AXI-4.0 based interconnect for extensible smart memory
45 # cubes(E. Azarkhish et. al)
46 # [3] Low-Power Hybrid Memory Cubes With Link Power Management and Two-Level
47 # Prefetching (J. Ahn et. al)
48 # [4] Memory-centric system interconnect design with Hybrid Memory Cubes
50 # [5] Near Data Processing, Are we there yet? (M. Gokhale)
51 # http://www.cs.utah.edu/wondp/gokhale.pdf
52 # [6] openHMC - A Configurable Open-Source Hybrid Memory Cube Controller
54 # [7] Hybrid Memory Cube performance characterization on data-centric
55 # workloads (M. Gokhale)
57 # This script builds a complete HMC device composed of vault controllers,
58 # serial links, the main internal crossbar, and an external hmc controller.
60 # - VAULT CONTROLLERS:
61 # Instances of the HMC_2500_1x32 class with their functionality specified in
65 # This component is simply an instance of the NoncoherentXBar class, and its
66 # parameters are tuned to [2].
68 # - SERIAL LINKS CONTROLLER:
69 # SerialLink is a simple variation of the Bridge class, with the ability to
70 # account for the latency of packet serialization and controller latency. We
71 # assume that the serializer component at the transmitter side does not need
72 # to receive the whole packet to start the serialization. But the
73 # deserializer waits for the complete packet to check its integrity first.
75 # * Bandwidth of the serial links is not modeled in the SerialLink component
78 # * Latency of serial link controller is composed of SerDes latency + link
81 # * It is inferred from the standard [1] and the literature [3] that serial
82 # links share the same address range and packets can travel over any of
83 # them so a load distribution mechanism is required among them.
85 # -----------------------------------------
86 # | Host/HMC Controller |
87 # | ---------------------- |
88 # | | Link Aggregator | opt |
89 # | ---------------------- |
90 # | ---------------------- |
91 # | | Serial Link + Ser | * 4 |
92 # | ---------------------- |
93 # |---------------------------------------
94 # -----------------------------------------
96 # | ---------------------- |
98 # | ---------------------- |
99 # | ---------------------- |
100 # | | Vault Controller | * 16 |
101 # | ---------------------- |
102 # | ---------------------- |
104 # | ---------------------- |
105 # |---------------------------------------|
# In this version we present 3 different HMC architectures along with
# their corresponding test scripts.
# same: It has 4 crossbars in HMC memory. All the crossbars are connected
# to each other, providing the complete memory range. This architecture also
# covers the added latency for sending a request to a non-local vault
# (bridge in between crossbars). All the 4 serial links can access the
# complete memory, so each link can be connected to a separate processor.
116 # distributed: It has 4 crossbars inside the HMC. Crossbars are not
117 # connected.Through each crossbar only local vaults can be accessed. But to
118 # support this architecture we need a crossbar between serial links and
121 # mixed: This is a hybrid architecture. It has 4 crossbars inside the HMC.
122 # 2 Crossbars are connected to only local vaults. From other 2 crossbar, a
123 # request can be forwarded to any other vault.
125 from __future__
import print_function
126 from __future__
import absolute_import
131 from m5
.objects
import *
132 from m5
.util
import *
def add_options(parser):
    """Register the HMC command-line options on ``parser``.

    Args:
        parser: an argparse-style parser; only ``add_argument`` is used.

    Returns:
        None. The parser is modified in place.
    """
    # NOTE(review): in the reviewed copy of this file the tail of several
    # help strings was cut off (--link-controller-frequency,
    # --serial-link-speed, --serial-link-addr-range, --enable-buff-div,
    # --mem-chunk, --hmc-dev-vault-size, --burst-length). Their endings
    # below are restored on a best-effort basis; confirm the wording.
    # Help texts are built with implicit string concatenation so that no
    # source indentation leaks into the message.

    # *****************************CROSSBAR PARAMETERS*************************
    # Flit size of the main interconnect [1]
    parser.add_argument("--xbar-width", default=32, type=int,
                        help="Data width of the main XBar (Bytes)")

    # Clock frequency of the main interconnect [1]
    # This crossbar is placed on the logic base of the HMC and it has its
    # own voltage and clock domains, different from the DRAM dies.
    parser.add_argument("--xbar-frequency", default='1GHz', type=str,
                        help="Clock Frequency of the main XBar")

    # Arbitration latency of the HMC XBar [1]
    parser.add_argument("--xbar-frontend-latency", default=1, type=int,
                        help="Arbitration latency of the XBar")

    # Latency to forward a packet via the interconnect [1] (two levels of
    # FIFOs at the input and output of the interconnect)
    parser.add_argument("--xbar-forward-latency", default=2, type=int,
                        help="Forward latency of the XBar")

    # Latency to forward a response via the interconnect [1] (two levels of
    # FIFOs at the input and output of the interconnect)
    parser.add_argument("--xbar-response-latency", default=2, type=int,
                        help="Response latency of the XBar")

    # Number of crossbars which connect the 16 vaults to the serial links [7]
    parser.add_argument("--number-mem-crossbar", default=4, type=int,
                        help="Number of crossbar in HMC")

    # *****************************SERIAL LINK PARAMETERS**********************
    # Number of serial links controllers [1]
    parser.add_argument("--num-links-controllers", default=4, type=int,
                        help="Number of serial links")

    # Number of packets (not flits) to store at the request side of the
    # serial link. This number should be adjusted to achieve the required
    # bandwidth
    parser.add_argument("--link-buffer-size-req", default=10, type=int,
                        help="Number of packets to buffer at the "
                             "request side of the serial link")

    # Number of packets (not flits) to store at the response side of the
    # serial link. This number should be adjusted to achieve the required
    # bandwidth
    parser.add_argument("--link-buffer-size-rsp", default=10, type=int,
                        help="Number of packets to buffer at the "
                             "response side of the serial link")

    # Latency of the serial link composed by SER/DES latency (1.6ns [4]) plus
    # the PCB trace latency (3ns Estimated based on [5])
    parser.add_argument("--link-latency", default='4.6ns', type=str,
                        help="Latency of the serial links")

    # Clock frequency of each serial link (SerDes) [1]
    parser.add_argument("--link-frequency", default='10GHz', type=str,
                        help="Clock Frequency of the serial links")

    # Clock frequency of serial link Controller [6]
    # clk_hmc[Mhz]= num_lanes_per_link * lane_speed [Gbits/s] /
    # data_path_width * 10^6
    # clk_hmc[Mhz]= 16 * 10 Gbps / 256 * 10^6 = 625 Mhz
    parser.add_argument("--link-controller-frequency", default='625MHz',
                        type=str,
                        help="Clock Frequency of the link controller")

    # Latency of the serial link controller to process the packets [1][6]
    # (ClockDomain = 625 Mhz)
    # used here for calculations only
    parser.add_argument("--link-ctrl-latency", default=4, type=int,
                        help="The number of cycles required for the "
                             "controller to process the packet")

    # total_ctrl_latency = link_ctrl_latency + link_latency
    # total_ctrl_latency = 4(Cycles) * 1.6 ns + 4.6 ns
    parser.add_argument("--total-ctrl-latency", default='11ns', type=str,
                        help="The latency experienced by every packet "
                             "regardless of size of packet")

    # Number of parallel lanes in each serial link [1]
    parser.add_argument("--num-lanes-per-link", default=16, type=int,
                        help="Number of lanes per each link")

    # Number of serial links [1]
    parser.add_argument("--num-serial-links", default=4, type=int,
                        help="Number of serial links")

    # Speed of each lane of serial link - SerDes serial interface 10 Gb/s
    parser.add_argument("--serial-link-speed", default=10, type=int,
                        help="Gbs/s speed of each lane of serial link")

    # Address range for each of the serial links
    parser.add_argument("--serial-link-addr-range", default='1GB', type=str,
                        help="memory range for each of the serial links. "
                             "Default: 1GB")

    # *****************************PERFORMANCE MONITORING*********************
    # The main monitor behind the HMC Controller
    parser.add_argument("--enable-global-monitor", action="store_true",
                        help="The main monitor behind the HMC Controller")

    # The link performance monitors
    parser.add_argument("--enable-link-monitor", action="store_true",
                        help="The link monitors")

    # Link aggregator enable - put a crossbar between buffers & links
    parser.add_argument("--enable-link-aggr", action="store_true",
                        help="The crossbar between port and Link Controller")

    parser.add_argument("--enable-buff-div", action="store_true",
                        help="Memory Range of Buffer is divided between "
                             "total range")

    # *****************************HMC ARCHITECTURE **************************
    # Memory chunk for 16 vaults - number of vaults / number of crossbars
    parser.add_argument("--mem-chunk", default=4, type=int,
                        help="Chunk of memory range for each cross bar in "
                             "arch 0")

    # Size of the request buffer within the crossbar, used for modelling the
    # extra latency when the request goes to a non-local vault
    parser.add_argument("--xbar-buffer-size-req", default=10, type=int,
                        help="Number of packets to buffer at the "
                             "request side of the crossbar")

    # Size of the response buffer within the crossbar, used for modelling the
    # extra latency when the response is received from a non-local vault
    parser.add_argument("--xbar-buffer-size-resp", default=10, type=int,
                        help="Number of packets to buffer at the "
                             "response side of the crossbar")

    # HMC device architecture. It affects the HMC host controller as well
    parser.add_argument("--arch", type=str,
                        choices=["same", "distributed", "mixed"],
                        default="distributed",
                        help="same: HMC with 4 links, all with same range. "
                             "distributed: HMC with 4 links with "
                             "distributed range. "
                             "mixed: mixed with same and distributed range. "
                             "Default: distributed")

    # HMC device - number of vaults
    parser.add_argument("--hmc-dev-num-vaults", default=16, type=int,
                        help="number of independent vaults within the HMC "
                             "device. Note: each vault has a memory "
                             "controller (vault controller) Default: 16")

    # HMC device - vault capacity or size
    parser.add_argument("--hmc-dev-vault-size", default='256MB', type=str,
                        help="vault storage capacity in bytes. "
                             "Default: 256MB")

    parser.add_argument("--mem-type", type=str, choices=["HMC_2500_1x32"],
                        default="HMC_2500_1x32",
                        help="type of HMC memory to use. "
                             "Default: HMC_2500_1x32")
    parser.add_argument("--mem-channels", default=1, type=int,
                        help="Number of memory channels")
    parser.add_argument("--mem-ranks", default=1, type=int,
                        help="Number of ranks to iterate across")
    parser.add_argument("--burst-length", default=256, type=int,
                        help="burst length in bytes. Note: the cache line "
                             "size will be set to this value. Default: 256")
293 # configure HMC host controller
def config_hmc_host_ctrl(opt, system):
    """Build the HMC host controller and attach it to ``system``.

    Creates ``system.hmc_host`` (a SubSystem) holding one SerialLink per
    ``opt.num_serial_links``, optionally one CommMonitor per link
    (``opt.enable_global_monitor``), and -- for the "distributed" and
    "mixed" architectures -- a ``system.membus`` crossbar in front of the
    links.

    Args:
        opt: parsed options carrying the attributes registered by
            add_options().
        system: the system object being configured; modified in place.

    Returns:
        The same ``system`` object.
    """
    # create HMC host controller
    system.hmc_host = SubSystem()
    hh = system.hmc_host

    # Create additional crossbar for the distributed and mixed architectures
    if opt.arch in ("distributed", "mixed"):
        # NOTE(review): the assignment of the membus clock is not visible in
        # the reviewed copy of this file; 100GHz is assumed -- confirm.
        clk = '100GHz'
        vd = VoltageDomain(voltage='1V')
        system.membus = NoncoherentXBar(width=8)
        system.membus.badaddr_responder = BadAddr()
        system.membus.default = Self.badaddr_responder.pio
        system.membus.frontend_latency = 3
        system.membus.forward_latency = 4
        system.membus.response_latency = 2
        system.membus.clk_domain = SrcClockDomain(clock=clk,
                                                  voltage_domain=vd)

    # create memory ranges for the serial links
    slar = convert.toMemorySize(opt.serial_link_addr_range)
    if opt.arch == "same":
        # Every link covers the complete range (4 link-sized chunks).
        ser_ranges = [AddrRange(0, (4 * slar) - 1)
                      for _ in range(opt.num_serial_links)]
    elif opt.arch == "distributed":
        # Link i covers the i-th link-sized chunk only.
        ser_ranges = [AddrRange(i * slar, ((i + 1) * slar) - 1)
                      for i in range(opt.num_serial_links)]
    elif opt.arch == "mixed":
        # Links 0 and 1 cover one chunk each, links 2 and 3 the complete
        # range. Only four entries exist, so num_serial_links must not
        # exceed 4 for this architecture.
        ser_ranges = [AddrRange(0, (1 * slar) - 1),
                      AddrRange(1 * slar, 2 * slar - 1),
                      AddrRange(0, (4 * slar) - 1),
                      AddrRange(0, (4 * slar) - 1)]

    # Serial link controllers, with the ranges chosen above
    hh.seriallink = [SerialLink(ranges=ser_ranges[i],
                                req_size=opt.link_buffer_size_req,
                                resp_size=opt.link_buffer_size_rsp,
                                num_lanes=opt.num_lanes_per_link,
                                link_speed=opt.serial_link_speed,
                                delay=opt.total_ctrl_latency)
                     for i in range(opt.num_serial_links)]

    # enable global monitor
    if opt.enable_global_monitor:
        hh.lmonitor = [CommMonitor() for _ in range(opt.num_serial_links)]

    # set the clock frequency for each serial link (one clock and voltage
    # domain per link)
    for i in range(opt.num_serial_links):
        vd = VoltageDomain(voltage='1V')
        hh.seriallink[i].clk_domain = SrcClockDomain(
            clock=opt.link_controller_frequency, voltage_domain=vd)

    # Connect membus/traffic gen to the serial link controllers for the
    # different HMC architectures.
    # NOTE(review): the loops below run over num_links_controllers while
    # seriallink/lmonitor hold num_serial_links entries; the two options
    # must agree or indexing fails / links stay unconnected.
    if opt.arch in ("distributed", "mixed"):
        mb = system.membus
        if opt.arch == "distributed":
            bus_links = range(opt.num_links_controllers)
        else:
            # mixed: only the first two links sit behind the membus
            bus_links = range(2)
        for i in bus_links:
            if opt.enable_global_monitor:
                mb.master = hh.lmonitor[i].slave
                hh.lmonitor[i].master = hh.seriallink[i].slave
            else:
                mb.master = hh.seriallink[i].slave
    elif opt.arch == "same":
        # No membus here: only the monitor -> link side is bound; the
        # upstream side of each monitor/link is not connected in this
        # function.
        for i in range(opt.num_links_controllers):
            if opt.enable_global_monitor:
                hh.lmonitor[i].master = hh.seriallink[i].slave

    return system
388 # Create an HMC device
def config_hmc_dev(opt, system, hmc_host):
    """Build the HMC device and attach it to ``system``.

    Creates ``system.hmc_dev`` (a SubSystem) with the internal crossbars,
    optional per-link CommMonitors, and the bridges between crossbars that
    the selected architecture requires. Also sets ``system.mem_ranges`` to
    one address range per vault and binds each host serial link
    (``system.hmc_host.seriallink[i]``) to crossbar ``i``.

    Args:
        opt: parsed options carrying the attributes registered by
            add_options().
        system: the system object being configured; modified in place.
            ``system.hmc_host`` must already exist.
        hmc_host: not used by this function; kept for interface
            compatibility.
    """
    system.hmc_dev = SubSystem()
    dev = system.hmc_dev

    # create memory ranges for the vault controllers
    arv = convert.toMemorySize(opt.hmc_dev_vault_size)
    system.mem_ranges = [AddrRange(i * arv, (i + 1) * arv - 1)
                         for i in range(opt.hmc_dev_num_vaults)]

    if opt.enable_link_monitor:
        dev.lmonitor = [CommMonitor()
                        for _ in range(opt.num_links_controllers)]

    # HMC crossbars located in its logic-base (LoB)
    dev.xbar = [NoncoherentXBar(width=opt.xbar_width,
                                frontend_latency=opt.xbar_frontend_latency,
                                forward_latency=opt.xbar_forward_latency,
                                response_latency=opt.xbar_response_latency)
                for _ in range(opt.number_mem_crossbar)]

    # one clock and voltage domain per crossbar
    for i in range(opt.number_mem_crossbar):
        vd = VoltageDomain(voltage='1V')
        dev.xbar[i].clk_domain = SrcClockDomain(clock=opt.xbar_frequency,
                                                voltage_domain=vd)

    # Attach serial link i to crossbar i, through the link monitor if enabled
    for i in range(opt.num_serial_links):
        link = system.hmc_host.seriallink[i]
        if opt.enable_link_monitor:
            link.master = dev.lmonitor[i].slave
            dev.lmonitor[i].master = dev.xbar[i].slave
        else:
            link.master = dev.xbar[i].slave

    # Connect the crossbars with each other so that a request arriving at
    # the wrong crossbar is forwarded to the correct one. A Bridge is used
    # for every (source, destination) pair.
    if opt.arch == "same":
        numx = len(dev.xbar)
        chunk = int(opt.mem_chunk)

        # Every ordered pair of distinct crossbars; a crossbar is never
        # connected to itself.
        pairs = [(src, dst) for src in range(numx) for dst in range(numx)
                 if src != dst]

        # One bridge per pair, so that no bridge is left unbound and none
        # is missing (numx * (numx - 1) in total).
        dev.buffers = [Bridge(req_size=opt.xbar_buffer_size_req,
                              resp_size=opt.xbar_buffer_size_resp)
                       for _ in pairs]

        # necessary to add system_port to one of the crossbars; index 3
        # requires at least 4 crossbars
        system.system_port = dev.xbar[3].slave

        for index, (src, dst) in enumerate(pairs):
            # The bridge forwards the vault ranges local to the destination
            dev.buffers[index].ranges = system.mem_ranges[
                dst * chunk:(dst + 1) * chunk]

            # Connect the bridge between the two crossbars
            dev.xbar[src].master = dev.buffers[index].slave
            dev.buffers[index].master = dev.xbar[dst].slave

    # Two crossbars (2 and 3) are connected to all other crossbars; the
    # other two can only direct traffic to their local vaults
    if opt.arch == "mixed":
        # (source crossbar, destination crossbar), in binding order
        mixed_links = ((3, 0), (3, 1), (3, 2), (2, 0), (2, 1), (2, 3))
        for src, dst in mixed_links:
            name = 'buffer%d%d' % (src, dst)
            # Each bridge forwards the 4 vault ranges of its destination
            setattr(dev, name,
                    Bridge(ranges=system.mem_ranges[4 * dst:4 * (dst + 1)]))
            bridge = getattr(dev, name)
            dev.xbar[src].master = bridge.slave
            bridge.master = dev.xbar[dst].slave