Fix gearing
[gram.git] / gram / phy / ecp5ddrphy.py
1 # This file is Copyright (c) 2019 David Shah <dave@ds0.me>
2 # This file is Copyright (c) 2019-2020 Florent Kermarrec <florent@enjoy-digital.fr>
3 # This file is Copyright (c) 2020 LambdaConcept <contact@lambdaconcept.com>
4 # License: BSD
5
6 # 1:2 frequency-ratio DDR3 PHY for Lattice's ECP5
7 # DDR3: 800 MT/s
8
9 import math
10
11 from nmigen import *
12 from nmigen.lib.cdc import FFSynchronizer
13 from nmigen.utils import log2_int
14
15 from lambdasoc.periph import Peripheral
16
17 import gram.stream as stream
18 from gram.common import *
19 from gram.phy.dfi import Interface
20 from gram.compat import Timeline
21
22 # Lattice ECP5 DDR PHY Initialization --------------------------------------------------------------
23
24
class ECP5DDRPHYInit(Elaboratable):
    """Start-up sequencer for the ECP5 DDR PHY delay/clock primitives.

    Instantiates a ``DDRDLLA`` and, whenever a rising edge is seen on its
    synchronised lock output, triggers a ``Timeline`` that walks through the
    freeze / stop / reset / pause / update steps listed in ``elaborate``.

    Attributes
    ----------
    pause : Signal
        Raised by the sequence around the DDRDLLA update ("Pause DQSBUFM").
    stop : Signal
        Raised by the sequence to stop the ECLK domain.
    reset : Signal
        Raised by the sequence, between stop and stop release, to reset the
        ECLK domain.
    delay : Signal
        Combinatorial copy of the ``DDRDEL`` output of the DDRDLLA.
    """

    def __init__(self):
        self.pause = Signal()
        self.stop = Signal()
        self.delay = Signal()
        self.reset = Signal()

    def elaborate(self, platform):
        """Build the DDRDLLA instance, its lock edge detector and the sequence."""
        m = Module()

        # NOTE: every Signal is bound to its own local on its own line because
        # nMigen derives the signal name from the assignment target.
        new_lock = Signal()
        update = Signal()
        freeze = Signal()

        # DDRDLLA instance -------------------------------------------------------------------------
        _lock = Signal()
        delay = Signal()
        m.submodules += Instance(
            "DDRDLLA",
            i_CLK=ClockSignal("sync2x"),
            i_RST=ResetSignal("init"),
            i_UDDCNTLN=~update,  # active-low port, hence the inversion
            i_FREEZE=freeze,
            o_DDRDEL=delay,
            o_LOCK=_lock,
        )

        # Bring LOCK into the "init" domain and detect its rising edge.
        lock = Signal()
        lock_d = Signal()
        m.submodules += FFSynchronizer(_lock, lock, o_domain="init")
        m.d.init += lock_d.eq(lock)
        # NOTE(review): lock/lock_d are registered in "init" but the edge
        # detector below is registered in "sync" -- confirm this crossing is
        # intended.
        m.d.sync += new_lock.eq(lock & ~lock_d)

        # DDRDLLA/DQSBUFM/ECLK initialization sequence ---------------------------------------------
        step = 8  # in cycles
        # One (signal, level) action per step; step n fires at n*step
        # (presumably counted in cycles after the trigger -- see Timeline).
        actions = [
            (freeze, 1),      # Freeze DDRDLLA
            (self.stop, 1),   # Stop ECLK domain
            (self.reset, 1),  # Reset ECLK domain
            (self.reset, 0),  # Release ECLK domain reset
            (self.stop, 0),   # Release ECLK domain stop
            (freeze, 0),      # Release DDRDLLA freeze
            (self.pause, 1),  # Pause DQSBUFM
            (update, 1),      # Update DDRDLLA
            (update, 0),      # Release DDRDLLA update
            (self.pause, 0),  # Release DQSBUFM pause
        ]
        sequence = Timeline([
            (index * step, [target.eq(level)])
            for index, (target, level) in enumerate(actions, start=1)
        ])
        m.submodules += sequence
        # Wait DDRDLLA Lock
        m.d.comb += sequence.trigger.eq(new_lock)

        m.d.comb += self.delay.eq(delay)

        return m
77
78 # Lattice ECP5 DDR PHY -----------------------------------------------------------------------------
79
80
class ECP5DDRPHY(Peripheral, Elaboratable):
    """1:2 frequency-ratio DDR3 PHY for the Lattice ECP5, exposed as a CSR peripheral.

    Parameters
    ----------
    pads
        DRAM pad bundle. The code reads ``clk``, ``a``, ``ba``, ``ras_n``,
        ``cas_n``, ``we_n``, ``clk_en``, ``odt`` and ``dm`` through ``.o``,
        ``dq`` and ``dqs`` through ``.io``, and ``reset_n`` / ``cs_n`` when
        present. ``len(pads.dq.io)`` must be a multiple of 8.
    sys_clk_freq : float
        System clock frequency (presumably in Hz); only used to derive the
        DDR3 CL/CWL settings.

    Attributes
    ----------
    dfi : Interface
        DFI interface towards the controller. Only phases 0 and 1 are
        consumed by this PHY.
    settings : PhySettings
        PHY parameters (phases, CL/CWL, read/write latency).
    bus, irq
        CSR bridge bus and interrupt line of the peripheral.
    datavalid : Signal
        Per-byte-lane ``DATAVALID`` outputs of the DQSBUFM primitives.
        Created in ``elaborate``, so it only exists after elaboration.

    CSRs (allocated in this order): ``_dly_sel``, ``_rdly_dq_rst``,
    ``_rdly_dq_inc``, ``_rdly_dq_bitslip_rst``, ``_rdly_dq_bitslip``,
    ``_burstdet_clr``, ``_burstdet_seen``.
    """

    # Number of DFI phases serialised per system clock cycle.
    _NPHASES = 2

    def __init__(self, pads, sys_clk_freq=100e6):
        super().__init__(name="phy")

        self.pads = pads
        self._sys_clk_freq = sys_clk_freq

        # Geometry derived from the pads.
        databits = len(self.pads.dq.io)
        assert databits % 8 == 0
        addressbits = len(self.pads.a.o)
        bankbits = len(self.pads.ba.o)
        nranks = 1 if not hasattr(self.pads, "cs_n") else len(self.pads.cs_n.o)
        nphases = self._NPHASES

        # CSR (allocation order defines the register map -- do not reorder)
        bank = self.csr_bank()

        self._dly_sel = bank.csr(databits//8, "rw")

        self._rdly_dq_rst = bank.csr(1, "rw")
        self._rdly_dq_inc = bank.csr(1, "rw")
        self._rdly_dq_bitslip_rst = bank.csr(1, "rw")
        self._rdly_dq_bitslip = bank.csr(1, "rw")

        self._burstdet_clr = bank.csr(1, "rw")
        self._burstdet_seen = bank.csr(databits//8, "r")

        # NOTE(review): this event is declared but never driven in this class.
        self._zero_ev = self.event(mode="rise")

        self._bridge = self.bridge(data_width=32, granularity=8, alignment=2)
        self.bus = self._bridge.bus
        self.irq = self._bridge.irq

        self.dfi = Interface(addressbits, bankbits, nranks, 4*databits, 4)

        # PHY settings -----------------------------------------------------------------------------
        cl, cwl, cl_sys_latency, cwl_sys_latency = self._latencies()
        rdcmdphase, rdphase = get_sys_phases(nphases, cl_sys_latency, cl)
        wrcmdphase, wrphase = get_sys_phases(nphases, cwl_sys_latency, cwl)
        self.settings = PhySettings(
            phytype="ECP5DDRPHY",
            memtype="DDR3",
            databits=databits,
            dfi_databits=4*databits,
            nranks=nranks,
            nphases=nphases,
            rdphase=rdphase,
            wrphase=wrphase,
            rdcmdphase=rdcmdphase,
            wrcmdphase=wrcmdphase,
            cl=cl,
            cwl=cwl,
            read_latency=2 + cl_sys_latency + 2 + log2_int(4//nphases) + 4,
            write_latency=cwl_sys_latency
        )

    def _latencies(self):
        """Return ``(cl, cwl, cl_sys_latency, cwl_sys_latency)`` for the configured clock."""
        tck = 2/(2*2*self._sys_clk_freq)
        cl, cwl = get_cl_cw("DDR3", tck)
        cl_sys_latency = get_sys_latency(self._NPHASES, cl)
        cwl_sys_latency = get_sys_latency(self._NPHASES, cwl)
        return cl, cwl, cl_sys_latency, cwl_sys_latency

    @staticmethod
    def _cmd_oddrx2f(phase0, phase1, pad):
        """Return an ODDRX2F wiring ``phase0`` to D0/D1 and ``phase1`` to D2/D3, output on ``pad``."""
        return Instance("ODDRX2F",
            i_RST=ResetSignal("dramsync"),
            i_ECLK=ClockSignal("sync2x"),
            i_SCLK=ClockSignal(),
            i_D0=phase0,
            i_D1=phase0,
            i_D2=phase1,
            i_D3=phase1,
            o_Q=pad
        )

    def elaborate(self, platform):
        """Build the PHY: clock/command serialisers, per-byte DQS logic and per-bit DQ logic."""
        m = Module()

        m.submodules += self._bridge

        databits = len(self.pads.dq.io)
        addressbits = len(self.pads.a.o)
        bankbits = len(self.pads.ba.o)

        # Init -------------------------------------------------------------------------------------
        m.submodules.init = init = ECP5DDRPHYInit()

        # Parameters -------------------------------------------------------------------------------
        _, _, cl_sys_latency, cwl_sys_latency = self._latencies()

        # Observation
        self.datavalid = Signal(databits//8)

        # DFI Interface ----------------------------------------------------------------------------
        dfi = self.dfi

        bl8_chunk = Signal()
        rddata_en = Signal(self.settings.read_latency)

        # Clock ------------------------------------------------------------------------------------
        for i in range(len(self.pads.clk.o)):
            m.submodules += Instance("ODDRX2F",
                i_RST=ResetSignal("dramsync"),
                i_ECLK=ClockSignal("sync2x"),
                i_SCLK=ClockSignal(),
                i_D0=0,
                i_D1=1,
                i_D2=0,
                i_D3=1,
                o_Q=self.pads.clk.o[i]
            )

        # Addresses and Commands -------------------------------------------------------------------
        for i in range(addressbits):
            m.submodules += self._cmd_oddrx2f(
                dfi.phases[0].address[i],
                dfi.phases[1].address[i],
                self.pads.a.o[i])
        for i in range(bankbits):
            m.submodules += self._cmd_oddrx2f(
                dfi.phases[0].bank[i],
                dfi.phases[1].bank[i],
                self.pads.ba.o[i])
        controls = ["ras_n", "cas_n", "we_n", "clk_en", "odt"]
        if hasattr(self.pads, "reset_n"):
            controls.append("reset_n")
        if hasattr(self.pads, "cs_n"):
            controls.append("cs_n")
        for name in controls:
            # Size the loop on the same ``.o`` signal that is indexed below,
            # as done for every other pad in this PHY.
            pad_o = getattr(self.pads, name).o
            for i in range(len(pad_o)):
                m.submodules += self._cmd_oddrx2f(
                    getattr(dfi.phases[0], name)[i],
                    getattr(dfi.phases[1], name)[i],
                    pad_o[i])

        # DQ ---------------------------------------------------------------------------------------
        dq_oe = Signal()
        dqs_oe = Signal()
        dqs_pattern = DQSPattern()
        m.submodules += dqs_pattern
        for i in range(databits//8):
            # DQSBUFM
            dqs_i = Signal()
            dqsr90 = Signal()
            dqsw270 = Signal()
            dqsw = Signal()
            rdpntr = Signal(3)
            wrpntr = Signal(3)

            # Read delay counter, software-controlled for the selected lane.
            # Only bits 0..2 are wired to READCLKSEL below.
            rdly = Signal(7)
            with m.If(self._dly_sel.w_data[i]):
                with m.If(self._rdly_dq_rst.w_stb):
                    m.d.sync += rdly.eq(0)
                with m.Elif(self._rdly_dq_inc.w_stb):
                    m.d.sync += rdly.eq(rdly + 1)

            burstdet = Signal()
            dqs_read = Signal()

            # Bitslip counter, software-controlled for the selected lane.
            dqs_bitslip = Signal(2)
            with m.If(self._dly_sel.w_data[i]):
                with m.If(self._rdly_dq_bitslip_rst.w_stb):
                    m.d.sync += dqs_bitslip.eq(0)
                with m.Elif(self._rdly_dq_bitslip.w_stb):
                    m.d.sync += dqs_bitslip.eq(dqs_bitslip + 1)
            # The bitslip value picks which 2-bit window of the read-enable
            # shift register (offsets -2..+1 around cl_sys_latency) gates
            # the DQSBUFM READ inputs.
            with m.Switch(dqs_bitslip):
                for j, b in enumerate(range(-2, 2)):
                    with m.Case(j):
                        m.d.sync += dqs_read.eq(
                            rddata_en[cl_sys_latency + b:cl_sys_latency + b + 2] != 0)

            m.submodules += Instance("DQSBUFM",
                p_DQS_LI_DEL_ADJ="MINUS",
                p_DQS_LI_DEL_VAL=1,
                p_DQS_LO_DEL_ADJ="MINUS",
                p_DQS_LO_DEL_VAL=4,

                # Delay
                i_DYNDELAY0=0,
                i_DYNDELAY1=0,
                i_DYNDELAY2=0,
                i_DYNDELAY3=0,
                i_DYNDELAY4=0,
                i_DYNDELAY5=0,
                i_DYNDELAY6=0,
                i_DYNDELAY7=0,

                # Clocks / Reset
                i_SCLK=ClockSignal("sync"),
                i_ECLK=ClockSignal("sync2x"),
                i_RST=ResetSignal("dramsync"),
                i_DDRDEL=init.delay,
                # Paused by the init sequence and whenever the lane is selected.
                i_PAUSE=init.pause | self._dly_sel.w_data[i],

                # Control
                # Assert LOADNs to use DDRDEL control
                i_RDLOADN=0,
                i_RDMOVE=0,
                i_RDDIRECTION=1,
                i_WRLOADN=0,
                i_WRMOVE=0,
                i_WRDIRECTION=1,

                # Reads (generate shifted DQS clock for reads)
                i_READ0=dqs_read,
                i_READ1=dqs_read,
                i_READCLKSEL0=rdly[0],
                i_READCLKSEL1=rdly[1],
                i_READCLKSEL2=rdly[2],
                i_DQSI=dqs_i,
                o_DQSR90=dqsr90,
                o_RDPNTR0=rdpntr[0],
                o_RDPNTR1=rdpntr[1],
                o_RDPNTR2=rdpntr[2],
                o_WRPNTR0=wrpntr[0],
                o_WRPNTR1=wrpntr[1],
                o_WRPNTR2=wrpntr[2],
                o_DATAVALID=self.datavalid[i],
                o_BURSTDET=burstdet,

                # Writes (generate shifted ECLK clock for writes)
                o_DQSW270=dqsw270,
                o_DQSW=dqsw
            )

            # Sticky burst-detect flag: set on a rising edge of BURSTDET,
            # cleared by the clear CSR. The set is written last, so it wins
            # when both happen in the same cycle.
            burstdet_d = Signal()
            m.d.sync += burstdet_d.eq(burstdet)
            with m.If(self._burstdet_clr.w_stb):
                m.d.sync += self._burstdet_seen.r_data[i].eq(0)
            with m.If(burstdet & ~burstdet_d):
                m.d.sync += self._burstdet_seen.r_data[i].eq(1)

            # DQS and DM ---------------------------------------------------------------------------
            dm_o_data = Signal(8)
            dm_o_data_d = Signal(8)
            dm_o_data_muxed = Signal(4)
            # Bits 0..3 come from DFI phase 0, bits 4..7 from DFI phase 1.
            m.d.comb += dm_o_data.eq(Cat(*[
                dfi.phases[phase].wrdata_mask[beat*databits//8+i]
                for phase in range(2)
                for beat in range(4)
            ]))
            m.d.sync += dm_o_data_d.eq(dm_o_data)
            # First chunk: low nibble of the current word; second chunk:
            # high nibble of the word registered on the previous cycle.
            with m.Switch(bl8_chunk):
                with m.Case(0):
                    m.d.sync += dm_o_data_muxed.eq(dm_o_data[:4])
                with m.Case(1):
                    m.d.sync += dm_o_data_muxed.eq(dm_o_data_d[4:])
            m.submodules += Instance("ODDRX2DQA",
                i_RST=ResetSignal("dramsync"),
                i_ECLK=ClockSignal("sync2x"),
                i_SCLK=ClockSignal("sync"),
                i_DQSW270=dqsw270,
                i_D0=dm_o_data_muxed[0],
                i_D1=dm_o_data_muxed[1],
                i_D2=dm_o_data_muxed[2],
                i_D3=dm_o_data_muxed[3],
                o_Q=self.pads.dm.o[i]
            )

            dqs = Signal()
            dqs_oe_n = Signal()
            m.submodules += [
                Instance("ODDRX2DQSB",
                    i_RST=ResetSignal("dramsync"),
                    i_ECLK=ClockSignal("sync2x"),
                    i_SCLK=ClockSignal(),
                    i_DQSW=dqsw,
                    i_D0=0,  # FIXME: dqs_pattern.o[3],
                    i_D1=1,  # FIXME: dqs_pattern.o[2],
                    i_D2=0,  # FIXME: dqs_pattern.o[1],
                    i_D3=1,  # FIXME: dqs_pattern.o[0],
                    o_Q=dqs
                ),
                Instance("TSHX2DQSA",
                    i_RST=ResetSignal("dramsync"),
                    i_ECLK=ClockSignal("sync2x"),
                    i_SCLK=ClockSignal(),
                    i_DQSW=dqsw,
                    i_T0=~(dqs_pattern.preamble | dqs_oe |
                           dqs_pattern.postamble),
                    i_T1=~(dqs_pattern.preamble | dqs_oe |
                           dqs_pattern.postamble),
                    o_Q=dqs_oe_n
                ),
                Instance("BB",
                    i_I=dqs,
                    i_T=dqs_oe_n,
                    o_O=dqs_i,
                    io_B=self.pads.dqs.io[i]
                )
            ]

            for j in range(8*i, 8*(i+1)):
                dq_o = Signal()
                dq_i = Signal()
                dq_oe_n = Signal()
                dq_i_delayed = Signal()
                dq_i_data = Signal(8)
                dq_o_data = Signal(8)
                dq_o_data_d = Signal(8)
                dq_o_data_muxed = Signal(4)
                # Bits 0..3 come from DFI phase 0, bits 4..7 from DFI phase 1.
                m.d.comb += dq_o_data.eq(Cat(*[
                    dfi.phases[phase].wrdata[beat*databits+j]
                    for phase in range(2)
                    for beat in range(4)
                ]))
                m.d.sync += dq_o_data_d.eq(dq_o_data)
                # FIXME: use m.d.comb?
                with m.Switch(bl8_chunk):
                    with m.Case(0):
                        m.d.sync += dq_o_data_muxed.eq(dq_o_data[:4])
                    with m.Case(1):
                        m.d.sync += dq_o_data_muxed.eq(dq_o_data_d[4:])
                _dq_i_data = Signal(4)
                m.submodules += [
                    Instance("ODDRX2DQA",
                        i_RST=ResetSignal("dramsync"),
                        i_ECLK=ClockSignal("sync2x"),
                        i_SCLK=ClockSignal(),
                        i_DQSW270=dqsw270,
                        i_D0=dq_o_data_muxed[0],
                        i_D1=dq_o_data_muxed[1],
                        i_D2=dq_o_data_muxed[2],
                        i_D3=dq_o_data_muxed[3],
                        o_Q=dq_o
                    ),
                    Instance("DELAYF",
                        p_DEL_MODE="DQS_ALIGNED_X2",
                        i_LOADN=1,
                        i_MOVE=0,
                        i_DIRECTION=0,
                        i_A=dq_i,
                        o_Z=dq_i_delayed
                    ),
                    Instance("IDDRX2DQA",
                        i_RST=ResetSignal("dramsync"),
                        i_ECLK=ClockSignal("sync2x"),
                        i_SCLK=ClockSignal(),
                        i_DQSR90=dqsr90,
                        i_RDPNTR0=rdpntr[0],
                        i_RDPNTR1=rdpntr[1],
                        i_RDPNTR2=rdpntr[2],
                        i_WRPNTR0=wrpntr[0],
                        i_WRPNTR1=wrpntr[1],
                        i_WRPNTR2=wrpntr[2],
                        i_D=dq_i_delayed,
                        o_Q0=_dq_i_data[0],
                        o_Q1=_dq_i_data[1],
                        o_Q2=_dq_i_data[2],
                        o_Q3=_dq_i_data[3],
                    )
                ]
                # Two-stage nibble shift: the newest IDDR nibble enters the
                # high half while the previous one moves to the low half.
                m.d.sync += dq_i_data[:4].eq(dq_i_data[4:])
                m.d.sync += dq_i_data[4:].eq(_dq_i_data)
                # Low nibble feeds DFI phase 0, high nibble feeds DFI phase 1.
                m.d.comb += [
                    dfi.phases[phase].rddata[beat*databits+j].eq(
                        dq_i_data[4*phase + beat])
                    for phase in range(2)
                    for beat in range(4)
                ]
                m.submodules += [
                    Instance("TSHX2DQA",
                        i_RST=ResetSignal("dramsync"),
                        i_ECLK=ClockSignal("sync2x"),
                        i_SCLK=ClockSignal(),
                        i_DQSW270=dqsw270,
                        i_T0=~(dqs_pattern.preamble | dq_oe |
                               dqs_pattern.postamble),
                        i_T1=~(dqs_pattern.preamble | dq_oe |
                               dqs_pattern.postamble),
                        o_Q=dq_oe_n,
                    ),
                    Instance("BB",
                        i_I=dq_o,
                        i_T=dq_oe_n,
                        o_O=dq_i,
                        io_B=self.pads.dq.io[j]
                    )
                ]

        # Read Control Path ------------------------------------------------------------------------
        # Creates a shift register of read commands coming from the DFI interface. This shift register
        # is used to control DQS read (internal read pulse of the DQSBUF) and to indicate to the
        # DFI interface that the read data is valid.
        #
        # The DQS read must be asserted for 2 sys_clk cycles before the read data is coming back from
        # the DRAM (see 6.2.4 READ Pulse Positioning Optimization of FPGA-TN-02035-1.2)
        #
        # The read data valid is asserted for 1 sys_clk cycle when the data is available on the DFI
        # interface, the latency is the sum of the ODDRX2DQA, CAS, IDDRX2DQA latencies.
        rddata_en_last = Signal.like(rddata_en)
        # The Cat is one bit wider than rddata_en; the assignment drops the
        # oldest bit, which is what makes this a shift register.
        m.d.comb += rddata_en.eq(
            Cat(dfi.phases[self.settings.rdphase].rddata_en, rddata_en_last))
        m.d.sync += rddata_en_last.eq(rddata_en)
        m.d.sync += [phase.rddata_valid.eq(rddata_en[-1])
                     for phase in dfi.phases]

        # Write Control Path -----------------------------------------------------------------------
        # Creates a shift register of write commands coming from the DFI interface. This shift register
        # is used to control DQ/DQS tristates and to select write data of the DRAM burst from the DFI
        # interface: The PHY is operating in halfrate mode (so provide 4 datas every sys_clk cycles:
        # 2x for DDR, 2x for halfrate) but DDR3 requires a burst of 8 datas (BL8) for best efficiency.
        # Writes are then performed in 2 sys_clk cycles and data needs to be selected for each cycle.
        # FIXME: understand +2
        wrdata_en = Signal(cwl_sys_latency + 5)
        wrdata_en_last = Signal.like(wrdata_en)
        m.d.comb += wrdata_en.eq(
            Cat(dfi.phases[self.settings.wrphase].wrdata_en, wrdata_en_last))
        m.d.sync += wrdata_en_last.eq(wrdata_en)
        m.d.comb += dq_oe.eq(wrdata_en[cwl_sys_latency + 2]
                             | wrdata_en[cwl_sys_latency + 3])
        m.d.comb += bl8_chunk.eq(wrdata_en[cwl_sys_latency + 1])
        m.d.comb += dqs_oe.eq(dq_oe)

        # Write DQS Postamble/Preamble Control Path ------------------------------------------------
        # Generates DQS Preamble 1 cycle before the first write and Postamble 1 cycle after the last
        # write. During writes, DQS tristate is configured as output for at least 4 sys_clk cycles:
        # 1 for Preamble, 2 for the Write and 1 for the Postamble.
        m.d.comb += dqs_pattern.preamble.eq(
            wrdata_en[cwl_sys_latency + 1] & ~wrdata_en[cwl_sys_latency + 2])
        m.d.comb += dqs_pattern.postamble.eq(
            wrdata_en[cwl_sys_latency + 4] & ~wrdata_en[cwl_sys_latency + 3])

        return m