begin connecting up signals for LDSTCompUnit
[soc.git] / src / soc / experiment / compldst_multi.py
1 """ LOAD / STORE Computation Unit.
2
3 This module covers POWER9-compliant Load and Store operations,
4 with selection on each between immediate and indexed mode as
5 options for the calculation of the Effective Address (EA),
6 and also "update" mode which optionally stores that EA into
7 an additional register.
8
9 Stores are activated when Go_Store is enabled, and use the ALU to
10 compute the "Effective Address", and, when ready (go_st_i and the
11 ALU ready) the operand (src3_i) is stored in the computed address.
12
13 Loads are activated when Go_Write[0] is enabled. They also use the ALU
14 to compute the EA, and the data comes out (at any time from the
15 PortInterface), and is captured by the LDSTCompUnit.
16
17 Both LD and ST may request that the address be computed from summing
18 operand1 (src[0]) with operand2 (src[1]) *or* by summing operand1 with
19 the immediate (from the opcode).
20
21 Both LD and ST may also request "update" mode (op_is_update) which
22 activates the use of Go_Write[1] to control storage of the EA into
23 a *second* operand in the register file.
24
25 Thus this module has *TWO* write-requests to the register file and
26 *THREE* read-requests to the register file.
27
28 It's a multi-level Finite State Machine that (unfortunately) nmigen.FSM
29 is not suited to (nmigen.FSM is clock-driven, and some aspects of
30 the FSM below are *combinatorial*).
31
32 * One FSM covers Operand collection and communication address-side
33 with the LD/ST PortInterface. its role ends when "RD_DONE" is asserted
34
35 * A second FSM activates to cover LD. it activates if op_is_ld is true
36
37 * A third FSM activates to cover ST. it activates if op_is_st is true
38
39 * The "overall" (fourth) FSM coordinates the progression and completion
40 of the three other FSMs, firing "WR_RESET" which switches off "busy"
41
42 Full diagram:
43 https://libre-soc.org/3d_gpu/ld_st_comp_unit.jpg
44
45 Links including to walk-through videos:
46 * https://libre-soc.org/3d_gpu/architecture/6600scoreboard/
47 """
48
from nmigen.compat.sim import run_simulation
from nmigen.cli import verilog, rtlil
from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl
from nmigen.hdl.rec import Record, Layout

from nmutil.latch import SRLatch, latchregister

from soc.experiment.compalu_multi import go_record
from soc.experiment.l0_cache import PortInterface
from soc.experiment.testmem import TestMemory
from soc.decoder.power_enums import InternalOp

from soc.experiment.alu_hier import CompALUOpSubset

from soc.decoder.power_enums import InternalOp, Function
64
65
class CompLDSTOpSubset(Record):
    """CompLDSTOpSubset

    Record holding the subset of Decode2Execute1Type fields that a
    LD/ST operation needs.  Populate it from a full decode record with
    eq_from_execute1().
    """

    def __init__(self, name=None):
        # nested record for the immediate operand and its "valid" flag
        imm_layout = Layout((("imm", 64), ("imm_ok", 1)))
        fields = (
            ('insn_type', InternalOp),
            ('imm_data', imm_layout),
            ('is_32bit', 1),
            ('is_signed', 1),
            ('data_len', 4),  # TODO: should be in separate CompLDSTSubset
            ('byte_reverse', 1),
            ('sign_extend', 1),
            ('update', 1),
        )

        Record.__init__(self, Layout(fields), name=name)

        # grrr. Record does not have kwargs: patch reset_less afterwards.
        # ports() lists exactly the signals that need it (every field
        # except the nested imm_data record).
        for sig in self.ports():
            sig.reset_less = True

    def eq_from_execute1(self, other):
        """Return a list of assignments copying every field of this
        record from the same-named field of `other` (a
        Decode2Execute1Type-style record).
        """
        return [sig.eq(other.fields[fname])
                for fname, sig in self.fields.items()]

    def ports(self):
        """Return the flat (non-nested) signals of this record as a list."""
        return [
            self.insn_type,
            self.is_32bit,
            self.is_signed,
            self.data_len,
            self.byte_reverse,
            self.sign_extend,
            self.update,
        ]
112
113
class LDSTCompUnit(Elaboratable):
    """ LOAD / STORE Computation Unit

    Inputs
    ------

    * :rwid:   register width
    * :alu:    an ALU module
    * :mem:    a Memory Module (read-write capable)
    * :debugtest: when True, elaborate() returns before the memory
                  ports are wired up
    * :src_i:  Source Operands (RA/RB/RC) - managed by rd[0-2] go/req

    Control Signals (In)
    --------------------

    * :oper_i:     operation being carried out (POWER9 decode LD/ST subset)
    * :issue_i:    LD/ST is being "issued".
    * :isalu_i:    issue as a plain ALU operation.
    * :shadown_i:  Inverted-shadow is being held (stops STORE *and* WRITE)
    * :go_rd_i:    read is being actioned (latches in src regs)
    * :go_wr_i:    write mode (exactly like ALU CompUnit)
    * :go_ad_i:    address is being actioned (triggers actual mem LD)
    * :go_st_i:    store is being actioned (triggers actual mem STORE)
    * :go_die_i:   resets the unit back to "wait for issue"

    Control Signals (Out)
    ---------------------

    * :busy_o:      function unit is busy
    * :rd_rel_o:    request src1/src2
    * :adr_rel_o:   request address (from mem)
    * :sto_rel_o:   request store (to mem)
    * :req_rel_o:   request write (result)
    * :done_o:      final release signal
    * :load_mem_o:  activate memory LOAD
    * :stwd_mem_o:  activate memory STORE

    Note: load_mem_o, stwd_mem_o and req_rel_o MUST all be acknowledged
    in a single cycle and the CompUnit set back to doing another op.
    This means deasserting go_st_i, go_ad_i or go_wr_i as appropriate
    depending on whether the operation is a STORE, LD, or a straight
    ALU operation respectively.

    Control Data (out)
    ------------------
    * :data_o:     Dest out (LD)          - managed by wr[0] go/req
    * :addr_o:     Address out (LD or ST) - managed by wr[1] go/req
    """

    def __init__(self, rwid, alu, mem, debugtest=False):
        self.rwid = rwid
        self.alu = alu
        self.mem = mem
        self.debugtest = debugtest

        # POWER-compliant LD/ST has index and update: *fixed* number of ports
        self.n_src = n_src = 3  # RA, RB, RT/RS
        self.n_dst = n_dst = 2  # RA, RT/RS

        self.counter = Signal(4)
        src = []
        for i in range(n_src):
            j = i + 1  # name numbering to match src1/src2
            src.append(Signal(rwid, name="src%d_i" % j, reset_less=True))

        dst = []
        for i in range(n_dst):
            j = i + 1  # name numbering to match dest1/2...
            dst.append(Signal(rwid, name="dest%d_i" % j, reset_less=True))

        self.rd = go_record(n_src, name="rd")  # read in, req out
        self.wr = go_record(n_dst, name="wr")  # write in, req out
        self.go_rd_i = self.rd.go  # temporary naming
        self.go_wr_i = self.wr.go  # temporary naming
        self.rd_rel_o = self.rd.rel  # temporary naming
        self.req_rel_o = self.wr.rel  # temporary naming

        self.ad = go_record(1, name="ad")  # address go in, req out
        self.st = go_record(1, name="st")  # store go in, req out
        self.go_ad_i = self.ad.go  # temp naming: go address in
        self.go_st_i = self.st.go  # temp naming: go store in
        self.issue_i = Signal(reset_less=True)  # fn issue in
        self.isalu_i = Signal(reset_less=True)  # fn issue as ALU in
        self.shadown_i = Signal(reset=1)  # shadow function, defaults to ON
        self.go_die_i = Signal()  # go die (reset)

        # operation / data input
        # NOTE(review): elaborate() reads oper_r.update; confirm that
        # CompALUOpSubset provides that field, otherwise this (and oper_r)
        # presumably wants to be CompLDSTOpSubset.
        self.oper_i = CompALUOpSubset()  # operand
        self.src_i = Array(src)
        self.src1_i = src[0]  # oper1 in: RA
        self.src2_i = src[1]  # oper2 in: RB
        self.src3_i = src[2]  # oper3 in: RC (RS)

        # outputs
        self.busy_o = Signal(reset_less=True)  # fn busy out
        self.dest = Array(dst)
        self.data_o = dst[0]  # Dest1 out: RT

        self.adr_rel_o = self.ad.rel  # request address (from mem)
        self.sto_rel_o = self.st.rel  # request store (to mem)
        self.done_o = Signal(reset_less=True)  # final release signal
        self.addr_o = dst[1]  # Address out (LD or ST) - Update => RA

        # hmm... TODO... move these to outside of LDSTCompUnit?
        self.load_mem_o = Signal(reset_less=True)  # activate memory LOAD
        self.stwd_mem_o = Signal(reset_less=True)  # activate memory STORE
        self.ld_o = Signal(reset_less=True)  # operation is a LD
        self.st_o = Signal(reset_less=True)  # operation is a ST

    def elaborate(self, platform):
        """Build the module: latches, operand registers, ALU hookup and
        (unless debugtest is set) the connections to self.mem.

        This is work in progress: several signals declared below are not
        yet driven or consumed.
        """
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        m.submodules.alu = self.alu
        # self.mem is NOT added as a submodule here (see TestLDSTCompUnit)
        #m.submodules.mem = self.mem
        # second positional argument of SRLatch is presumably the latch
        # width -- one bit per src / dest port (TODO confirm)
        m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
        m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
        m.submodules.alu_l = alu_l = SRLatch(sync=False, name="alu")
        m.submodules.adr_l = adr_l = SRLatch(sync=False, name="adr")
        m.submodules.lod_l = lod_l = SRLatch(sync=False, name="lod")
        m.submodules.sto_l = sto_l = SRLatch(sync=False, name="sto")
        m.submodules.wri_l = wri_l = SRLatch(False, self.n_dst, name="req")
        # own name: must not rebind sto_l, which is used further down
        m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")

        # shadow/go_die
        reset_b = Signal(reset_less=True)  # reset opcode
        reset_w = Signal(self.n_dst, reset_less=True)  # reset write
        reset_a = Signal(reset_less=True)  # reset adr latch
        reset_r = Signal(self.n_src, reset_less=True)  # reset src
        reset_s = Signal(reset_less=True)  # reset store
        # NOTE(review): wr_reset is not driven anywhere in this method yet
        wr_reset = Signal(reset_less=True)  # final reset condition
        comb += reset_b.eq(wr_reset | self.go_die_i)
        # go_die_i is 1 bit: replicate it so it resets *every* latch bit
        comb += reset_w.eq(self.wr.go | Repl(self.go_die_i, self.n_dst))
        comb += reset_s.eq(self.go_st_i | self.go_die_i)
        comb += reset_r.eq(self.rd.go | Repl(self.go_die_i, self.n_src))
        comb += reset_a.eq(self.go_ad_i | self.go_die_i)

        # opcode decode
        op_alu = Signal(reset_less=True)
        op_is_ld = Signal(reset_less=True)
        op_is_st = Signal(reset_less=True)
        op_ldst = Signal(reset_less=True)  # op is either a LD or a ST

        # ALU/LD data output control
        alu_valid = Signal(reset_less=True)  # ALU operands are valid
        alu_ok = Signal(reset_less=True)  # ALU out ok (1 clock delay valid)
        alulatch = Signal(reset_less=True)
        ldlatch = Signal(reset_less=True)
        wr_any = Signal(reset_less=True)  # any write (incl. store)
        rd_done = Signal(reset_less=True)  # all *necessary* operands read

        # src2 register
        src2_r = Signal(self.rwid, reset_less=True)

        # latch-enable selecting between opcode latch and src latch
        src_sel = Signal(reset_less=True)

        # issue can be either issue_i or issue_alu_i (isalu_i)
        issue_i = Signal(reset_less=True)
        comb += issue_i.eq(self.issue_i | self.isalu_i)

        # Ripple-down the latches, each one set cancels the previous.
        # NOTE: use sync to stop combinatorial loops.

        # opcode latch - inverted so that busy resets to 0
        sync += opc_l.s.eq(issue_i)  # XXX NOTE: INVERTED FROM book!
        sync += opc_l.r.eq(reset_b)  # XXX NOTE: INVERTED FROM book!

        # src operand latch
        sync += src_l.s.eq(Repl(issue_i, self.n_src))
        sync += src_l.r.eq(reset_r)

        # addr latch
        sync += adr_l.s.eq(self.rd.go)
        sync += adr_l.r.eq(reset_a)

        # dest operand latch
        sync += wri_l.s.eq(self.go_ad_i | self.go_st_i | self.wr.go)
        sync += wri_l.r.eq(reset_w)

        # store latch
        sync += sto_l.s.eq(self.rd.go)  # XXX not sure which
        sync += sto_l.r.eq(reset_s)

        # create a latch/register for the operand
        oper_r = CompALUOpSubset()  # Dest register
        latchregister(m, self.oper_i, oper_r, self.issue_i, name="oper_r")

        # and for each input from the incoming src operands
        srl = []
        for i in range(self.n_src):
            name = "src_r%d" % i
            src_r = Signal(self.rwid, name=name, reset_less=True)
            latchregister(m, self.src_i[i], src_r, src_l.q[i], name)
            srl.append(src_r)

        # and for each output from the ALU
        # NOTE(review): this uses alu.out[i] while the EA latch below uses
        # alu.o -- confirm which attribute the ALU actually provides.
        drl = []
        for i in range(self.n_dst):
            name = "data_r%d" % i
            data_r = Signal(self.rwid, name=name, reset_less=True)
            latchregister(m, self.alu.out[i], data_r, wri_l.q[i], name)
            drl.append(data_r)

        # and one for the output from the ALU (for the EA)
        addr_r = Signal(self.rwid, reset_less=True)  # Effective Address Latch
        latchregister(m, self.alu.o, addr_r, alulatch, "ea_r")

        # and pass the operation to the ALU
        comb += self.alu.op.eq(oper_r)
        comb += self.alu.op.insn_type.eq(InternalOp.OP_ADD)  # override insn_type

        # ok let's connect (and name) the 3 src latched regs created above
        comb += self.alu.i[0].eq(srl[0])  # Op1 goes straight to ALU input 1
        op2 = srl[1]  # op2 needs to be muxed (imm select)
        st_data = srl[2]  # op3 is for STORE operations

        # select immediate if opcode says so (and put that into ALU input 2)
        op_is_imm = oper_r.imm_data.imm_ok
        src2_or_imm = Signal(self.rwid, reset_less=True)
        m.d.comb += src2_or_imm.eq(Mux(op_is_imm, oper_r.imm_data.imm, op2))
        comb += self.alu.i[1].eq(src2_or_imm)  # src2_or_imm into ALU input 2

        # outputs: busy and release signals
        busy_o = self.busy_o
        comb += self.busy_o.eq(opc_l.q)  # busy out
        # busy_o is 1 bit: replicate so every src port gets its release
        comb += self.rd.rel.eq(src_l.q & Repl(busy_o, self.n_src))
        comb += self.sto_rel_o.eq(sto_l.q & busy_o & self.shadown_i & op_is_st)

        # request release enabled based on if op is a LD/ST or a plain ALU
        # if op is an ADD/SUB or a LD, req_rel activates.
        # NOTE(review): wr_q is 1 bit but wri_l.q / wr.rel are n_dst wide
        wr_q = Signal(reset_less=True)
        comb += wr_q.eq(wri_l.q & (~op_ldst | op_is_ld))

        comb += alulatch.eq((op_ldst & self.adr_rel_o) |
                            (~op_ldst & self.wr.rel))

        # select immediate if opcode says so.  however also change the latch
        # to trigger *from* the opcode latch instead.
        comb += src_sel.eq(Mux(op_is_imm, opc_l.qn, src_l.q))
        # NOTE(review): second comb assignment to src2_or_imm (the first is
        # above, from the latched op2); kept in the original order.
        comb += src2_or_imm.eq(Mux(op_is_imm, oper_r.imm_data.imm,
                                   self.src2_i))

        # create a latch/register for src1/src2 (include immediate select)
        latchregister(m, self.src1_i, self.alu.a, src_l.q, name="src1_r")
        latchregister(m, self.src2_i, src2_r, src_l.q, name="src2_r")
        latchregister(m, src2_or_imm, self.alu.b, src_sel, name="imm_r")

        # decode bits of operand (latched)
        comb += op_is_st.eq(oper_r.insn_type == InternalOp.OP_STORE)  # ST
        comb += op_is_ld.eq(oper_r.insn_type == InternalOp.OP_LOAD)  # LD
        op_is_update = oper_r.update  # UPDATE (not used yet)
        comb += op_ldst.eq(op_is_ld | op_is_st)
        comb += self.load_mem_o.eq(op_is_ld & self.go_ad_i)
        comb += self.stwd_mem_o.eq(op_is_st & self.go_st_i)
        comb += self.ld_o.eq(op_is_ld)
        comb += self.st_o.eq(op_is_st)

        # on a go_read, tell the ALU we're accepting data.
        # NOTE: this spells TROUBLE if the ALU isn't ready!
        # go_read is only valid for one clock!
        with m.If(self.rd.go):  # src operands ready, GO!
            with m.If(~self.alu.p_ready_o):  # no ACK yet
                m.d.comb += self.alu.p_valid_i.eq(1)  # so indicate valid

        # only proceed if ALU says its output is valid
        with m.If(self.alu.n_valid_o):
            # write req release out.  waits until shadow is dropped.
            comb += self.wr.rel.eq(wr_q & busy_o & self.shadown_i)
            # address release only happens on LD/ST, and is shadowed.
            comb += self.adr_rel_o.eq(adr_l.q & busy_o &
                                      self.shadown_i)
            # when output latch is ready, and ALU says ready, accept ALU output
            with m.If(self.wr.rel):
                # tells ALU "thanks got it"
                m.d.comb += self.alu.n_ready_i.eq(1)

        # provide "done" signal: select req_rel for non-LD/ST, adr_rel for LD/ST
        comb += self.done_o.eq((self.wr.rel & ~op_ldst) |
                               (self.adr_rel_o & op_ldst))

        # NOTE(review): data_r below is the variable left over from the
        # dest-latch loop above, i.e. the *last* latch (drl[-1]) -- confirm
        # that is the intended source for both data_o and addr_o.

        # put the register directly onto the output bus on a go_write
        # this is "ALU mode".  go_wr_i *must* be deasserted on next clock
        with m.If(self.wr.go):
            comb += self.data_o.eq(data_r)

        # "LD/ST" mode: put the register directly onto the *address* bus
        with m.If(self.go_ad_i | self.go_st_i):
            comb += self.addr_o.eq(data_r)

        # TODO: think about moving these to another module

        if self.debugtest:
            return m

        # connect ST to memory.  NOTE: unit *must* be set back
        # to start again by dropping go_st_i on next clock
        with m.If(self.stwd_mem_o):
            wrport = self.mem.wrport
            comb += wrport.addr.eq(self.addr_o)
            comb += wrport.data.eq(src2_r)
            comb += wrport.en.eq(1)

        # connect LD to memory.  NOTE: unit *must* be set back
        # to start again by dropping go_ad_i on next clock
        rdport = self.mem.rdport
        ldd_r = Signal(self.rwid, reset_less=True)  # Dest register
        # latch LD-out
        latchregister(m, rdport.data, ldd_r, ldlatch, "ldo_r")
        sync += ldlatch.eq(self.load_mem_o)
        with m.If(self.load_mem_o):
            comb += rdport.addr.eq(self.addr_o)
            # comb += rdport.en.eq(1) # only when transparent=False

        # if LD-latch, put ld-reg out onto output
        with m.If(ldlatch | self.load_mem_o):
            comb += self.data_o.eq(ldd_r)

        return m

    def __iter__(self):
        """Yield the unit's external signals (control in, operation,
        operands, then control/data out)."""
        yield self.rd.go
        yield self.go_ad_i
        yield self.wr.go
        yield self.go_st_i
        yield self.issue_i
        yield self.isalu_i
        yield self.shadown_i
        yield self.go_die_i
        yield from self.oper_i.ports()
        yield from self.src_i
        yield self.busy_o
        yield self.rd.rel
        yield self.adr_rel_o
        yield self.sto_rel_o
        yield self.wr.rel
        yield self.data_o
        yield self.load_mem_o
        yield self.stwd_mem_o

    def ports(self):
        """Return the signals produced by __iter__ as a list."""
        return list(self)
455
456
def wait_for(sig):
    """Simulation helper: advance one clock at a time until `sig` reads
    as true.

    The value of `sig` is read (and printed) once up front; after that at
    least one clock always elapses before the value is tested.
    """
    current = (yield sig)
    print("wait for", sig, current)
    asserted = False
    while not asserted:
        yield  # advance one clock
        current = (yield sig)
        print(current)
        asserted = bool(current)
466
467
def store(dut, src1, src2, imm, imm_ok=True):
    """Simulation process: issue one OP_STORE to `dut` and step it
    through the read / address / store handshakes.

    src1 and src2 are placed on src1_i / src2_i; imm and imm_ok go into
    the operation's imm_data field.
    """
    # present the operation and operands, pulse issue for one clock
    yield dut.oper_i.insn_type.eq(InternalOp.OP_STORE)
    yield dut.src1_i.eq(src1)
    yield dut.src2_i.eq(src2)
    yield dut.oper_i.imm_data.imm.eq(imm)
    yield dut.oper_i.imm_data.imm_ok.eq(imm_ok)
    yield dut.issue_i.eq(1)
    yield
    yield dut.issue_i.eq(0)
    yield
    # read handshake
    yield dut.rd.go.eq(0b11)
    yield from wait_for(dut.rd.rel)
    yield dut.rd.go.eq(0)
    # address then store handshake
    yield from wait_for(dut.adr_rel_o)
    yield dut.go_st_i.eq(1)
    yield from wait_for(dut.sto_rel_o)
    # The original called wait_for(dut.stwd_mem_o) here without
    # "yield from": that only created a generator and never ran it.
    # Deliberately not waited on, as in load() and add().
    # wait_for(dut.stwd_mem_o)
    yield dut.go_st_i.eq(0)
    yield
487
488
def load(dut, src1, src2, imm, imm_ok=True):
    """Simulation process: issue one OP_LOAD to `dut`, step it through
    the read / address handshakes and return the value seen on data_o.
    """
    # operation, operands and immediate, then raise issue
    setup = (
        dut.oper_i.insn_type.eq(InternalOp.OP_LOAD),
        dut.src1_i.eq(src1),
        dut.src2_i.eq(src2),
        dut.oper_i.imm_data.imm.eq(imm),
        dut.oper_i.imm_data.imm_ok.eq(imm_ok),
        dut.issue_i.eq(1),
    )
    for stmt in setup:
        yield stmt
    yield
    yield dut.issue_i.eq(0)
    yield

    # read handshake
    yield dut.rd.go.eq(0b11)
    yield from wait_for(dut.rd.rel)
    yield dut.rd.go.eq(0)

    # address handshake
    yield from wait_for(dut.adr_rel_o)
    yield dut.go_ad_i.eq(1)
    yield from wait_for(dut.busy_o)
    yield

    # sample the result before dropping go_ad_i
    result = (yield dut.data_o)
    yield dut.go_ad_i.eq(0)
    # wait_for(dut.stwd_mem_o)
    return result
510
511
def add(dut, src1, src2, imm, imm_ok=False):
    """Simulation process: issue one OP_ADD to `dut`, step it through
    the read / write handshakes and return the value seen on data_o.
    """
    # operation, operands and immediate, then raise issue
    setup = (
        dut.oper_i.insn_type.eq(InternalOp.OP_ADD),
        dut.src1_i.eq(src1),
        dut.src2_i.eq(src2),
        dut.oper_i.imm_data.imm.eq(imm),
        dut.oper_i.imm_data.imm_ok.eq(imm_ok),
        dut.issue_i.eq(1),
    )
    for stmt in setup:
        yield stmt
    yield
    yield dut.issue_i.eq(0)
    yield

    # read handshake
    yield dut.rd.go.eq(1)
    yield from wait_for(dut.rd.rel)
    yield dut.rd.go.eq(0)

    # write handshake
    yield from wait_for(dut.wr.rel)
    yield dut.wr.go.eq(1)
    yield from wait_for(dut.busy_o)
    yield

    # sample the result before dropping wr.go
    result = (yield dut.data_o)
    yield dut.wr.go.eq(0)
    yield
    # wait_for(dut.stwd_mem_o)
    return result
534
535
def scoreboard_sim(dut):
    """Simulation process: two stores, two loads reading them back,
    then a register add and an immediate add, each result asserted.
    """
    # two STs (different addresses)
    for store_args in ((4, 3, 2), (2, 9, 2)):
        yield from store(dut, *store_args)
    yield

    # two LDs (deliberately LD from the 1st address then 2nd)
    for base, expected in ((4, 0x0003), (2, 0x0009)):
        loaded = yield from load(dut, base, 0, 2)
        assert loaded == expected
    yield

    # now do an add
    total = yield from add(dut, 4, 3, 0xfeed)
    assert total == 0x7

    # and an add-immediate
    total = yield from add(dut, 4, 0xdeef, 2, imm_ok=True)
    assert total == 0x6
555
556
class TestLDSTCompUnit(LDSTCompUnit):
    """LDSTCompUnit wired to its own ALU and TestMemory, for simulation."""

    def __init__(self, rwid):
        # Absolute import, matching the module-level
        # "from soc.experiment.alu_hier import ..." line; the bare
        # "alu_hier" form only resolves when run from that directory.
        from soc.experiment.alu_hier import ALU
        self.alu = alu = ALU(rwid)
        self.mem = mem = TestMemory(rwid, 8)
        LDSTCompUnit.__init__(self, rwid, alu, mem)

    def elaborate(self, platform):
        """Elaborate the base unit, then add the memory as a submodule
        (the base class leaves that to its user)."""
        m = LDSTCompUnit.elaborate(self, platform)
        m.submodules.mem = self.mem
        return m
569
570
def test_scoreboard():
    """Write the RTLIL for a 16-bit TestLDSTCompUnit to
    test_ldst_comp.il, then run scoreboard_sim against it with a VCD
    trace."""
    dut = TestLDSTCompUnit(16)

    # RTLIL dump
    rtlil_text = rtlil.convert(dut, ports=dut.ports())
    with open("test_ldst_comp.il", "w") as il_file:
        il_file.write(rtlil_text)

    # simulation
    sim_process = scoreboard_sim(dut)
    run_simulation(dut, sim_process, vcd_name='test_ldst_comp.vcd')
579
580
# script entry point: run the scoreboard test directly
if __name__ == "__main__":
    test_scoreboard()