new version of LDSTCompUnit
[soc.git] / src / soc / experiment / compldst_multi.py
1 """ LOAD / STORE Computation Unit.
2
3 This module covers POWER9-compliant Load and Store operations,
4 with selection on each between immediate and indexed mode as
5 options for the calculation of the Effective Address (EA),
6 and also "update" mode which optionally stores that EA into
7 an additional register.
8
9 Stores are activated when Go_Store is enabled, and use the ALU to
10 compute the "Effective Address", and, when ready (go_st_i and the
11 ALU ready) the operand (src3_i) is stored at the computed address.
12
13 Loads are activated when Go_Write[0] is enabled. They also use the ALU
14 to compute the EA, and the data comes out (at any time from the
15 PortInterface), and is captured by the LDSTCompUnit.
16
17 Both LD and ST may request that the address be computed from summing
18 operand1 (src[0]) with operand2 (src[1]) *or* by summing operand1 with
19 the immediate (from the opcode).
20
21 Both LD and ST may also request "update" mode (op_is_update) which
22 activates the use of Go_Write[1] to control storage of the EA into
23 a *second* operand in the register file.
24
25 Thus this module has *TWO* write-requests to the register file and
26 *THREE* read-requests to the register file.
27
28 It's a multi-level Finite State Machine that (unfortunately) nmigen.FSM
29 is not suited to (nmigen.FSM is clock-driven, and some aspects of
30 the FSM below are *combinatorial*).
31
32 * One FSM covers Operand collection and communication address-side
33 with the LD/ST PortInterface. its role ends when "RD_DONE" is asserted
34
35 * A second FSM activates to cover LD. it activates if op_is_ld is true
36
37 * A third FSM activates to cover ST. it activates if op_is_st is true
38
39 * The "overall" (fourth) FSM coordinates the progression and completion
40 of the three other FSMs, firing "WR_RESET" which switches off "busy"
41
42 Full diagram:
43 https://libre-soc.org/3d_gpu/ld_st_comp_unit.jpg
44
45 Links including to walk-through videos:
46 * https://libre-soc.org/3d_gpu/architecture/6600scoreboard/
47 """
48
49 from nmigen.compat.sim import run_simulation
50 from nmigen.cli import verilog, rtlil
51 from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array
52 from nmigen.hdl.rec import Record, Layout
53
54 from nmutil.latch import SRLatch, latchregister
55
56 from soc.experiment.compalu_multi import go_record
57 from soc.experiment.l0_cache import PortInterface
58 from soc.experiment.testmem import TestMemory
59 from soc.decoder.power_enums import InternalOp
60
61 from soc.experiment.alu_hier import CompALUOpSubset
62
63 from soc.decoder.power_enums import InternalOp, Function
64
65
class CompLDSTOpSubset(Record):
    """CompLDSTOpSubset

    Holds the subset of Decode2Execute1Type fields that LD/ST operations
    need.  Populate it with eq_from_execute1 (below).
    """
    def __init__(self, name=None):
        imm_layout = Layout((("imm", 64), ("imm_ok", 1)))
        fields = (
            ('insn_type', InternalOp),
            ('imm_data', imm_layout),
            ('is_32bit', 1),
            ('is_signed', 1),
            ('data_len', 4),  # TODO: should be in separate CompLDSTSubset
            ('byte_reverse', 1),
            ('sign_extend', 1),
            ('update', 1),
        )

        Record.__init__(self, Layout(fields), name=name)

        # grrr.  Record does not have kwargs, so reset_less has to be
        # applied afterwards, field by field (every field except imm_data)
        for sig in self.ports():
            sig.reset_less = True

    def eq_from_execute1(self, other):
        """Return the list of assignments copying every field of this
        record from the same-named field of `other`
        (a Decode2Execute1Type).
        """
        return [sig.eq(other.fields[fname])
                for fname, sig in self.fields.items()]

    def ports(self):
        """Return the plain (non-nested) fields of the record as a list."""
        names = ('insn_type', 'is_32bit', 'is_signed', 'data_len',
                 'byte_reverse', 'sign_extend', 'update')
        return [getattr(self, fname) for fname in names]
112
113
class LDSTCompUnit(Elaboratable):
    """ LOAD / STORE Computation Unit

    Inputs
    ------

    * :rwid:      register width
    * :alu:       an ALU module
    * :mem:       a Memory Module (read-write capable)
    * :debugtest: when set, elaborate() returns before the memory
                  read/write ports are connected
    * :src_i:     Source Operands (RA/RB/RC) - managed by rd[0-2] go/req

    Control Signals (In)
    --------------------

    * :oper_i:     operation being carried out (POWER9 decode LD/ST subset)
    * :issue_i:    LD/ST is being "issued".
    * :isalu_i:    issue as a plain ALU operation (ORed with issue_i)
    * :shadown_i:  Inverted-shadow is being held (stops STORE *and* WRITE)
    * :go_rd_i:    read is being actioned (latches in src regs)
    * :go_wr_i:    write mode (exactly like ALU CompUnit)
    * :go_ad_i:    address is being actioned (triggers actual mem LD)
    * :go_st_i:    store is being actioned (triggers actual mem STORE)
    * :go_die_i:   resets the unit back to "wait for issue"

    Control Signals (Out)
    ---------------------

    * :busy_o:      function unit is busy
    * :rd_rel_o:    request src1/src2
    * :adr_rel_o:   request address (from mem)
    * :sto_rel_o:   request store (to mem)
    * :req_rel_o:   request write (result)
    * :load_mem_o:  activate memory LOAD
    * :stwd_mem_o:  activate memory STORE

    Note: load_mem_o, stwd_mem_o and req_rel_o MUST all be acknowledged
    in a single cycle and the CompUnit set back to doing another op.
    This means deasserting go_st_i, go_ad_i or go_wr_i as appropriate
    depending on whether the operation is a STORE, LD, or a straight
    ALU operation respectively.

    Control Data (out)
    ------------------
    * :data_o:     Dest out (LD)          - managed by wr[0] go/req
    * :addr_o:     Address out (LD or ST) - managed by wr[1] go/req
    """

    def __init__(self, rwid, alu, mem, debugtest=False):
        self.rwid = rwid
        self.alu = alu
        self.mem = mem
        self.debugtest = debugtest

        # POWER-compliant LD/ST has index and update: *fixed* number of ports
        self.n_src = n_src = 3  # RA, RB, RT/RS
        self.n_dst = n_dst = 2  # RA, RT/RS

        self.counter = Signal(4)

        # names are numbered from 1 to match src1/src2 and dest1/dest2
        src = [Signal(rwid, name="src%d_i" % (i + 1), reset_less=True)
               for i in range(n_src)]
        dst = [Signal(rwid, name="dest%d_i" % (i + 1), reset_less=True)
               for i in range(n_dst)]

        self.rd = go_record(n_src, name="rd")  # read in, req out
        self.wr = go_record(n_dst, name="wr")  # write in, req out
        self.go_rd_i = self.rd.go  # temporary naming
        self.go_wr_i = self.wr.go  # temporary naming
        self.rd_rel_o = self.rd.rel  # temporary naming
        self.req_rel_o = self.wr.rel  # temporary naming

        self.ad = go_record(1, name="ad")  # address go in, req out
        self.st = go_record(1, name="st")  # store go in, req out
        self.go_ad_i = self.ad.go  # temp naming: go address in
        self.go_st_i = self.st.go  # temp naming: go store in
        self.issue_i = Signal(reset_less=True)  # fn issue in
        self.isalu_i = Signal(reset_less=True)  # fn issue as ALU in
        self.shadown_i = Signal(reset=1)  # shadow function, defaults to ON
        self.go_die_i = Signal()  # go die (reset)

        # operation / data input
        # NOTE(review): elaborate() reads `update` from the latched copy of
        # this record; CompLDSTOpSubset declares that field - confirm that
        # CompALUOpSubset does too, or switch both records over.
        self.oper_i = CompALUOpSubset()  # operand
        self.src_i = Array(src)
        self.src1_i = src[0]  # oper1 in: RA
        self.src2_i = src[1]  # oper2 in: RB
        self.src3_i = src[2]  # oper3 in: RC (RS)

        # outputs
        self.busy_o = Signal(reset_less=True)  # fn busy out
        self.dest = Array(dst)
        self.data_o = dst[0]  # Dest1 out: RT

        self.adr_rel_o = self.ad.rel  # request address (from mem)
        self.sto_rel_o = self.st.rel  # request store (to mem)
        self.done_o = Signal(reset_less=True)  # final release signal
        self.addr_o = dst[1]  # Address out (LD or ST) - Update => RA

        # hmm... TODO... move these to outside of LDSTCompUnit?
        self.load_mem_o = Signal(reset_less=True)  # activate memory LOAD
        self.stwd_mem_o = Signal(reset_less=True)  # activate memory STORE
        self.ld_o = Signal(reset_less=True)  # operation is a LD
        self.st_o = Signal(reset_less=True)  # operation is a ST

    def elaborate(self, platform):
        """Build the latch-based control logic, ALU hookup and (unless
        debugtest is set) the memory read/write port connections.
        """
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        m.submodules.alu = self.alu
        #m.submodules.mem = self.mem
        m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
        # multi-bit latches: width is passed positionally, after `sync`
        m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
        m.submodules.adr_l = adr_l = SRLatch(sync=False, name="adr")
        m.submodules.sto_l = sto_l = SRLatch(sync=False, name="sto")
        m.submodules.wri_l = wri_l = SRLatch(False, self.n_dst, name="req")
        # these three latches are instantiated but nothing below drives
        # or reads them yet
        m.submodules.alu_l = SRLatch(sync=False, name="alu")
        m.submodules.lod_l = SRLatch(sync=False, name="lod")
        m.submodules.rst_l = SRLatch(sync=False, name="rst")

        # shadow/go_die
        reset_b = Signal(reset_less=True)
        reset_w = Signal(self.n_dst, reset_less=True)  # reset write
        reset_a = Signal(reset_less=True)  # reset adr latch
        reset_s = Signal(reset_less=True)
        reset_r = Signal(reset_less=True)
        comb += reset_b.eq(self.go_st_i | self.wr.go |
                           self.go_ad_i | self.go_die_i)
        comb += reset_w.eq(self.wr.go | self.go_die_i)
        comb += reset_s.eq(self.go_st_i | self.go_die_i)
        comb += reset_r.eq(self.rd.go | self.go_die_i)
        comb += reset_a.eq(self.go_ad_i | self.go_die_i)

        # opcode decode
        op_is_ld = Signal(reset_less=True)
        op_is_st = Signal(reset_less=True)
        op_ldst = Signal(reset_less=True)  # either a LD or a ST
        op_is_imm = Signal(reset_less=True)

        # ALU/LD data output control
        alulatch = Signal(reset_less=True)
        ldlatch = Signal(reset_less=True)

        # src2 register
        src2_r = Signal(self.rwid, reset_less=True)

        # select immediate or src2 reg to add
        src2_or_imm = Signal(self.rwid, reset_less=True)
        src_sel = Signal(reset_less=True)

        # issue can be either issue_i or issue_alu_i (isalu_i)
        issue_i = Signal(reset_less=True)
        comb += issue_i.eq(self.issue_i | self.isalu_i)

        # Ripple-down the latches, each one set cancels the previous.
        # NOTE: use sync to stop combinatorial loops.

        # opcode latch - inverted so that busy resets to 0
        sync += opc_l.s.eq(issue_i)  # XXX NOTE: INVERTED FROM book!
        sync += opc_l.r.eq(reset_b)  # XXX NOTE: INVERTED FROM book!

        # src operand latch
        sync += src_l.s.eq(issue_i)
        sync += src_l.r.eq(reset_r)

        # addr latch
        sync += adr_l.s.eq(self.rd.go)
        sync += adr_l.r.eq(reset_a)

        # dest operand latch
        sync += wri_l.s.eq(self.go_ad_i | self.go_st_i | self.wr.go)
        sync += wri_l.r.eq(reset_w)

        # store latch
        sync += sto_l.s.eq(self.rd.go)  # XXX not sure which
        sync += sto_l.r.eq(reset_s)

        # create a latch/register for the operand
        oper_r = CompALUOpSubset()  # Dest register
        latchregister(m, self.oper_i, oper_r, self.issue_i, name="oper_r")

        # and for each output from the ALU, gated by the matching bit of
        # the write (dest operand) latch
        # NOTE(review): assumes the ALU exposes an indexable `out` - confirm
        drl = []
        for i in range(self.n_dst):
            name = "data_r%d" % i
            data_r = Signal(self.rwid, name=name, reset_less=True)
            latchregister(m, self.alu.out[i], data_r, wri_l.q[i], name)
            drl.append(data_r)

        # and one for the output from the ALU (for the EA)
        # NOTE(review): addr_r is latched here but not read anywhere below
        addr_r = Signal(self.rwid, reset_less=True)  # Effective Address Latch
        latchregister(m, self.alu.o, addr_r, alulatch, "ea_r")

        # and pass the operation to the ALU
        comb += self.alu.op.eq(oper_r)
        comb += self.alu.op.insn_type.eq(InternalOp.OP_ADD)  # override type

        # outputs: busy and release signals
        busy_o = self.busy_o
        comb += self.busy_o.eq(opc_l.q)  # busy out
        comb += self.rd.rel.eq(src_l.q & busy_o)  # src1/src2 req rel
        comb += self.sto_rel_o.eq(sto_l.q & busy_o &
                                  self.shadown_i & op_is_st)

        # request release enabled based on if op is a LD/ST or a plain ALU
        # if op is an ADD/SUB or a LD, req_rel activates.
        # NOTE(review): wr_q is 1 bit wide while wri_l.q is n_dst wide
        wr_q = Signal(reset_less=True)
        comb += wr_q.eq(wri_l.q & (~op_ldst | op_is_ld))

        comb += alulatch.eq((op_ldst & self.adr_rel_o) |
                            (~op_ldst & self.wr.rel))

        # select immediate if opcode says so.  however also change the latch
        # to trigger *from* the opcode latch instead.
        comb += src_sel.eq(Mux(op_is_imm, opc_l.qn, src_l.q))
        comb += src2_or_imm.eq(Mux(op_is_imm, oper_r.imm_data.imm,
                                   self.src2_i))

        # create a latch/register for src1/src2 (include immediate select)
        latchregister(m, self.src1_i, self.alu.a, src_l.q, name="src1_r")
        latchregister(m, self.src2_i, src2_r, src_l.q, name="src2_r")
        latchregister(m, src2_or_imm, self.alu.b, src_sel, name="imm_r")

        # decode bits of operand (latched)
        comb += op_is_imm.eq(oper_r.imm_data.imm_ok)  # IMM mode
        comb += op_is_st.eq(oper_r.insn_type == InternalOp.OP_STORE)  # ST
        comb += op_is_ld.eq(oper_r.insn_type == InternalOp.OP_LOAD)  # LD
        # NOTE(review): not used below yet; requires the operation record
        # to carry an `update` field - confirm for CompALUOpSubset
        op_is_update = oper_r.update  # UPDATE
        comb += op_ldst.eq(op_is_ld | op_is_st)
        comb += self.load_mem_o.eq(op_is_ld & self.go_ad_i)
        comb += self.stwd_mem_o.eq(op_is_st & self.go_st_i)
        comb += self.ld_o.eq(op_is_ld)
        comb += self.st_o.eq(op_is_st)

        # on a go_read, tell the ALU we're accepting data.
        # NOTE: this spells TROUBLE if the ALU isn't ready!
        #       go_read is only valid for one clock!
        with m.If(self.rd.go):  # src operands ready, GO!
            with m.If(~self.alu.p_ready_o):  # no ACK yet
                m.d.comb += self.alu.p_valid_i.eq(1)  # so indicate valid

        # only proceed if ALU says its output is valid
        with m.If(self.alu.n_valid_o):
            # write req release out.  waits until shadow is dropped.
            comb += self.wr.rel.eq(wr_q & busy_o & self.shadown_i)
            # address release only happens on LD/ST, and is shadowed.
            comb += self.adr_rel_o.eq(adr_l.q & busy_o &
                                      self.shadown_i)
            # when output latch is ready, and ALU says ready, accept ALU output
            with m.If(self.wr.rel):
                # tells ALU "thanks got it"
                m.d.comb += self.alu.n_ready_i.eq(1)

        # provide "done" signal: select req_rel for non-LD/ST, adr_rel for LD/ST
        comb += self.done_o.eq((self.wr.rel & ~op_ldst) |
                               (self.adr_rel_o & op_ldst))

        # NOTE(review): both buses below are driven from the *last*
        # ALU-output latch.  confirm whether data_o should take drl[0]
        # and addr_o the EA latch (addr_r) instead.
        last_data_r = drl[-1]

        # put the register directly onto the output bus on a go_write
        # this is "ALU mode".  go_wr_i *must* be deasserted on next clock
        with m.If(self.wr.go):
            comb += self.data_o.eq(last_data_r)

        # "LD/ST" mode: put the register directly onto the *address* bus
        with m.If(self.go_ad_i | self.go_st_i):
            comb += self.addr_o.eq(last_data_r)

        # TODO: think about moving these to another module

        if self.debugtest:
            return m

        # connect ST to memory.  NOTE: unit *must* be set back
        # to start again by dropping go_st_i on next clock
        with m.If(self.stwd_mem_o):
            wrport = self.mem.wrport
            comb += wrport.addr.eq(self.addr_o)
            comb += wrport.data.eq(src2_r)
            comb += wrport.en.eq(1)

        # connect LD to memory.  NOTE: unit *must* be set back
        # to start again by dropping go_ad_i on next clock
        rdport = self.mem.rdport
        ldd_r = Signal(self.rwid, reset_less=True)  # Dest register
        # latch LD-out
        latchregister(m, rdport.data, ldd_r, ldlatch, "ldo_r")
        sync += ldlatch.eq(self.load_mem_o)
        with m.If(self.load_mem_o):
            comb += rdport.addr.eq(self.addr_o)
            # comb += rdport.en.eq(1) # only when transparent=False

        # if LD-latch, put ld-reg out onto output
        with m.If(ldlatch | self.load_mem_o):
            comb += self.data_o.eq(ldd_r)

        return m

    def __iter__(self):
        """Yield the control inputs, operation fields, source operands
        and outputs that make up the unit's port list.
        """
        yield self.rd.go
        yield self.go_ad_i
        yield self.wr.go
        yield self.go_st_i
        yield self.issue_i
        yield self.isalu_i
        yield self.shadown_i
        yield self.go_die_i
        yield from self.oper_i.ports()
        yield from self.src_i
        yield self.busy_o
        yield self.rd.rel
        yield self.adr_rel_o
        yield self.sto_rel_o
        yield self.wr.rel
        yield self.data_o
        yield self.load_mem_o
        yield self.stwd_mem_o

    def ports(self):
        """Return everything __iter__ yields, as a list."""
        return list(self)
433
434
def wait_for(sig):
    """Simulation helper: advance one clock at a time until `sig` reads
    as true.

    The signal is sampled (and printed) once up front; that first sample
    never ends the wait, so at least one clock always elapses.
    """
    value = yield sig
    print("wait for", sig, value)
    seen = False
    while not seen:
        yield  # one clock
        value = yield sig
        print(value)
        seen = bool(value)
444
445
def store(dut, src1, src2, imm, imm_ok=True):
    """Simulation process driving one STORE through `dut`.

    Sets up an OP_STORE with the given operands and immediate, pulses
    issue_i for one clock, then walks the rd.go / go_st_i handshake,
    waiting on rd.rel, adr_rel_o and sto_rel_o in turn.
    """
    yield dut.oper_i.insn_type.eq(InternalOp.OP_STORE)
    yield dut.src1_i.eq(src1)
    yield dut.src2_i.eq(src2)
    yield dut.oper_i.imm_data.imm.eq(imm)
    yield dut.oper_i.imm_data.imm_ok.eq(imm_ok)
    yield dut.issue_i.eq(1)
    yield
    yield dut.issue_i.eq(0)
    yield
    yield dut.rd.go.eq(0b11)
    yield from wait_for(dut.rd.rel)
    yield dut.rd.go.eq(0)
    yield from wait_for(dut.adr_rel_o)
    yield dut.go_st_i.eq(1)
    yield from wait_for(dut.sto_rel_o)
    # deliberately *not* waiting on stwd_mem_o here (as in load/add).
    # a bare wait_for(...) call without "yield from" only creates a
    # generator and never runs it, so it must not be written that way.
    # wait_for(dut.stwd_mem_o)
    yield dut.go_st_i.eq(0)
    yield
465
466
def load(dut, src1, src2, imm, imm_ok=True):
    """Simulation process driving one LOAD through `dut`.

    Sets up an OP_LOAD, pulses issue_i for one clock, walks the
    rd.go / go_ad_i handshake and returns the value sampled on data_o.
    """
    oper = dut.oper_i
    setup = (
        oper.insn_type.eq(InternalOp.OP_LOAD),
        dut.src1_i.eq(src1),
        dut.src2_i.eq(src2),
        oper.imm_data.imm.eq(imm),
        oper.imm_data.imm_ok.eq(imm_ok),
        dut.issue_i.eq(1),
    )
    for stmt in setup:
        yield stmt
    yield  # issue is held for one clock
    yield dut.issue_i.eq(0)
    yield

    # read-operand handshake
    yield dut.rd.go.eq(0b11)
    yield from wait_for(dut.rd.rel)
    yield dut.rd.go.eq(0)

    # address handshake
    yield from wait_for(dut.adr_rel_o)
    yield dut.go_ad_i.eq(1)
    yield from wait_for(dut.busy_o)
    yield
    result = yield dut.data_o
    yield dut.go_ad_i.eq(0)
    # wait_for(dut.stwd_mem_o)
    return result
488
489
def add(dut, src1, src2, imm, imm_ok=False):
    """Simulation process driving one plain ADD through `dut`.

    Sets up an OP_ADD, pulses issue_i for one clock, walks the
    rd.go / wr.go handshake and returns the value sampled on data_o.
    """
    oper = dut.oper_i
    setup = (
        oper.insn_type.eq(InternalOp.OP_ADD),
        dut.src1_i.eq(src1),
        dut.src2_i.eq(src2),
        oper.imm_data.imm.eq(imm),
        oper.imm_data.imm_ok.eq(imm_ok),
        dut.issue_i.eq(1),
    )
    for stmt in setup:
        yield stmt
    yield  # issue is held for one clock
    yield dut.issue_i.eq(0)
    yield

    # read-operand handshake
    yield dut.rd.go.eq(1)
    yield from wait_for(dut.rd.rel)
    yield dut.rd.go.eq(0)

    # write-result handshake
    yield from wait_for(dut.wr.rel)
    yield dut.wr.go.eq(1)
    yield from wait_for(dut.busy_o)
    yield
    result = yield dut.data_o
    yield dut.wr.go.eq(0)
    yield
    # wait_for(dut.stwd_mem_o)
    return result
512
513
def scoreboard_sim(dut):
    """Top-level simulation process: two stores, two loads that read the
    stored values back, then an add and an add-immediate.
    """
    # two STs (different addresses)
    for base, value in ((4, 3), (2, 9)):
        yield from store(dut, base, value, 2)
    yield

    # two LDs (deliberately LD from the 1st address then 2nd)
    for base, expected in ((4, 0x0003), (2, 0x0009)):
        data = yield from load(dut, base, 0, 2)
        assert data == expected
    yield

    # now do an add
    data = yield from add(dut, 4, 3, 0xfeed)
    assert data == 0x7

    # and an add-immediate
    data = yield from add(dut, 4, 0xdeef, 2, imm_ok=True)
    assert data == 0x6
533
534
class TestLDSTCompUnit(LDSTCompUnit):
    """LDSTCompUnit wired to its own ALU and an 8-entry-addressed
    TestMemory, for stand-alone simulation.
    """

    def __init__(self, rwid):
        # absolute path, matching the module-level alu_hier import: a bare
        # "alu_hier" only resolves when soc/experiment is on sys.path
        from soc.experiment.alu_hier import ALU
        LDSTCompUnit.__init__(self, rwid, ALU(rwid), TestMemory(rwid, 8))

    def elaborate(self, platform):
        """Elaborate the base unit, then add the memory as a submodule
        (the base class leaves that to its user).
        """
        m = LDSTCompUnit.elaborate(self, platform)
        m.submodules.mem = self.mem
        return m
547
548
def test_scoreboard():
    """Write the unit out as RTLIL, then run the scoreboard simulation
    with a VCD trace.
    """
    unit = TestLDSTCompUnit(16)

    il_text = rtlil.convert(unit, ports=unit.ports())
    with open("test_ldst_comp.il", "w") as il_file:
        il_file.write(il_text)

    run_simulation(unit, scoreboard_sim(unit),
                   vcd_name='test_ldst_comp.vcd')
557
558
# run the LD/ST CompUnit simulation when executed as a script
if __name__ == "__main__":
    test_scoreboard()