Extract the fetch FSM out from the main FSM
[soc.git] / src / soc / simple / issuer.py
1 """simple core issuer
2
3 not in any way intended for production use. this runs a FSM that:
4
5 * reads the Program Counter from StateRegs
6 * reads an instruction from a fixed-size Test Memory
7 * issues it to the Simple Core
8 * waits for it to complete
9 * increments the PC
10 * does it all over again
11
12 the purpose of this module is to verify the functional correctness
13 of the Function Units in the absolute simplest and clearest possible
14 way, and to provide something that can be further incrementally
15 improved.
16 """
17
18 from nmigen import (Elaboratable, Module, Signal, ClockSignal, ResetSignal,
19 ClockDomain, DomainRenamer)
20 from nmigen.cli import rtlil
21 from nmigen.cli import main
22 import sys
23
24 from soc.decoder.power_decoder import create_pdecode
25 from soc.decoder.power_decoder2 import PowerDecode2
26 from soc.decoder.decode2execute1 import IssuerDecode2ToOperand
27 from soc.decoder.decode2execute1 import Data
28 from soc.experiment.testmem import TestMemory # test only for instructions
29 from soc.regfile.regfiles import StateRegs, FastRegs
30 from soc.simple.core import NonProductionCore
31 from soc.config.test.test_loadstore import TestMemPspec
32 from soc.config.ifetch import ConfigFetchUnit
33 from soc.decoder.power_enums import MicrOp
34 from soc.debug.dmi import CoreDebug, DMIInterface
35 from soc.debug.jtag import JTAG
36 from soc.config.pinouts import get_pinspecs
37 from soc.config.state import CoreState
38 from soc.interrupts.xics import XICS_ICP, XICS_ICS
39 from soc.bus.simple_gpio import SimpleGPIO
40 from soc.clock.select import ClockSelect
41 from soc.clock.dummypll import DummyPLL
42
43
44 from nmutil.util import rising_edge
45
46
class TestIssuerInternal(Elaboratable):
    """TestIssuer - reads instructions from TestMemory and issues them

    efficiency and speed is not the main goal here: functional correctness is.

    elaborate() builds two cooperating FSMs:

    * a fetch FSM ("fetch_fsm": IDLE -> INSN_READ -> INSN_READY) which
      captures the PC, reads the MSR, reads the instruction memory and
      holds the fetched instruction in a register
    * a decode/issue/execute FSM (INSN_FETCH -> INSN_WAIT -> INSN_START
      -> INSN_ACTIVE) which asks the fetch FSM for an instruction, hands
      it to the decoder and core, waits for completion and writes PC+4
      unless the instruction itself wrote the PC

    the two are linked by a pair of valid/ready handshakes.
    """

    def __init__(self, pspec):
        """Create the core, decoder, fetch unit, debug and optional units.

        pspec is the parameter specification object.  the optional
        attributes "debug" (== 'jtag'), "xics" and "gpio" are inspected
        here.  NOTE: when JTAG is enabled, pspec is *modified*
        (wb_icache_en / wb_dcache_en are added to it).
        """

        # JTAG interface.  add this right at the start because if it's
        # added it *modifies* the pspec, by adding enable/disable signals
        # for parts of the rest of the core
        self.jtag_en = hasattr(pspec, "debug") and pspec.debug == 'jtag'
        if self.jtag_en:
            subset = {'uart', 'mtwi', 'eint', 'gpio', 'mspi0', 'mspi1',
                      'pwm', 'sd0', 'sdr'}
            self.jtag = JTAG(get_pinspecs(subset=subset))
            # add signals to pspec to enable/disable icache and dcache
            # (or data and instruction wishbone if icache/dcache not included)
            # https://bugs.libre-soc.org/show_bug.cgi?id=520
            # TODO: do we actually care if these are not domain-synchronised?
            # honestly probably not.
            pspec.wb_icache_en = self.jtag.wb_icache_en
            pspec.wb_dcache_en = self.jtag.wb_dcache_en

        # add interrupt controller?
        self.xics = hasattr(pspec, "xics") and pspec.xics == True
        if self.xics:
            self.xics_icp = XICS_ICP()
            self.xics_ics = XICS_ICS()
            self.int_level_i = self.xics_ics.int_level_i

        # add GPIO peripheral?
        self.gpio = hasattr(pspec, "gpio") and pspec.gpio == True
        if self.gpio:
            self.simple_gpio = SimpleGPIO()
            self.gpio_o = self.simple_gpio.gpio_o

        # main instruction core
        self.core = core = NonProductionCore(pspec)

        # instruction decoder.  goes into Trap Record
        pdecode = create_pdecode()
        self.cur_state = CoreState("cur")  # current state (MSR/PC/EINT)
        self.pdecode2 = PowerDecode2(pdecode, state=self.cur_state,
                                     opkls=IssuerDecode2ToOperand)

        # Test Instruction memory
        self.imem = ConfigFetchUnit(pspec).fu
        # one-row cache of instruction read
        self.iline = Signal(64)  # one instruction line
        self.iprev_adr = Signal(64)  # previous address: if different, do read

        # DMI interface
        self.dbg = CoreDebug()

        # instruction go/monitor
        self.pc_o = Signal(64, reset_less=True)
        self.pc_i = Data(64, "pc_i")  # set "ok" to indicate "please change me"
        self.core_bigendian_i = Signal()
        self.busy_o = Signal(reset_less=True)
        # exported as a port, but not driven anywhere in this class
        self.memerr_o = Signal(reset_less=True)

        # FAST regfile read /write ports for PC, MSR, DEC/TB
        staterf = self.core.regs.rf['state']
        self.state_r_pc = staterf.r_ports['cia']  # PC rd
        self.state_w_pc = staterf.w_ports['d_wr1']  # PC wr
        self.state_r_msr = staterf.r_ports['msr']  # MSR rd

        # DMI interface access
        intrf = self.core.regs.rf['int']
        crrf = self.core.regs.rf['cr']
        xerrf = self.core.regs.rf['xer']
        self.int_r = intrf.r_ports['dmi']  # INT read
        self.cr_r = crrf.r_ports['full_cr_dbg']  # CR read
        self.xer_r = xerrf.r_ports['full_xer']  # XER read

        # hack method of keeping an eye on whether branch/trap set the PC
        self.state_nia = self.core.regs.rf['state'].w_ports['nia']
        self.state_nia.wen.name = 'state_nia_wen'

    def elaborate(self, platform):
        """Build the issuer: submodules, reset logic, both FSMs, DMI reads.

        returns the nmigen Module.  local Signal / ClockDomain variable
        names are significant: nmigen derives the signal and domain names
        from them (cd_por -> "por", cd_sync -> "sync").
        """
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # the core runs in its own "coresync" clock domain
        m.submodules.core = core = DomainRenamer("coresync")(self.core)
        m.submodules.imem = imem = self.imem
        m.submodules.dbg = dbg = self.dbg
        if self.jtag_en:
            m.submodules.jtag = jtag = self.jtag
            # TODO: UART2GDB mux, here, from external pin
            # see https://bugs.libre-soc.org/show_bug.cgi?id=499
            sync += dbg.dmi.connect_to(jtag.dmi)

        cur_state = self.cur_state

        # XICS interrupt handler
        if self.xics:
            m.submodules.xics_icp = icp = self.xics_icp
            m.submodules.xics_ics = ics = self.xics_ics
            comb += icp.ics_i.eq(ics.icp_o)  # connect ICS to ICP
            sync += cur_state.eint.eq(icp.core_irq_o)  # connect ICP to core

        # GPIO test peripheral
        if self.gpio:
            m.submodules.simple_gpio = simple_gpio = self.simple_gpio

        # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl)
        # XXX causes litex ECP5 test to get wrong idea about input and output
        # (but works with verilator sim *sigh*)
        #if self.gpio and self.xics:
        #   comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0])

        # instruction decoder (the one created in __init__: a second
        # create_pdecode() here would only build an unused decoder)
        m.submodules.dec2 = pdecode2 = self.pdecode2

        # convenience
        d_reg, d_cr, d_xer = dbg.d_gpr, dbg.d_cr, dbg.d_xer
        intrf = self.core.regs.rf['int']

        # clock delay power-on reset
        cd_por = ClockDomain(reset_less=True)
        cd_sync = ClockDomain()
        core_sync = ClockDomain("coresync")
        m.domains += cd_por, cd_sync, core_sync

        # 2-bit down-counter in the reset-less "por" domain: starts at 3
        # and stops at zero
        ti_rst = Signal(reset_less=True)
        delay = Signal(range(4), reset=3)
        with m.If(delay != 0):
            m.d.por += delay.eq(delay - 1)
        comb += cd_por.clk.eq(ClockSignal())

        # power-on reset delay: hold the core in reset while the counter
        # is still running, or on a debug or external reset request.
        # the parentheses are essential: "|" binds tighter than "!=" in
        # python, so without them this would compare delay against the
        # OR of the reset sources and drop the reset when delay == 1.
        core_rst = ResetSignal("coresync")
        comb += ti_rst.eq((delay != 0) | dbg.core_rst_o | ResetSignal())
        comb += core_rst.eq(ti_rst)

        # busy/halted signals from core
        comb += self.busy_o.eq(core.busy_o)
        comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i)

        # temporary hack: says "go" immediately for both address gen and ST
        ldst = core.fus.fus['ldst0']
        st_go_edge = rising_edge(m, ldst.st.rel_o)
        m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o)  # link addr-go direct to rel
        m.d.comb += ldst.st.go_i.eq(st_go_edge)  # link store-go to rising rel

        # PC and instruction from I-Memory
        pc_changed = Signal()  # note write to PC
        comb += self.pc_o.eq(cur_state.pc)
        ilatch = Signal(32)

        # next instruction (+4 on current)
        nia = Signal(64, reset_less=True)
        comb += nia.eq(cur_state.pc + 4)

        # read the PC
        pc = Signal(64, reset_less=True)
        pc_ok_delay = Signal()
        sync += pc_ok_delay.eq(~self.pc_i.ok)
        with m.If(self.pc_i.ok):
            # incoming override (start from pc_i)
            comb += pc.eq(self.pc_i.data)
        with m.Else():
            # otherwise read StateRegs regfile for PC...
            comb += self.state_r_pc.ren.eq(1 << StateRegs.PC)
        # ... but on a 1-clock delay
        # NOTE(review): kept as a sibling of the If/Else above rather
        # than nested inside the Else - confirm against upstream
        with m.If(pc_ok_delay):
            comb += pc.eq(self.state_r_pc.data_o)

        # don't write pc every cycle
        comb += self.state_w_pc.wen.eq(0)
        comb += self.state_w_pc.data_i.eq(0)

        # don't read msr every cycle
        comb += self.state_r_msr.ren.eq(0)
        msr_read = Signal(reset=1)

        # connect up debug signals
        # TODO comb += core.icache_rst_i.eq(dbg.icache_rst_o)
        comb += dbg.terminate_i.eq(core.core_terminate_o)
        comb += dbg.state.pc.eq(pc)
        #comb += dbg.state.pc.eq(cur_state.pc)
        comb += dbg.state.msr.eq(cur_state.msr)

        # temporaries
        core_busy_o = core.busy_o  # core is busy
        core_ivalid_i = core.ivalid_i  # instruction is valid
        core_issue_i = core.issue_i  # instruction is issued
        dec_opcode_i = pdecode2.dec.raw_opcode_in  # raw opcode

        insn_type = core.e.do.insn_type

        # handshake signals between fetch and decode/execute
        # fetch FSM can run as soon as the PC is valid
        fetch_pc_valid_i = Signal()
        fetch_pc_ready_o = Signal()
        # when done, deliver the instruction to the next FSM
        fetch_insn_o = Signal(32, reset_less=True)
        fetch_insn_valid_o = Signal()
        fetch_insn_ready_i = Signal()

        # actually use a nmigen FSM for the first time (w00t)
        # this FSM is perhaps unusual in that it detects conditions
        # then "holds" information, combinatorially, for the core
        # (as opposed to using sync - which would be on a clock's delay)
        # this includes the actual opcode, valid flags and so on.
        with m.FSM(name='fetch_fsm'):

            # waiting (zzz)
            with m.State("IDLE"):
                with m.If(~dbg.core_stop_o & ~core_rst):
                    comb += fetch_pc_ready_o.eq(1)
                    with m.If(fetch_pc_valid_i):
                        # instruction allowed to go: start by reading the PC
                        # capture the PC and also drop it into Insn Memory
                        # we have joined a pair of combinatorial memory
                        # lookups together.  this is Generally Bad.
                        comb += self.imem.a_pc_i.eq(pc)
                        comb += self.imem.a_valid_i.eq(1)
                        comb += self.imem.f_valid_i.eq(1)
                        sync += cur_state.pc.eq(pc)

                        # initiate read of MSR.  arrives one clock later
                        comb += self.state_r_msr.ren.eq(1 << StateRegs.MSR)
                        sync += msr_read.eq(0)

                        m.next = "INSN_READ"  # move to "wait for bus" phase
                with m.Else():
                    # stop requested (or in reset): report "stopped"
                    comb += core.core_stopped_i.eq(1)
                    comb += dbg.core_stopped_i.eq(1)

            # dummy pause to find out why simulation is not keeping up
            with m.State("INSN_READ"):
                # one cycle later, msr read arrives.  valid only once.
                with m.If(~msr_read):
                    sync += msr_read.eq(1)  # yeah don't read it again
                    sync += cur_state.msr.eq(self.state_r_msr.data_o)
                with m.If(self.imem.f_busy_o):  # zzz...
                    # busy: stay in wait-read
                    comb += self.imem.a_valid_i.eq(1)
                    comb += self.imem.f_valid_i.eq(1)
                with m.Else():
                    # not busy: instruction fetched
                    f_instr_o = self.imem.f_instr_o
                    if f_instr_o.width == 32:
                        insn = f_instr_o
                    else:
                        # wider fetch line: PC bit 2 selects the 32-bit word
                        insn = f_instr_o.word_select(cur_state.pc[2], 32)
                    # capture and hold the instruction from memory
                    sync += fetch_insn_o.eq(insn)
                    m.next = "INSN_READY"

            with m.State("INSN_READY"):
                # hand over the instruction, to be decoded
                comb += fetch_insn_valid_o.eq(1)
                with m.If(fetch_insn_ready_i):
                    m.next = "IDLE"

        # decode / issue / execute FSM
        with m.FSM():

            # go fetch the instruction at the current PC
            # at this point, there is no instruction running, that
            # could inadvertently update the PC.
            with m.State("INSN_FETCH"):
                comb += fetch_pc_valid_i.eq(1)
                with m.If(fetch_pc_ready_o):
                    m.next = "INSN_WAIT"

            # decode the instruction when it arrives
            with m.State("INSN_WAIT"):
                comb += fetch_insn_ready_i.eq(1)
                with m.If(fetch_insn_valid_o):
                    # decode the instruction
                    comb += dec_opcode_i.eq(fetch_insn_o)  # actual opcode
                    sync += core.e.eq(pdecode2.e)
                    sync += core.state.eq(cur_state)
                    sync += core.raw_insn_i.eq(dec_opcode_i)
                    sync += core.bigendian_i.eq(self.core_bigendian_i)
                    # latch current insn: use the copy held and handed
                    # over by the fetch FSM, not the fetch unit's raw
                    # output, which is no longer being requested here
                    sync += ilatch.eq(fetch_insn_o)
                    # also drop PC and MSR into decode "state"
                    m.next = "INSN_START"  # move to "start"

            # waiting for instruction bus (stays there until not busy)
            with m.State("INSN_START"):
                comb += core_ivalid_i.eq(1)  # instruction is valid
                comb += core_issue_i.eq(1)  # and issued
                sync += pc_changed.eq(0)

                m.next = "INSN_ACTIVE"  # move to "wait completion"

            # instruction started: must wait till it finishes
            with m.State("INSN_ACTIVE"):
                with m.If(insn_type != MicrOp.OP_NOP):
                    comb += core_ivalid_i.eq(1)  # instruction is valid
                # remember if the instruction itself wrote the PC
                with m.If(self.state_nia.wen & (1 << StateRegs.PC)):
                    sync += pc_changed.eq(1)
                with m.If(~core_busy_o):  # instruction done!
                    # ok here we are not reading the branch unit.  TODO
                    # this just blithely overwrites whatever pipeline
                    # updated the PC
                    with m.If(~pc_changed):
                        comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
                        comb += self.state_w_pc.data_i.eq(nia)
                    sync += core.e.eq(0)
                    sync += core.raw_insn_i.eq(0)
                    sync += core.bigendian_i.eq(0)
                    m.next = "INSN_FETCH"  # back to fetch

        # this bit doesn't have to be in the FSM: connect up to read
        # regfiles on demand from DMI
        with m.If(d_reg.req):  # request for regfile access being made
            # TODO: error-check this
            # XXX should this be combinatorial?  sync better?
            if intrf.unary:
                comb += self.int_r.ren.eq(1 << d_reg.addr)
            else:
                comb += self.int_r.addr.eq(d_reg.addr)
                comb += self.int_r.ren.eq(1)
        d_reg_delay = Signal()
        sync += d_reg_delay.eq(d_reg.req)
        with m.If(d_reg_delay):
            # data arrives one clock later
            comb += d_reg.data.eq(self.int_r.data_o)
            comb += d_reg.ack.eq(1)

        # sigh same thing for CR debug
        with m.If(d_cr.req):  # request for regfile access being made
            comb += self.cr_r.ren.eq(0b11111111)  # enable all
        d_cr_delay = Signal()
        sync += d_cr_delay.eq(d_cr.req)
        with m.If(d_cr_delay):
            # data arrives one clock later
            comb += d_cr.data.eq(self.cr_r.data_o)
            comb += d_cr.ack.eq(1)

        # aaand XER...
        with m.If(d_xer.req):  # request for regfile access being made
            comb += self.xer_r.ren.eq(0b111111)  # enable all
        d_xer_delay = Signal()
        sync += d_xer_delay.eq(d_xer.req)
        with m.If(d_xer_delay):
            # data arrives one clock later
            comb += d_xer.data.eq(self.xer_r.data_o)
            comb += d_xer.ack.eq(1)

        # DEC and TB inc/dec FSM
        self.tb_dec_fsm(m, cur_state.dec)

        return m

    def tb_dec_fsm(self, m, spr_dec):
        """tb_dec_fsm

        this is a FSM for updating either dec or tb.  it runs alternately
        DEC, TB, DEC, TB.  note that SPR pipeline could have written a new
        value to DEC, however the regfile has "passthrough" on it so this
        *should* be ok.

        m is the Module the FSM is added to; spr_dec is the signal that
        receives a copy of each newly-written DEC value.  returns m.

        see v3.0B p1097-1099 for Timer Resource and p1065 and p1076
        """

        comb, sync = m.d.comb, m.d.sync
        fast_rf = self.core.regs.rf['fast']
        fast_r_dectb = fast_rf.r_ports['issue']  # DEC/TB
        fast_w_dectb = fast_rf.w_ports['issue']  # DEC/TB

        with m.FSM():

            # initiates read of current DEC
            with m.State("DEC_READ"):
                comb += fast_r_dectb.addr.eq(FastRegs.DEC)
                comb += fast_r_dectb.ren.eq(1)
                m.next = "DEC_WRITE"

            # waits for DEC read to arrive (1 cycle), updates with new value
            with m.State("DEC_WRITE"):
                new_dec = Signal(64)
                # TODO: MSR.LPCR 32-bit decrement mode
                comb += new_dec.eq(fast_r_dectb.data_o - 1)
                comb += fast_w_dectb.addr.eq(FastRegs.DEC)
                comb += fast_w_dectb.wen.eq(1)
                comb += fast_w_dectb.data_i.eq(new_dec)
                sync += spr_dec.eq(new_dec)  # copy into cur_state for decoder
                m.next = "TB_READ"

            # initiates read of current TB
            with m.State("TB_READ"):
                comb += fast_r_dectb.addr.eq(FastRegs.TB)
                comb += fast_r_dectb.ren.eq(1)
                m.next = "TB_WRITE"

            # waits for read TB to arrive, initiates write of current TB
            with m.State("TB_WRITE"):
                new_tb = Signal(64)
                comb += new_tb.eq(fast_r_dectb.data_o + 1)
                comb += fast_w_dectb.addr.eq(FastRegs.TB)
                comb += fast_w_dectb.wen.eq(1)
                comb += fast_w_dectb.data_i.eq(new_tb)
                m.next = "DEC_READ"

        return m

    def __iter__(self):
        """Yield the signals making up the (internal) port list."""
        yield from self.pc_i.ports()
        yield self.pc_o
        yield self.memerr_o
        yield from self.core.ports()
        yield from self.imem.ports()
        yield self.core_bigendian_i
        yield self.busy_o

    def ports(self):
        """Return the signals from __iter__ as a list."""
        return list(self)

    def external_ports(self):
        """Return the list of ports exposed at the top level.

        DMI ports are only included when JTAG is not enabled; the XICS
        and GPIO buses are only included when those units were created.
        """
        # copy: never extend a list that pc_i might itself be holding
        ports = list(self.pc_i.ports())
        ports += [self.pc_o, self.memerr_o, self.core_bigendian_i,
                  self.busy_o]

        if self.jtag_en:
            ports += list(self.jtag.external_ports())
        else:
            # don't add DMI if JTAG is enabled
            ports += list(self.dbg.dmi.ports())

        ports += list(self.imem.ibus.fields.values())
        ports += list(self.core.l0.cmpi.lsmem.lsi.slavebus.fields.values())

        if self.xics:
            ports += list(self.xics_icp.bus.fields.values())
            ports += list(self.xics_ics.bus.fields.values())
            ports.append(self.int_level_i)

        if self.gpio:
            ports += list(self.simple_gpio.bus.fields.values())
            ports.append(self.gpio_o)

        return ports
490
491
class TestIssuer(Elaboratable):
    """Top-level wrapper: TestIssuerInternal plus optional PLL clocking.

    when pspec.use_pll is present and true, the "coresync" clock is
    driven from the PLL output; otherwise it is driven directly from the
    default clock.
    """

    def __init__(self, pspec):
        """Create the internal issuer and the (dummy) PLL.

        the PLL object is always created, but is only added as a
        submodule by elaborate() when pll_en is set.
        """
        self.ti = TestIssuerInternal(pspec)

        self.pll = DummyPLL()

        # PLL direct clock or not
        self.pll_en = hasattr(pspec, "use_pll") and pspec.use_pll
        if self.pll_en:
            self.pll_18_o = Signal(reset_less=True)

    def elaborate(self, platform):
        """Wire up the issuer and select the source of the coresync clock."""
        m = Module()
        comb = m.d.comb

        # TestIssuer runs at direct clock.  (the "coresync" ClockDomain
        # itself is created inside TestIssuerInternal.elaborate: only its
        # clock is driven from here)
        m.submodules.ti = self.ti

        if self.pll_en:
            # ClockSelect runs at PLL output internal clock rate
            m.submodules.pll = pll = self.pll

            # add clock domains from PLL
            cd_pll = ClockDomain("pllclk")
            m.domains += cd_pll

            # PLL clock established: "pllclk" runs at the PLL's speed
            pllclk = ClockSignal("pllclk")
            comb += pllclk.eq(pll.clk_pll_o)

            # wire up external 24mhz to PLL
            comb += pll.clk_24_i.eq(ClockSignal())

            # output 18 mhz PLL test signal
            comb += self.pll_18_o.eq(pll.pll_18_o)

            # now wire up ResetSignals.  don't mind them being in this domain
            pll_rst = ResetSignal("pllclk")
            comb += pll_rst.eq(ResetSignal())

        # internal clock is set to selector clock-out.  has the side-effect
        # of running everything in the "coresync" domain at this speed
        intclk = ClockSignal("coresync")
        if self.pll_en:
            comb += intclk.eq(pll.clk_pll_o)
        else:
            comb += intclk.eq(ClockSignal())

        return m

    def ports(self):
        """Return issuer ports, PLL ports and the default clock/reset."""
        return list(self.ti.ports()) + list(self.pll.ports()) + \
               [ClockSignal(), ResetSignal()]

    def external_ports(self):
        """Return the top-level ports, adding PLL pins when pll_en is set."""
        ports = self.ti.external_ports()
        ports.append(ClockSignal())
        ports.append(ResetSignal())
        if self.pll_en:
            ports.append(self.pll.clk_sel_i)
            ports.append(self.pll_18_o)
            ports.append(self.pll.pll_lck_o)
        return ports
557
558
if __name__ == '__main__':
    # one of each Function Unit type
    units = {
        'alu': 1,
        'cr': 1,
        'branch': 1,
        'trap': 1,
        'logical': 1,
        'spr': 1,
        'div': 1,
        'mul': 1,
        'shiftrot': 1,
    }
    pspec = TestMemPspec(
        ldst_ifacetype='bare_wb',
        imem_ifacetype='bare_wb',
        addr_wid=48,
        mask_wid=8,
        reg_wid=64,
        units=units,
    )
    dut = TestIssuer(pspec)

    # hand over to the nmigen command-line front-end first
    main(dut, ports=dut.ports(), name="test_issuer")

    # no command-line arguments given: write out RTLIL with the
    # external port list instead
    if len(sys.argv) == 1:
        il_text = rtlil.convert(dut, ports=dut.external_ports(),
                                name="test_issuer")
        with open("test_issuer.il", "w") as il_file:
            il_file.write(il_text)