X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fadd%2Fsinglepipe.py;h=96fb86f3e91a46f0f8f52d57458d3f2e9cbb3b2c;hb=f997f92aae9a42a1482c6273a807c1e49fc0ab28;hp=c13d19995bd9cda361c9106a27c6992b0b1d5a70;hpb=c7b2626f89f9c419f14e88e7f87490bf14b88659;p=ieee754fpu.git diff --git a/src/add/singlepipe.py b/src/add/singlepipe.py index c13d1999..96fb86f3 100644 --- a/src/add/singlepipe.py +++ b/src/add/singlepipe.py @@ -1,4 +1,5 @@ -""" Pipeline and BufferedPipeline implementation, conforming to the same API. +""" Pipeline and BufferedHandshake implementation, conforming to the same API. + For multi-input and multi-output variants, see multipipe. eq: -- @@ -33,27 +34,83 @@ the submodules must be combinatorial blocks and must have their inputs and output linked combinatorially. + Both StageCls (for use with non-static classes) and Stage (for use + by static classes) are abstract classes from which, for convenience + and as a courtesy to other developers, anything conforming to the + Stage API may *choose* to derive. + StageChain: ---------- A useful combinatorial wrapper around stages that chains them together - and then presents a Stage-API-conformant interface. + and then presents a Stage-API-conformant interface. By presenting + the same API as the stages it wraps, it can clearly be used recursively. + + RecordBasedStage: + ---------------- + + A convenience class that takes an input shape, output shape, a + "processing" function and an optional "setup" function. Honestly + though, there's not much more effort to just... create a class + that returns a couple of Records (see ExampleAddRecordStage in + examples). + + PassThroughStage: + ---------------- + + A convenience class that takes a single function as a parameter, + that is chain-called to create the exact same input and output spec. + It has a process() function that simply returns its input. + + Instances of this class are completely redundant if handed to + StageChain, however when passed to UnbufferedPipeline they + can be used to introduce a single clock delay. + + ControlBase: + ----------- + + The base class for pipelines. Contains previous and next ready/valid/data. + Also has an extremely useful "connect" function that can be used to + connect a chain of pipelines and present the exact same prev/next + ready/valid/data API. UnbufferedPipeline: ------------------ A simple stalling clock-synchronised pipeline that has no buffering - (unlike BufferedPipeline). A stall anywhere along the line will - result in a stall back-propagating down the entire chain. + (unlike BufferedHandshake). Data flows on *every* clock cycle when + the conditions are right (this is nominally when the input is valid + and the output is ready). - The BufferedPipeline by contrast will buffer incoming data, allowing - previous stages one clock cycle's grace before also having to stall. + A stall anywhere along the line will result in a stall back-propagating + down the entire chain. The BufferedHandshake by contrast will buffer + incoming data, allowing previous stages one clock cycle's grace before + also having to stall. An advantage of the UnbufferedPipeline over the Buffered one is that the amount of logic needed (number of gates) is greatly - reduced. + reduced (no second set of buffers basically) + + The disadvantage of the UnbufferedPipeline is that the valid/ready + logic, if chained together, is *combinatorial*, resulting in + progressively larger gate delay. + + PassThroughHandshake: + ------------------ + + A Control class that introduces a single clock delay, passing its + data through unaltered. Unlike RegisterPipeline (which relies + on UnbufferedPipeline and PassThroughStage) it handles ready/valid + itself. + + RegisterPipeline: + ---------------- + + A convenience class that, because UnbufferedPipeline introduces a single + clock delay, when its stage is a PassThroughStage, it results in a Pipeline + stage that, duh, delays its (unmodified) input by one clock cycle. - BufferedPipeline: + BufferedHandshake: ---------------- nmigen implementation of buffered pipeline stage, based on zipcpu: @@ -99,14 +156,39 @@ dan calls a "buffered handshake". it's quite a complex state machine! + + SimpleHandshake + --------------- + + Synchronised pipeline, Based on: + https://github.com/ZipCPU/dbgbus/blob/master/hexbus/rtl/hbdeword.v """ -from nmigen import Signal, Cat, Const, Mux, Module +from nmigen import Signal, Cat, Const, Mux, Module, Value from nmigen.cli import verilog, rtlil +from nmigen.lib.fifo import SyncFIFO, SyncFIFOBuffered +from nmigen.hdl.ast import ArrayProxy from nmigen.hdl.rec import Record, Layout from abc import ABCMeta, abstractmethod from collections.abc import Sequence +from queue import Queue + + +class RecordObject(Record): + def __init__(self, layout=None, name=None): + Record.__init__(self, layout=layout or [], name=None) + + def __setattr__(self, k, v): + if k in dir(Record) or "fields" not in self.__dict__: + return object.__setattr__(self, k, v) + self.fields[k] = v + if isinstance(v, Record): + newlayout = {k: (k, v.layout)} + else: + newlayout = {k: (k, v.shape())} + self.layout.fields.update(newlayout) + class PrevControl: @@ -118,26 +200,50 @@ class PrevControl: * i_data : an input - added by the user of this class """ - def __init__(self, i_width=1): + def __init__(self, i_width=1, stage_ctl=False): + self.stage_ctl = stage_ctl self.i_valid = Signal(i_width, name="p_i_valid") # prev >>in self - self.o_ready = Signal(name="p_o_ready") # prev < 1: # multi-bit case: valid only when i_valid is all 1s + if vlen > 1: + # multi-bit case: valid only when i_valid is all 1s all1s = Const(-1, (len(self.i_valid), False)) - return self.i_valid == all1s - # single-bit i_valid case - return self.i_valid + i_valid = (self.i_valid == all1s) + else: + # single-bit i_valid case + i_valid = self.i_valid + + # when stage indicates not ready, incoming data + # must "appear" to be not ready too + if self.stage_ctl: + i_valid = i_valid & self.s_o_ready + + return i_valid class NextControl: @@ -146,9 +252,19 @@ class NextControl: * i_ready: input from next stage indicating that it can accept data * o_data : an output - added by the user of this class """ - def __init__(self): + def __init__(self, stage_ctl=False): + self.stage_ctl = stage_ctl self.o_valid = Signal(name="n_o_valid") # self out>> next self.i_ready = Signal(name="n_i_ready") # self <>in stage n.o_valid out>> stage+1 + stage-1 p.o_ready <>in stage n.o_data out>> stage+1 + | | + +--process->--^ + Truth Table + + Inputs Temporary Output Data + ------- ---------- ----- ---- + P P N N PiV& ~NiR& N P + i o i o PoR NoV o o + V R R V V R + + ------- - - - - + 0 0 0 0 0 0 >0 0 reg + 0 0 0 1 0 1 >1 0 reg + 0 0 1 0 0 0 0 1 process(i_data) + 0 0 1 1 0 0 0 1 process(i_data) + ------- - - - - + 0 1 0 0 0 0 >0 0 reg + 0 1 0 1 0 1 >1 0 reg + 0 1 1 0 0 0 0 1 process(i_data) + 0 1 1 1 0 0 0 1 process(i_data) + ------- - - - - + 1 0 0 0 0 0 >0 0 reg + 1 0 0 1 0 1 >1 0 reg + 1 0 1 0 0 0 0 1 process(i_data) + 1 0 1 1 0 0 0 1 process(i_data) + ------- - - - - + 1 1 0 0 1 0 1 0 process(i_data) + 1 1 0 1 1 1 1 0 process(i_data) + 1 1 1 0 1 0 1 1 process(i_data) + 1 1 1 1 1 0 1 1 process(i_data) + ------- - - - - + """ + + def elaborate(self, platform): + self.m = m = ControlBase._elaborate(self, platform) + + r_busy = Signal() + result = self.stage.ospec() + + # establish some combinatorial temporaries + n_i_ready = Signal(reset_less=True, name="n_i_rdy_data") + p_i_valid_p_o_ready = Signal(reset_less=True) + p_i_valid = Signal(reset_less=True) + m.d.comb += [p_i_valid.eq(self.p.i_valid_test), + n_i_ready.eq(self.n.i_ready_test), + p_i_valid_p_o_ready.eq(p_i_valid & self.p.o_ready), + ] + + # store result of processing in combinatorial temporary + m.d.comb += eq(result, self.stage.process(self.p.i_data)) + + # previous valid and ready + with m.If(p_i_valid_p_o_ready): + o_data = self._postprocess(result) + m.d.sync += [r_busy.eq(1), # output valid + eq(self.n.o_data, o_data), # update output ] + # previous invalid or not ready, however next is accepting + with m.Elif(n_i_ready): + o_data = self._postprocess(result) + m.d.sync += [eq(self.n.o_data, o_data)] + # TODO: could still send data here (if there was any) + #m.d.sync += self.n.o_valid.eq(0) # ...so set output invalid + m.d.sync += r_busy.eq(0) # ...so set output invalid - # (n.i_ready) false and (n.o_valid) true: - with m.Elif(i_p_valid_o_p_ready): - # If next stage *is* ready, and not stalled yet, accept input - m.d.sync += self.p.o_ready.eq(~(p_i_valid & self.n.o_valid)) + m.d.comb += self.n.o_valid.eq(r_busy) + # if next is ready, so is previous + m.d.comb += self.p._o_ready.eq(n_i_ready) - return m + return self.m class UnbufferedPipeline(ControlBase): @@ -502,7 +845,7 @@ class UnbufferedPipeline(ControlBase): Note that a stall in one stage will result in the entire pipeline chain stalling. - Also that unlike BufferedPipeline, the valid/ready signalling does NOT + Also that unlike BufferedHandshake, the valid/ready signalling does NOT travel synchronously with the data: the valid/ready signalling combines in a *combinatorial* fashion. Therefore, a long pipeline chain will lengthen propagation delays. @@ -530,49 +873,217 @@ class UnbufferedPipeline(ControlBase): result: output_shape according to ospec The output of the combinatorial logic. it is updated COMBINATORIALLY (no clock dependence). + + Truth Table + + Inputs Temp Output Data + ------- - ----- ---- + P P N N ~NiR& N P + i o i o NoV o o + V R R V V R + + ------- - - - + 0 0 0 0 0 0 1 reg + 0 0 0 1 1 1 0 reg + 0 0 1 0 0 0 1 reg + 0 0 1 1 0 0 1 reg + ------- - - - + 0 1 0 0 0 0 1 reg + 0 1 0 1 1 1 0 reg + 0 1 1 0 0 0 1 reg + 0 1 1 1 0 0 1 reg + ------- - - - + 1 0 0 0 0 1 1 reg + 1 0 0 1 1 1 0 reg + 1 0 1 0 0 1 1 reg + 1 0 1 1 0 1 1 reg + ------- - - - + 1 1 0 0 0 1 1 process(i_data) + 1 1 0 1 1 1 0 process(i_data) + 1 1 1 0 0 1 1 process(i_data) + 1 1 1 1 0 1 1 process(i_data) + ------- - - - + + Note: PoR is *NOT* involved in the above decision-making. """ - def __init__(self, stage): - ControlBase.__init__(self) - self.stage = stage - self._data_valid = Signal() + def elaborate(self, platform): + self.m = m = ControlBase._elaborate(self, platform) - # set up the input and output data - self.p.i_data = stage.ispec() # input type - self.n.o_data = stage.ospec() # output type + data_valid = Signal() # is data valid or not + r_data = self.stage.ospec() # output type + + # some temporaries + p_i_valid = Signal(reset_less=True) + pv = Signal(reset_less=True) + buf_full = Signal(reset_less=True) + m.d.comb += p_i_valid.eq(self.p.i_valid_test) + m.d.comb += pv.eq(self.p.i_valid & self.p.o_ready) + m.d.comb += buf_full.eq(~self.n.i_ready_test & data_valid) + + m.d.comb += self.n.o_valid.eq(data_valid) + m.d.comb += self.p._o_ready.eq(~data_valid | self.n.i_ready_test) + m.d.sync += data_valid.eq(p_i_valid | buf_full) + + with m.If(pv): + m.d.sync += eq(r_data, self.stage.process(self.p.i_data)) + o_data = self._postprocess(r_data) + m.d.comb += eq(self.n.o_data, o_data) + + return self.m + + +class UnbufferedPipeline2(ControlBase): + """ A simple pipeline stage with single-clock synchronisation + and two-way valid/ready synchronised signalling. + + Note that a stall in one stage will result in the entire pipeline + chain stalling. + + Also that unlike BufferedHandshake, the valid/ready signalling does NOT + travel synchronously with the data: the valid/ready signalling + combines in a *combinatorial* fashion. Therefore, a long pipeline + chain will lengthen propagation delays. + + Argument: stage. see Stage API, above + + stage-1 p.i_valid >>in stage n.o_valid out>> stage+1 + stage-1 p.o_ready <>in stage n.o_data out>> stage+1 + | | | + +- process-> buf <-+ + Attributes: + ----------- + p.i_data : StageInput, shaped according to ispec + The pipeline input + p.o_data : StageOutput, shaped according to ospec + The pipeline output + buf : output_shape according to ospec + A temporary (buffered) copy of a valid output + This is HELD if the output is not ready. It is updated + SYNCHRONOUSLY. + + Inputs Temp Output Data + ------- - ----- + P P N N ~NiR& N P (buf_full) + i o i o NoV o o + V R R V V R + + ------- - - - + 0 0 0 0 0 0 1 process(i_data) + 0 0 0 1 1 1 0 reg (odata, unchanged) + 0 0 1 0 0 0 1 process(i_data) + 0 0 1 1 0 0 1 process(i_data) + ------- - - - + 0 1 0 0 0 0 1 process(i_data) + 0 1 0 1 1 1 0 reg (odata, unchanged) + 0 1 1 0 0 0 1 process(i_data) + 0 1 1 1 0 0 1 process(i_data) + ------- - - - + 1 0 0 0 0 1 1 process(i_data) + 1 0 0 1 1 1 0 reg (odata, unchanged) + 1 0 1 0 0 1 1 process(i_data) + 1 0 1 1 0 1 1 process(i_data) + ------- - - - + 1 1 0 0 0 1 1 process(i_data) + 1 1 0 1 1 1 0 reg (odata, unchanged) + 1 1 1 0 0 1 1 process(i_data) + 1 1 1 1 0 1 1 process(i_data) + ------- - - - + + Note: PoR is *NOT* involved in the above decision-making. + """ def elaborate(self, platform): - m = Module() + self.m = m = ControlBase._elaborate(self, platform) - r_data = self.stage.ispec() # input type - result = self.stage.ospec() # output data - if hasattr(self.stage, "setup"): - self.stage.setup(m, r_data) + buf_full = Signal() # is data valid or not + buf = self.stage.ospec() # output type + # some temporaries p_i_valid = Signal(reset_less=True) - m.d.comb += p_i_valid.eq(self.p.i_valid_logic()) - m.d.comb += eq(result, self.stage.process(r_data)) - m.d.comb += self.n.o_valid.eq(self._data_valid) - m.d.comb += self.p.o_ready.eq(~self._data_valid | self.n.i_ready) - m.d.sync += self._data_valid.eq(p_i_valid | \ - (~self.n.i_ready & self._data_valid)) - with m.If(self.p.i_valid & self.p.o_ready): - m.d.sync += eq(r_data, self.p.i_data) - m.d.comb += eq(self.n.o_data, result) - return m + m.d.comb += p_i_valid.eq(self.p.i_valid_test) + + m.d.comb += self.n.o_valid.eq(buf_full | p_i_valid) + m.d.comb += self.p._o_ready.eq(~buf_full) + m.d.sync += buf_full.eq(~self.n.i_ready_test & self.n.o_valid) + + o_data = Mux(buf_full, buf, self.stage.process(self.p.i_data)) + if hasattr(self.stage, "postprocess"): + o_data = self.stage.postprocess(o_data) + m.d.comb += eq(self.n.o_data, o_data) + m.d.sync += eq(buf, self.n.o_data) + + return self.m class PassThroughStage(StageCls): """ a pass-through stage which has its input data spec equal to its output, and "passes through" its data from input to output. """ - def __init__(self, iospec): + def __init__(self, iospecfn): self.iospecfn = iospecfn def ispec(self): return self.iospecfn() def ospec(self): return self.iospecfn() def process(self, i): return i +class PassThroughHandshake(ControlBase): + """ A control block that delays by one clock cycle. + + Inputs Temporary Output Data + ------- ------------------ ----- ---- + P P N N PiV& PiV| NiR| pvr N P (pvr) + i o i o PoR ~PoR ~NoV o o + V R R V V R + + ------- - - - - - - + 0 0 0 0 0 1 1 0 1 1 odata (unchanged) + 0 0 0 1 0 1 0 0 1 0 odata (unchanged) + 0 0 1 0 0 1 1 0 1 1 odata (unchanged) + 0 0 1 1 0 1 1 0 1 1 odata (unchanged) + ------- - - - - - - + 0 1 0 0 0 0 1 0 0 1 odata (unchanged) + 0 1 0 1 0 0 0 0 0 0 odata (unchanged) + 0 1 1 0 0 0 1 0 0 1 odata (unchanged) + 0 1 1 1 0 0 1 0 0 1 odata (unchanged) + ------- - - - - - - + 1 0 0 0 0 1 1 1 1 1 process(in) + 1 0 0 1 0 1 0 0 1 0 odata (unchanged) + 1 0 1 0 0 1 1 1 1 1 process(in) + 1 0 1 1 0 1 1 1 1 1 process(in) + ------- - - - - - - + 1 1 0 0 1 1 1 1 1 1 process(in) + 1 1 0 1 1 1 0 0 1 0 odata (unchanged) + 1 1 1 0 1 1 1 1 1 1 process(in) + 1 1 1 1 1 1 1 1 1 1 process(in) + ------- - - - - - - + + """ + + def elaborate(self, platform): + self.m = m = ControlBase._elaborate(self, platform) + + r_data = self.stage.ospec() # output type + + # temporaries + p_i_valid = Signal(reset_less=True) + pvr = Signal(reset_less=True) + m.d.comb += p_i_valid.eq(self.p.i_valid_test) + m.d.comb += pvr.eq(p_i_valid & self.p.o_ready) + + m.d.comb += self.p.o_ready.eq(~self.n.o_valid | self.n.i_ready_test) + m.d.sync += self.n.o_valid.eq(p_i_valid | ~self.p.o_ready) + + odata = Mux(pvr, self.stage.process(self.p.i_data), r_data) + m.d.sync += eq(r_data, odata) + if hasattr(self.stage, "postprocess"): + r_data = self.stage.postprocess(r_data) + m.d.comb += eq(self.n.o_data, r_data) + + return m + + class RegisterPipeline(UnbufferedPipeline): """ A pipeline stage that delays by one clock cycle, creating a sync'd latch out of o_data and o_valid as an indirect byproduct @@ -581,3 +1092,88 @@ class RegisterPipeline(UnbufferedPipeline): def __init__(self, iospecfn): UnbufferedPipeline.__init__(self, PassThroughStage(iospecfn)) + +class FIFOControl(ControlBase): + """ FIFO Control. Uses SyncFIFO to store data, coincidentally + happens to have same valid/ready signalling as Stage API. + + i_data -> fifo.din -> FIFO -> fifo.dout -> o_data + """ + + def __init__(self, depth, stage, in_multi=None, stage_ctl=False, + fwft=True, buffered=False): + """ FIFO Control + + * depth: number of entries in the FIFO + * stage: data processing block + * fwft : first word fall-thru mode (non-fwft introduces delay) + * buffered: use buffered FIFO (introduces extra cycle delay) + + NOTE 1: FPGAs may have trouble with the defaults for SyncFIFO + (fwft=True, buffered=False) + + NOTE 2: i_data *must* have a shape function. it can therefore + be a Signal, or a Record, or a RecordObject. + + data is processed (and located) as follows: + + self.p self.stage temp fn temp fn temp fp self.n + i_data->process()->result->flatten->din.FIFO.dout->flatten(o_data) + + yes, really: flatten produces a Cat() which can be assigned to. + this is how the FIFO gets de-flattened without needing a de-flatten + function + """ + + assert not (fwft and buffered), "buffered cannot do fwft" + if buffered: + depth += 1 + self.fwft = fwft + self.buffered = buffered + self.fdepth = depth + ControlBase.__init__(self, stage, in_multi, stage_ctl) + + def elaborate(self, platform): + self.m = m = ControlBase._elaborate(self, platform) + + # make a FIFO with a signal of equal width to the o_data. + (fwidth, _) = self.n.o_data.shape() + if self.buffered: + fifo = SyncFIFOBuffered(fwidth, self.fdepth) + else: + fifo = Queue(fwidth, self.fdepth, fwft=self.fwft) + m.submodules.fifo = fifo + + # store result of processing in combinatorial temporary + result = self.stage.ospec() + m.d.comb += eq(result, self.stage.process(self.p.i_data)) + + # connect previous rdy/valid/data - do flatten on i_data + # NOTE: cannot do the PrevControl-looking trick because + # of need to process the data. shaaaame.... + m.d.comb += [fifo.we.eq(self.p.i_valid_test), + self.p.o_ready.eq(fifo.writable), + eq(fifo.din, flatten(result)), + ] + + # connect next rdy/valid/data - do flatten on o_data + connections = [self.n.o_valid.eq(fifo.readable), + fifo.re.eq(self.n.i_ready_test), + ] + if self.fwft or self.buffered: + m.d.comb += connections + else: + m.d.sync += connections # unbuffered fwft mode needs sync + o_data = flatten(self.n.o_data).eq(fifo.dout) + if hasattr(self.stage, "postprocess"): + o_data = self.stage.postprocess(o_data) + m.d.comb += o_data + + return m + +""" +class BufferedHandshake(FIFOControl): + def __init__(self, stage, in_multi=None, stage_ctl=False): + FIFOControl.__init__(self, 2, stage, in_multi, stage_ctl, + fwft=True, buffered=False) +"""