From: Luke Kenneth Casson Leighton Date: Wed, 27 Mar 2019 09:17:16 +0000 (+0000) Subject: split out pipeline classes into singlepipe.py X-Git-Tag: ls180-24jan2020~1475 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=8b1f314d3150b42787aaceea54ce06fb07b1a20e;p=ieee754fpu.git split out pipeline classes into singlepipe.py --- diff --git a/src/add/example_buf_pipe.py b/src/add/example_buf_pipe.py index 2e8e21d6..92575a73 100644 --- a/src/add/example_buf_pipe.py +++ b/src/add/example_buf_pipe.py @@ -1,106 +1,10 @@ -""" Pipeline and BufferedPipeline implementation, conforming to the same API. - - eq: - -- - - a strategically very important function that is identical in function - to nmigen's Signal.eq function, except it may take objects, or a list - of objects, or a tuple of objects, and where objects may also be - Records. - - Stage API: - --------- - - stage requires compliance with a strict API that may be - implemented in several means, including as a static class. - the methods of a stage instance must be as follows: - - * ispec() - Input data format specification - returns an object or a list or tuple of objects, or - a Record, each object having an "eq" function which - takes responsibility for copying by assignment all - sub-objects - * ospec() - Output data format specification - requirements as for ospec - * process(m, i) - Processes an ispec-formatted object - returns a combinatorial block of a result that - may be assigned to the output, by way of the "eq" - function - * setup(m, i) - Optional function for setting up submodules - may be used for more complex stages, to link - the input (i) to submodules. must take responsibility - for adding those submodules to the module (m). - the submodules must be combinatorial blocks and - must have their inputs and output linked combinatorially. - - StageChain: - ---------- - - A useful combinatorial wrapper around stages that chains them together - and then presents a Stage-API-conformant interface. 
- - UnbufferedPipeline: - ------------------ - - A simple stalling clock-synchronised pipeline that has no buffering - (unlike BufferedPipeline). A stall anywhere along the line will - result in a stall back-propagating down the entire chain. - - The BufferedPipeline by contrast will buffer incoming data, allowing - previous stages one clock cycle's grace before also having to stall. - - An advantage of the UnbufferedPipeline over the Buffered one is - that the amount of logic needed (number of gates) is greatly - reduced. - - BufferedPipeline: - ---------------- - - nmigen implementation of buffered pipeline stage, based on zipcpu: - https://zipcpu.com/blog/2017/08/14/strategies-for-pipelining.html - - this module requires quite a bit of thought to understand how it works - (and why it is needed in the first place). reading the above is - *strongly* recommended. - - unlike john dawson's IEEE754 FPU STB/ACK signalling, which requires - the STB / ACK signals to raise and lower (on separate clocks) before - data may proceeed (thus only allowing one piece of data to proceed - on *ALTERNATE* cycles), the signalling here is a true pipeline - where data will flow on *every* clock when the conditions are right. - - input acceptance conditions are when: - * incoming previous-stage strobe (p.i_valid) is HIGH - * outgoing previous-stage ready (p.o_ready) is LOW - - output transmission conditions are when: - * outgoing next-stage strobe (n.o_valid) is HIGH - * outgoing next-stage ready (n.i_ready) is LOW - - the tricky bit is when the input has valid data and the output is not - ready to accept it. if it wasn't for the clock synchronisation, it - would be possible to tell the input "hey don't send that data, we're - not ready". unfortunately, it's not possible to "change the past": - the previous stage *has no choice* but to pass on its data. 
- - therefore, the incoming data *must* be accepted - and stored: that - is the responsibility / contract that this stage *must* accept. - on the same clock, it's possible to tell the input that it must - not send any more data. this is the "stall" condition. - - we now effectively have *two* possible pieces of data to "choose" from: - the buffered data, and the incoming data. the decision as to which - to process and output is based on whether we are in "stall" or not. - i.e. when the next stage is no longer ready, the output comes from - the buffer if a stall had previously occurred, otherwise it comes - direct from processing the input. - - this allows us to respect a synchronous "travelling STB" with what - dan calls a "buffered handshake". - - it's quite a complex state machine! +""" Pipeline and BufferedPipeline examples """ +from singlepipe import (PrevControl, NextControl, ControlBase, + StageCls, Stage, StageChain, + BufferedPipeline, UnbufferedPipeline, eq) + from nmigen import Signal, Cat, Const, Mux, Module from nmigen.cli import verilog, rtlil from nmigen.hdl.rec import Record, Layout @@ -109,378 +13,6 @@ from abc import ABCMeta, abstractmethod from collections.abc import Sequence -class PrevControl: - """ contains signals that come *from* the previous stage (both in and out) - * i_valid: previous stage indicating all incoming data is valid. - may be a multi-bit signal, where all bits are required - to be asserted to indicate "valid". 
- * o_ready: output to next stage indicating readiness to accept data - * i_data : an input - added by the user of this class - """ - - def __init__(self, i_width=1): - self.i_valid = Signal(i_width, name="p_i_valid") # prev >>in self - self.o_ready = Signal(name="p_o_ready") # prev < 1: # multi-bit case: valid only when i_valid is all 1s - all1s = Const(-1, (len(self.i_valid), False)) - return self.i_valid == all1s - # single-bit i_valid case - return self.i_valid - - -class NextControl: - """ contains the signals that go *to* the next stage (both in and out) - * o_valid: output indicating to next stage that data is valid - * i_ready: input from next stage indicating that it can accept data - * o_data : an output - added by the user of this class - """ - def __init__(self): - self.o_valid = Signal(name="n_o_valid") # self out>> next - self.i_ready = Signal(name="n_i_ready") # self < self <---> out - | ^ - v | - [pipe1, pipe2, pipe3, pipe4] - | ^ | ^ | ^ - v | v | v | - out---in out--in out---in - - Also takes care of allocating i_data/o_data, by looking up - the data spec for each end of the pipechain. i.e It is NOT - necessary to allocate self.p.i_data or self.n.o_data manually: - this is handled AUTOMATICALLY, here. - - Basically this function is the direct equivalent of StageChain, - except that unlike StageChain, the Pipeline logic is followed. - - Just as StageChain presents an object that conforms to the - Stage API from a list of objects that also conform to the - Stage API, an object that calls this Pipeline connect function - has the exact same pipeline API as the list of pipline objects - it is called with. - - Thus it becomes possible to build up larger chains recursively. - More complex chains (multi-input, multi-output) will have to be - done manually. 
- """ - eqs = [] # collated list of assignment statements - - # connect inter-chain - for i in range(len(pipechain)-1): - pipe1 = pipechain[i] - pipe2 = pipechain[i+1] - eqs += pipe1.connect_to_next(pipe2) - - # connect front of chain to ourselves - front = pipechain[0] - self.p.i_data = front.stage.ispec() - eqs += front._connect_in(self) - - # connect end of chain to ourselves - end = pipechain[-1] - self.n.o_data = end.stage.ospec() - eqs += end._connect_out(self) - - # activate the assignments - m.d.comb += eqs - - def set_input(self, i): - """ helper function to set the input data - """ - return eq(self.p.i_data, i) - - def ports(self): - return [self.p.i_valid, self.n.i_ready, - self.n.o_valid, self.p.o_ready, - self.p.i_data, self.n.o_data # XXX need flattening! - ] - - -class BufferedPipeline(ControlBase): - """ buffered pipeline stage. data and strobe signals travel in sync. - if ever the input is ready and the output is not, processed data - is shunted in a temporary register. - - Argument: stage. see Stage API above - - stage-1 p.i_valid >>in stage n.o_valid out>> stage+1 - stage-1 p.o_ready <>in stage n.o_data out>> stage+1 - | | - process --->----^ - | | - +-- r_data ->-+ - - input data p.i_data is read (only), is processed and goes into an - intermediate result store [process()]. this is updated combinatorially. - - in a non-stall condition, the intermediate result will go into the - output (update_output). however if ever there is a stall, it goes - into r_data instead [update_buffer()]. - - when the non-stall condition is released, r_data is the first - to be transferred to the output [flush_buffer()], and the stall - condition cleared. - - on the next cycle (as long as stall is not raised again) the - input may begin to be processed and transferred directly to output. 
- - """ - def __init__(self, stage): - ControlBase.__init__(self) - self.stage = stage - - # set up the input and output data - self.p.i_data = stage.ispec() # input type - self.n.o_data = stage.ospec() - - def elaborate(self, platform): - m = Module() - - result = self.stage.ospec() - r_data = self.stage.ospec() - if hasattr(self.stage, "setup"): - self.stage.setup(m, self.p.i_data) - - # establish some combinatorial temporaries - o_n_validn = Signal(reset_less=True) - i_p_valid_o_p_ready = Signal(reset_less=True) - p_i_valid = Signal(reset_less=True) - m.d.comb += [p_i_valid.eq(self.p.i_valid_logic()), - o_n_validn.eq(~self.n.o_valid), - i_p_valid_o_p_ready.eq(p_i_valid & self.p.o_ready), - ] - - # store result of processing in combinatorial temporary - m.d.comb += eq(result, self.stage.process(self.p.i_data)) - - # if not in stall condition, update the temporary register - with m.If(self.p.o_ready): # not stalled - m.d.sync += eq(r_data, result) # update buffer - - with m.If(self.n.i_ready): # next stage is ready - with m.If(self.p.o_ready): # not stalled - # nothing in buffer: send (processed) input direct to output - m.d.sync += [self.n.o_valid.eq(p_i_valid), - eq(self.n.o_data, result), # update output - ] - with m.Else(): # p.o_ready is false, and something is in buffer. - # Flush the [already processed] buffer to the output port. - m.d.sync += [self.n.o_valid.eq(1), # declare reg empty - eq(self.n.o_data, r_data), # flush buffer - self.p.o_ready.eq(1), # clear stall condition - ] - # ignore input, since p.o_ready is also false. 
- - # (n.i_ready) is false here: next stage is ready - with m.Elif(o_n_validn): # next stage being told "ready" - m.d.sync += [self.n.o_valid.eq(p_i_valid), - self.p.o_ready.eq(1), # Keep the buffer empty - eq(self.n.o_data, result), # set output data - ] - - # (n.i_ready) false and (n.o_valid) true: - with m.Elif(i_p_valid_o_p_ready): - # If next stage *is* ready, and not stalled yet, accept input - m.d.sync += self.p.o_ready.eq(~(p_i_valid & self.n.o_valid)) - - return m - - class ExampleAddStage(StageCls): """ an example of how to use the buffered pipeline, as a class instance """ @@ -553,75 +85,8 @@ class ExampleBufPipe(BufferedPipeline): BufferedPipeline.__init__(self, ExampleStage) -class UnbufferedPipeline(ControlBase): - """ A simple pipeline stage with single-clock synchronisation - and two-way valid/ready synchronised signalling. - - Note that a stall in one stage will result in the entire pipeline - chain stalling. - - Also that unlike BufferedPipeline, the valid/ready signalling does NOT - travel synchronously with the data: the valid/ready signalling - combines in a *combinatorial* fashion. Therefore, a long pipeline - chain will lengthen propagation delays. - - Argument: stage. see Stage API, above - - stage-1 p.i_valid >>in stage n.o_valid out>> stage+1 - stage-1 p.o_ready <>in stage n.o_data out>> stage+1 - | | - r_data result - | | - +--process ->-+ - - Attributes: - ----------- - p.i_data : StageInput, shaped according to ispec - The pipeline input - p.o_data : StageOutput, shaped according to ospec - The pipeline output - r_data : input_shape according to ispec - A temporary (buffered) copy of a prior (valid) input. - This is HELD if the output is not ready. It is updated - SYNCHRONOUSLY. - result: output_shape according to ospec - The output of the combinatorial logic. it is updated - COMBINATORIALLY (no clock dependence). 
- """ - - def __init__(self, stage): - ControlBase.__init__(self) - self.stage = stage - self._data_valid = Signal() - - # set up the input and output data - self.p.i_data = stage.ispec() # input type - self.n.o_data = stage.ospec() # output type - - def elaborate(self, platform): - m = Module() - - r_data = self.stage.ispec() # input type - result = self.stage.ospec() # output data - if hasattr(self.stage, "setup"): - self.stage.setup(m, r_data) - - p_i_valid = Signal(reset_less=True) - m.d.comb += p_i_valid.eq(self.p.i_valid_logic()) - m.d.comb += eq(result, self.stage.process(r_data)) - m.d.comb += self.n.o_valid.eq(self._data_valid) - m.d.comb += self.p.o_ready.eq(~self._data_valid | self.n.i_ready) - m.d.sync += self._data_valid.eq(p_i_valid | \ - (~self.n.i_ready & self._data_valid)) - with m.If(self.p.i_valid & self.p.o_ready): - m.d.sync += eq(r_data, self.p.i_data) - m.d.comb += eq(self.n.o_data, result) - return m - - class ExamplePipeline(UnbufferedPipeline): - """ an example of how to use the combinatorial pipeline. + """ an example of how to use the unbuffered pipeline. """ def __init__(self): diff --git a/src/add/singlepipe.py b/src/add/singlepipe.py new file mode 100644 index 00000000..38175dd2 --- /dev/null +++ b/src/add/singlepipe.py @@ -0,0 +1,549 @@ +""" Pipeline and BufferedPipeline implementation, conforming to the same API. + + eq: + -- + + a strategically very important function that is identical in function + to nmigen's Signal.eq function, except it may take objects, or a list + of objects, or a tuple of objects, and where objects may also be + Records. + + Stage API: + --------- + + stage requires compliance with a strict API that may be + implemented in several means, including as a static class. 
+ the methods of a stage instance must be as follows: + + * ispec() - Input data format specification + returns an object or a list or tuple of objects, or + a Record, each object having an "eq" function which + takes responsibility for copying by assignment all + sub-objects + * ospec() - Output data format specification + requirements as for ispec + * process(i) - Processes an ispec-formatted object + returns a combinatorial block of a result that + may be assigned to the output, by way of the "eq" + function + * setup(m, i) - Optional function for setting up submodules + may be used for more complex stages, to link + the input (i) to submodules. must take responsibility + for adding those submodules to the module (m). + the submodules must be combinatorial blocks and + must have their inputs and output linked combinatorially. + + StageChain: + ---------- + + A useful combinatorial wrapper around stages that chains them together + and then presents a Stage-API-conformant interface. + + UnbufferedPipeline: + ------------------ + + A simple stalling clock-synchronised pipeline that has no buffering + (unlike BufferedPipeline). A stall anywhere along the line will + result in a stall back-propagating down the entire chain. + + The BufferedPipeline by contrast will buffer incoming data, allowing + previous stages one clock cycle's grace before also having to stall. + + An advantage of the UnbufferedPipeline over the Buffered one is + that the amount of logic needed (number of gates) is greatly + reduced. + + BufferedPipeline: + ---------------- + + nmigen implementation of buffered pipeline stage, based on zipcpu: + https://zipcpu.com/blog/2017/08/14/strategies-for-pipelining.html + + this module requires quite a bit of thought to understand how it works + (and why it is needed in the first place). reading the above is + *strongly* recommended. 
+ + unlike john dawson's IEEE754 FPU STB/ACK signalling, which requires + the STB / ACK signals to raise and lower (on separate clocks) before + data may proceed (thus only allowing one piece of data to proceed + on *ALTERNATE* cycles), the signalling here is a true pipeline + where data will flow on *every* clock when the conditions are right. + + input acceptance conditions are when: + * incoming previous-stage strobe (p.i_valid) is HIGH + * outgoing previous-stage ready (p.o_ready) is LOW + + output transmission conditions are when: + * outgoing next-stage strobe (n.o_valid) is HIGH + * incoming next-stage ready (n.i_ready) is LOW + + the tricky bit is when the input has valid data and the output is not + ready to accept it. if it wasn't for the clock synchronisation, it + would be possible to tell the input "hey don't send that data, we're + not ready". unfortunately, it's not possible to "change the past": + the previous stage *has no choice* but to pass on its data. + + therefore, the incoming data *must* be accepted - and stored: that + is the responsibility / contract that this stage *must* accept. + on the same clock, it's possible to tell the input that it must + not send any more data. this is the "stall" condition. + + we now effectively have *two* possible pieces of data to "choose" from: + the buffered data, and the incoming data. the decision as to which + to process and output is based on whether we are in "stall" or not. + i.e. when the next stage is no longer ready, the output comes from + the buffer if a stall had previously occurred, otherwise it comes + direct from processing the input. + + this allows us to respect a synchronous "travelling STB" with what + dan calls a "buffered handshake". + + it's quite a complex state machine! 
+""" + +from nmigen import Signal, Cat, Const, Mux, Module +from nmigen.cli import verilog, rtlil +from nmigen.hdl.rec import Record, Layout + +from abc import ABCMeta, abstractmethod +from collections.abc import Sequence + + +class PrevControl: + """ contains signals that come *from* the previous stage (both in and out) + * i_valid: previous stage indicating all incoming data is valid. + may be a multi-bit signal, where all bits are required + to be asserted to indicate "valid". + * o_ready: output to next stage indicating readiness to accept data + * i_data : an input - added by the user of this class + """ + + def __init__(self, i_width=1): + self.i_valid = Signal(i_width, name="p_i_valid") # prev >>in self + self.o_ready = Signal(name="p_o_ready") # prev < 1: # multi-bit case: valid only when i_valid is all 1s + all1s = Const(-1, (len(self.i_valid), False)) + return self.i_valid == all1s + # single-bit i_valid case + return self.i_valid + + +class NextControl: + """ contains the signals that go *to* the next stage (both in and out) + * o_valid: output indicating to next stage that data is valid + * i_ready: input from next stage indicating that it can accept data + * o_data : an output - added by the user of this class + """ + def __init__(self): + self.o_valid = Signal(name="n_o_valid") # self out>> next + self.i_ready = Signal(name="n_i_ready") # self < self <---> out + | ^ + v | + [pipe1, pipe2, pipe3, pipe4] + | ^ | ^ | ^ + v | v | v | + out---in out--in out---in + + Also takes care of allocating i_data/o_data, by looking up + the data spec for each end of the pipechain. i.e It is NOT + necessary to allocate self.p.i_data or self.n.o_data manually: + this is handled AUTOMATICALLY, here. + + Basically this function is the direct equivalent of StageChain, + except that unlike StageChain, the Pipeline logic is followed. 
+ + Just as StageChain presents an object that conforms to the + Stage API from a list of objects that also conform to the + Stage API, an object that calls this Pipeline connect function + has the exact same pipeline API as the list of pipline objects + it is called with. + + Thus it becomes possible to build up larger chains recursively. + More complex chains (multi-input, multi-output) will have to be + done manually. + """ + eqs = [] # collated list of assignment statements + + # connect inter-chain + for i in range(len(pipechain)-1): + pipe1 = pipechain[i] + pipe2 = pipechain[i+1] + eqs += pipe1.connect_to_next(pipe2) + + # connect front of chain to ourselves + front = pipechain[0] + self.p.i_data = front.stage.ispec() + eqs += front._connect_in(self) + + # connect end of chain to ourselves + end = pipechain[-1] + self.n.o_data = end.stage.ospec() + eqs += end._connect_out(self) + + # activate the assignments + m.d.comb += eqs + + def set_input(self, i): + """ helper function to set the input data + """ + return eq(self.p.i_data, i) + + def ports(self): + return [self.p.i_valid, self.n.i_ready, + self.n.o_valid, self.p.o_ready, + self.p.i_data, self.n.o_data # XXX need flattening! + ] + + +class BufferedPipeline(ControlBase): + """ buffered pipeline stage. data and strobe signals travel in sync. + if ever the input is ready and the output is not, processed data + is shunted in a temporary register. + + Argument: stage. see Stage API above + + stage-1 p.i_valid >>in stage n.o_valid out>> stage+1 + stage-1 p.o_ready <>in stage n.o_data out>> stage+1 + | | + process --->----^ + | | + +-- r_data ->-+ + + input data p.i_data is read (only), is processed and goes into an + intermediate result store [process()]. this is updated combinatorially. + + in a non-stall condition, the intermediate result will go into the + output (update_output). however if ever there is a stall, it goes + into r_data instead [update_buffer()]. 
+ + when the non-stall condition is released, r_data is the first + to be transferred to the output [flush_buffer()], and the stall + condition cleared. + + on the next cycle (as long as stall is not raised again) the + input may begin to be processed and transferred directly to output. + + """ + def __init__(self, stage): + ControlBase.__init__(self) + self.stage = stage + + # set up the input and output data + self.p.i_data = stage.ispec() # input type + self.n.o_data = stage.ospec() + + def elaborate(self, platform): + m = Module() + + result = self.stage.ospec() + r_data = self.stage.ospec() + if hasattr(self.stage, "setup"): + self.stage.setup(m, self.p.i_data) + + # establish some combinatorial temporaries + o_n_validn = Signal(reset_less=True) + i_p_valid_o_p_ready = Signal(reset_less=True) + p_i_valid = Signal(reset_less=True) + m.d.comb += [p_i_valid.eq(self.p.i_valid_logic()), + o_n_validn.eq(~self.n.o_valid), + i_p_valid_o_p_ready.eq(p_i_valid & self.p.o_ready), + ] + + # store result of processing in combinatorial temporary + m.d.comb += eq(result, self.stage.process(self.p.i_data)) + + # if not in stall condition, update the temporary register + with m.If(self.p.o_ready): # not stalled + m.d.sync += eq(r_data, result) # update buffer + + with m.If(self.n.i_ready): # next stage is ready + with m.If(self.p.o_ready): # not stalled + # nothing in buffer: send (processed) input direct to output + m.d.sync += [self.n.o_valid.eq(p_i_valid), + eq(self.n.o_data, result), # update output + ] + with m.Else(): # p.o_ready is false, and something is in buffer. + # Flush the [already processed] buffer to the output port. + m.d.sync += [self.n.o_valid.eq(1), # declare reg empty + eq(self.n.o_data, r_data), # flush buffer + self.p.o_ready.eq(1), # clear stall condition + ] + # ignore input, since p.o_ready is also false. 
+ + # (n.i_ready) is false here: next stage is ready + with m.Elif(o_n_validn): # next stage being told "ready" + m.d.sync += [self.n.o_valid.eq(p_i_valid), + self.p.o_ready.eq(1), # Keep the buffer empty + eq(self.n.o_data, result), # set output data + ] + + # (n.i_ready) false and (n.o_valid) true: + with m.Elif(i_p_valid_o_p_ready): + # If next stage *is* ready, and not stalled yet, accept input + m.d.sync += self.p.o_ready.eq(~(p_i_valid & self.n.o_valid)) + + return m + + +class UnbufferedPipeline(ControlBase): + """ A simple pipeline stage with single-clock synchronisation + and two-way valid/ready synchronised signalling. + + Note that a stall in one stage will result in the entire pipeline + chain stalling. + + Also that unlike BufferedPipeline, the valid/ready signalling does NOT + travel synchronously with the data: the valid/ready signalling + combines in a *combinatorial* fashion. Therefore, a long pipeline + chain will lengthen propagation delays. + + Argument: stage. see Stage API, above + + stage-1 p.i_valid >>in stage n.o_valid out>> stage+1 + stage-1 p.o_ready <>in stage n.o_data out>> stage+1 + | | + r_data result + | | + +--process ->-+ + + Attributes: + ----------- + p.i_data : StageInput, shaped according to ispec + The pipeline input + p.o_data : StageOutput, shaped according to ospec + The pipeline output + r_data : input_shape according to ispec + A temporary (buffered) copy of a prior (valid) input. + This is HELD if the output is not ready. It is updated + SYNCHRONOUSLY. + result: output_shape according to ospec + The output of the combinatorial logic. it is updated + COMBINATORIALLY (no clock dependence). 
+ """ + + def __init__(self, stage): + ControlBase.__init__(self) + self.stage = stage + self._data_valid = Signal() + + # set up the input and output data + self.p.i_data = stage.ispec() # input type + self.n.o_data = stage.ospec() # output type + + def elaborate(self, platform): + m = Module() + + r_data = self.stage.ispec() # input type + result = self.stage.ospec() # output data + if hasattr(self.stage, "setup"): + self.stage.setup(m, r_data) + + p_i_valid = Signal(reset_less=True) + m.d.comb += p_i_valid.eq(self.p.i_valid_logic()) + m.d.comb += eq(result, self.stage.process(r_data)) + m.d.comb += self.n.o_valid.eq(self._data_valid) + m.d.comb += self.p.o_ready.eq(~self._data_valid | self.n.i_ready) + m.d.sync += self._data_valid.eq(p_i_valid | \ + (~self.n.i_ready & self._data_valid)) + with m.If(self.p.i_valid & self.p.o_ready): + m.d.sync += eq(r_data, self.p.i_data) + m.d.comb += eq(self.n.o_data, result) + return m +