normalise XER regs carry/32 and SO
[soc.git] / src / soc / fu / logical / main_stage.py
index 39c2400dd718e8ae6ff24acc805ecb10af36b4dd..4885708997b835b62a0be10b25bb2c5ee8cdf7ac 100644 (file)
@@ -7,11 +7,11 @@
 
 from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
 from nmutil.pipemodbase import PipeModBase
-from soc.fu.logical.pipe_data import ALUInputData
+from nmutil.clz import CLZ
+from soc.fu.logical.pipe_data import LogicalInputData
 from soc.fu.alu.pipe_data import ALUOutputData
 from ieee754.part.partsig import PartitionedSignal
 from soc.decoder.power_enums import InternalOp
-from soc.countzero.countzero import ZeroCounter
 
 from soc.decoder.power_fields import DecodeFields
 from soc.decoder.power_fieldsn import SignalBitRange
@@ -20,7 +20,8 @@ from soc.decoder.power_fieldsn import SignalBitRange
 def array_of(count, bitwidth):
     res = []
     for i in range(count):
-        res.append(Signal(bitwidth, reset_less=True))
+        res.append(Signal(bitwidth, reset_less=True,
+                          name=f"pop_{bitwidth}_{i}"))
     return res
 
 
@@ -31,7 +32,7 @@ class LogicalMainStage(PipeModBase):
         self.fields.create_specs()
 
     def ispec(self):
-        return ALUInputData(self.pspec)
+        return LogicalInputData(self.pspec)
 
     def ospec(self):
         return ALUOutputData(self.pspec) # TODO: ALUIntermediateData
@@ -68,7 +69,7 @@ class LogicalMainStage(PipeModBase):
                 # creating arrays big enough to store the sum, each time
                 pc = [a]
                 # QTY32 2-bit (to take 2x 1-bit sums) etc.
-                work = [(32, 2), (16, 3), (8, 4), (4, 5), (2, 6), (1, 6)]
+                work = [(32, 2), (16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
                 for l, b in work:
                     pc.append(array_of(l, b))
                 pc8 = pc[3]     # array of 8 8-bit counts (popcntb)
@@ -82,14 +83,14 @@ class LogicalMainStage(PipeModBase):
                         comb += dst[i].eq(Cat(src[stt], Const(0, 1)) +
                                           Cat(src[end], Const(0, 1)))
                 # decode operation length
-                with m.If(op.data_len[2:4] == 0b00):
+                with m.If(op.data_len == 1):
                     # popcntb - pack 8x 4-bit answers into output
                     for i in range(8):
-                        comb += o[i*8:i*8+4].eq(pc8[i])
-                with m.Elif(op.data_len[3] == 0):
+                        comb += o[i*8:(i+1)*8].eq(pc8[i])
+                with m.Elif(op.data_len == 4):
                     # popcntw - pack 2x 5-bit answers into output
                     for i in range(2):
-                        comb += o[i*32:i*32+5].eq(pc32[i])
+                        comb += o[i*32:(i+1)*32].eq(pc32[i])
                 with m.Else():
                     # popcntd - put 1x 6-bit answer into output
                     comb += o.eq(popcnt[0])
@@ -110,18 +111,28 @@ class LogicalMainStage(PipeModBase):
             ###### cntlz #######
             with m.Case(InternalOp.OP_CNTZ):
                 XO = self.fields.FormX.XO[0:-1]
-                m.submodules.countz = countz = ZeroCounter()
-                comb += countz.rs_i.eq(a)
-                comb += countz.is_32bit_i.eq(op.is_32bit)
-                comb += countz.count_right_i.eq(XO[-1])
-                comb += o.eq(countz.result_o)
+                count_right = Signal(reset_less=True)
+                comb += count_right.eq(XO[-1])
+
+                cntz_i = Signal(64, reset_less=True)
+                a32 = Signal(32, reset_less=True)
+                comb += a32.eq(a[0:32])
+
+                with m.If(op.is_32bit):
+                    comb += cntz_i.eq(Mux(count_right, a32[::-1], a32))
+                with m.Else():
+                    comb += cntz_i.eq(Mux(count_right, a[::-1], a))
+
+                m.submodules.clz = clz = CLZ(64)
+                comb += clz.sig_in.eq(cntz_i)
+                comb += o.eq(Mux(op.is_32bit, clz.lz-32, clz.lz))
 
             ###### bpermd #######
             # TODO with m.Case(InternalOp.OP_BPERM): - not in microwatt
 
         ###### sticky overflow and context, both pass-through #####
 
-        comb += self.o.so.eq(self.i.so)
+        comb += self.o.xer_so.data.eq(self.i.xer_so)
         comb += self.o.ctx.eq(self.i.ctx)
 
         return m