icache.py fix several subtle bugs that were lines that I had missed from

[soc.git] / src / soc / experiment / score6600_multi.py
diff --git a/src/soc/experiment/score6600_multi.py b/src/soc/experiment/score6600_multi.py

index 4a932109fb87de332e3e7d2be19e5a2bf60e7426..22d8e2d1a76d93d8b33318db42b18c1e66bce0f2 100644 (file)
--- a/src/soc/experiment/score6600_multi.py
+++ b/src/soc/experiment/score6600_multi.py
@@ -4,7 +4,7 @@ from nmigen.hdl.ast import unsigned
  from nmigen import Module, Const, Signal, Array, Cat, Elaboratable, Memory
  from nmigen.back.pysim import Delay
  
-from soc.regfile.regfile import RegFileArray, treereduce
+from soc.regfile.regfile import RegFileArray, ortreereduce
  from soc.scoremulti.fu_fu_matrix import FUFUDepMatrix
  from soc.scoremulti.fu_reg_matrix import FURegDepMatrix
  from soc.scoreboard.global_pending import GlobalPending
@@ -15,14 +15,19 @@ from soc.scoreboard.instruction_q import Instruction, InstructionQ
  from soc.scoreboard.memfu import MemFunctionUnits
  
  from soc.experiment.compalu import ComputationUnitNoDelay
-from soc.experiment.compldst import LDSTCompUnit
-from soc.experiment.testmem import TestMemory
+from soc.experiment.compalu_multi import MultiCompUnit, go_record
+from soc.experiment.compldst_multi import LDSTCompUnit
+from soc.experiment.compldst_multi import CompLDSTOpSubset
+from soc.experiment.l0_cache import TstL0CacheBuffer
  
-from soc.experiment.alu_hier import ALU, BranchALU, CompALUOpSubset
+from soc.experiment.alu_hier import ALU, BranchALU
+from soc.fu.alu.alu_input_record import CompALUOpSubset
  
-from soc.decoder.power_enums import InternalOp, Function
+from soc.decoder.power_enums import MicrOp, Function
  from soc.decoder.power_decoder import (create_pdecode)
  from soc.decoder.power_decoder2 import (PowerDecode2)
+from soc.decoder.power_decoder2 import Decode2ToExecute1Type
+
  from soc.simulator.program import Program
  
  
@@ -84,8 +89,11 @@ class CompUnitsBase(Elaboratable):
  
          # inputs
          self.issue_i = Signal(n_units, reset_less=True)
-        self.go_rd_i = Signal(n_units, reset_less=True)
-        self.go_wr_i = Signal(n_units, reset_less=True)
+        self.rd0 = go_record(n_units, "rd0")
+        self.rd1 = go_record(n_units, "rd1")
+        self.go_rd_i = [self.rd0.go, self.rd1.go]  # XXX HACK!
+        self.wr0 = go_record(n_units, "wr0")
+        self.go_wr_i = [self.wr0.go]
          self.shadown_i = Signal(n_units, reset_less=True)
          self.go_die_i = Signal(n_units, reset_less=True)
          if ldstmode:
@@ -94,8 +102,8 @@ class CompUnitsBase(Elaboratable):
  
          # outputs
          self.busy_o = Signal(n_units, reset_less=True)
-        self.rd_rel_o = Signal(n_units, reset_less=True)
-        self.req_rel_o = Signal(n_units, reset_less=True)
+        self.rd_rel_o = [self.rd0.rel, self.rd1.rel]  # HACK!
+        self.req_rel_o = self.wr0.rel
          self.done_o = Signal(n_units, reset_less=True)
          if ldstmode:
              self.ld_o = Signal(n_units, reset_less=True)  # op is LD
@@ -119,43 +127,53 @@ class CompUnitsBase(Elaboratable):
          for i, alu in enumerate(self.units):
              setattr(m.submodules, "comp%d" % i, alu)
  
-        go_rd_l = []
+        go_rd_l0 = []
+        go_rd_l1 = []
          go_wr_l = []
          issue_l = []
          busy_l = []
          req_rel_l = []
          done_l = []
-        rd_rel_l = []
+        rd_rel0_l = []
+        rd_rel1_l = []
          shadow_l = []
          godie_l = []
          for alu in self.units:
              req_rel_l.append(alu.req_rel_o)
              done_l.append(alu.done_o)
-            rd_rel_l.append(alu.rd_rel_o)
              shadow_l.append(alu.shadown_i)
              godie_l.append(alu.go_die_i)
+            print(alu, "rel", alu.req_rel_o, alu.rd_rel_o)
+            rd_rel0_l.append(alu.rd_rel_o[0])
+            rd_rel1_l.append(alu.rd_rel_o[1])
              go_wr_l.append(alu.go_wr_i)
-            go_rd_l.append(alu.go_rd_i)
+            go_rd_l0.append(alu.go_rd_i[0])
+            go_rd_l1.append(alu.go_rd_i[1])
              issue_l.append(alu.issue_i)
              busy_l.append(alu.busy_o)
-        comb += self.rd_rel_o.eq(Cat(*rd_rel_l))
+        comb += self.rd0.rel.eq(Cat(*rd_rel0_l))
+        comb += self.rd1.rel.eq(Cat(*rd_rel1_l))
          comb += self.req_rel_o.eq(Cat(*req_rel_l))
          comb += self.done_o.eq(Cat(*done_l))
          comb += self.busy_o.eq(Cat(*busy_l))
          comb += Cat(*godie_l).eq(self.go_die_i)
          comb += Cat(*shadow_l).eq(self.shadown_i)
-        comb += Cat(*go_wr_l).eq(self.go_wr_i)
-        comb += Cat(*go_rd_l).eq(self.go_rd_i)
+        comb += Cat(*go_wr_l).eq(self.wr0.go)  # XXX TODO
+        comb += Cat(*go_rd_l0).eq(self.rd0.go)
+        comb += Cat(*go_rd_l1).eq(self.rd1.go)
          comb += Cat(*issue_l).eq(self.issue_i)
  
          # connect data register input/output
  
          # merge (OR) all integer FU / ALU outputs to a single value
+        # XXX NOTE: this only works because there is a single "port"
+        # protected by a single go_wr.  multi-issue requires a bus
+        # to be inserted here.
          if self.units:
-            data_o = treereduce(self.units, "data_o")
+            data_o = ortreereduce(self.units, "data_o")
              comb += self.data_o.eq(data_o)
              if self.ldstmode:
-                addr_o = treereduce(self.units, "addr_o")
+                addr_o = ortreereduce(self.units, "addr_o")
                  comb += self.addr_o.eq(addr_o)
  
          for i, alu in enumerate(self.units):
@@ -196,7 +214,7 @@ class CompUnitsBase(Elaboratable):
  
  class CompUnitLDSTs(CompUnitsBase):
  
-    def __init__(self, rwid, opwid, n_ldsts, mem):
+    def __init__(self, rwid, opwid, n_ldsts, l0):
          """ Inputs:
  
              * :rwid:   bit width of register file(s) - both FP and INT
@@ -205,18 +223,13 @@ class CompUnitLDSTs(CompUnitsBase):
          self.opwid = opwid
  
          # inputs
-        self.oper_i = Signal(opwid, reset_less=True)
-        self.imm_i = Signal(rwid, reset_less=True)
-
-        # Int ALUs
-        self.alus = []
-        for i in range(n_ldsts):
-            self.alus.append(ALU(rwid))
+        self.op = CompLDSTOpSubset("cul_i")
  
+        # LD/ST Units
          units = []
-        for alu in self.alus:
-            aluopwid = 4  # see compldst.py for "internal" opcode
-            units.append(LDSTCompUnit(rwid, aluopwid, alu, mem))
+        for i in range(n_ldsts):
+            pi = l0.l0.dports[i].pi
+            units.append(LDSTCompUnit(pi, rwid, awid=48))
  
          CompUnitsBase.__init__(self, rwid, units, ldstmode=True)
  
@@ -224,11 +237,9 @@ class CompUnitLDSTs(CompUnitsBase):
          m = CompUnitsBase.elaborate(self, platform)
          comb = m.d.comb
  
-        # hand the same operation to all units, 4 lower bits though
-        for alu in self.units:
-            comb += alu.oper_i[0:4].eq(self.oper_i)
-            comb += alu.imm_i.eq(self.imm_i)
-            comb += alu.isalu_i.eq(0)
+        # hand the same operation to all units
+        for ldst in self.units:
+            comb += ldst.oper_i.eq(self.op)
  
          return m
  
@@ -245,8 +256,6 @@ class CompUnitALUs(CompUnitsBase):
  
          # inputs
          self.op = CompALUOpSubset("cua_i")
-        self.oper_i = Signal(opwid, reset_less=True)
-        self.imm_i = Signal(rwid, reset_less=True)
  
          # Int ALUs
          alus = []
@@ -256,7 +265,7 @@ class CompUnitALUs(CompUnitsBase):
          units = []
          for alu in alus:
              aluopwid = 3  # extra bit for immediate mode
-            units.append(ComputationUnitNoDelay(rwid, alu))
+            units.append(MultiCompUnit(rwid, alu, CompALUOpSubset))
  
          CompUnitsBase.__init__(self, rwid, units)
  
@@ -267,8 +276,6 @@ class CompUnitALUs(CompUnitsBase):
          # hand the subset of operation to ALUs
          for alu in self.units:
              comb += alu.oper_i.eq(self.op)
-            #comb += alu.oper_i[0:3].eq(self.oper_i)
-            #comb += alu.imm_i.eq(self.imm_i)
  
          return m
  
@@ -287,13 +294,14 @@ class CompUnitBR(CompUnitsBase):
          self.opwid = opwid
  
          # inputs
+        self.op = CompALUOpSubset("cua_i")  # TODO - CompALUBranchSubset
          self.oper_i = Signal(opwid, reset_less=True)
          self.imm_i = Signal(rwid, reset_less=True)
  
          # Branch ALU and CU
          self.bgt = BranchALU(rwid)
          aluopwid = 3  # extra bit for immediate mode
-        self.br1 = ComputationUnitNoDelay(rwid, self.bgt)
+        self.br1 = MultiCompUnit(rwid, self.bgt, CompALUOpSubset)
          CompUnitsBase.__init__(self, rwid, [self.br1])
  
      def elaborate(self, platform):
@@ -302,6 +310,7 @@ class CompUnitBR(CompUnitsBase):
  
          # hand the same operation to all units
          for alu in self.units:
+            # comb += alu.oper_i.eq(self.op) # TODO
              comb += alu.oper_i.eq(self.oper_i)
              #comb += alu.imm_i.eq(self.imm_i)
  
@@ -310,26 +319,55 @@ class CompUnitBR(CompUnitsBase):
  
  class FunctionUnits(Elaboratable):
  
-    def __init__(self, n_regs, n_int_alus):
-        self.n_regs = n_regs
-        self.n_int_alus = n_int_alus
-
-        self.dest_i = Signal(n_regs, reset_less=True)  # Dest R# in
-        self.src1_i = Signal(n_regs, reset_less=True)  # oper1 R# in
-        self.src2_i = Signal(n_regs, reset_less=True)  # oper2 R# in
-
-        self.g_int_rd_pend_o = Signal(n_regs, reset_less=True)
-        self.g_int_wr_pend_o = Signal(n_regs, reset_less=True)
+    def __init__(self, n_reg, n_int_alus, n_src, n_dst):
+        self.n_src, self.n_dst = n_src, n_dst
+        self.n_reg = n_reg
+        self.n_int_alus = nf = n_int_alus
  
-        self.dest_rsel_o = Signal(n_regs, reset_less=True)  # dest reg (bot)
-        self.src1_rsel_o = Signal(n_regs, reset_less=True)  # src1 reg (bot)
-        self.src2_rsel_o = Signal(n_regs, reset_less=True)  # src2 reg (bot)
+        self.g_int_rd_pend_o = Signal(n_reg, reset_less=True)
+        self.g_int_wr_pend_o = Signal(n_reg, reset_less=True)
  
          self.readable_o = Signal(n_int_alus, reset_less=True)
          self.writable_o = Signal(n_int_alus, reset_less=True)
  
-        self.go_rd_i = Signal(n_int_alus, reset_less=True)
-        self.go_wr_i = Signal(n_int_alus, reset_less=True)
+        # arrays
+        src = []
+        rsel = []
+        rd = []
+        for i in range(n_src):
+            j = i + 1  # name numbering to match src1/src2
+            src.append(Signal(n_reg, name="src%d" % j, reset_less=True))
+            rsel.append(Signal(n_reg, name="src%d_rsel_o" %
+                               j, reset_less=True))
+            rd.append(Signal(nf, name="gord%d_i" % j, reset_less=True))
+        dst = []
+        dsel = []
+        wr = []
+        for i in range(n_dst):
+            j = i + 1  # name numbering to match dst1/dst2
+            dst.append(Signal(n_reg, name="dst%d" % j, reset_less=True))
+            dsel.append(Signal(n_reg, name="dst%d_rsel_o" %
+                               j, reset_less=True))
+            wr.append(Signal(nf, name="gowr%d_i" % j, reset_less=True))
+        wpnd = []
+        pend = []
+        for i in range(nf):
+            j = i + 1  # name numbering to match src1/src2
+            pend.append(Signal(nf, name="rd_src%d_pend_o" %
+                               j, reset_less=True))
+            wpnd.append(Signal(nf, name="wr_dst%d_pend_o" %
+                               j, reset_less=True))
+
+        self.dest_i = Array(dst)     # Dest in (top)
+        self.src_i = Array(src)      # oper in (top)
+
+        # for Register File Select Lines (horizontal), per-reg
+        self.dst_rsel_o = Array(dsel)  # dest reg (bot)
+        self.src_rsel_o = Array(rsel)  # src reg (bot)
+
+        self.go_rd_i = Array(rd)
+        self.go_wr_i = Array(wr)
+
          self.go_die_i = Signal(n_int_alus, reset_less=True)
          self.fn_issue_i = Signal(n_int_alus, reset_less=True)
  
@@ -343,10 +381,10 @@ class FunctionUnits(Elaboratable):
          n_intfus = self.n_int_alus
  
          # Integer FU-FU Dep Matrix
-        intfudeps = FUFUDepMatrix(n_intfus, n_intfus, 1, 1)
+        intfudeps = FUFUDepMatrix(n_intfus, n_intfus, 2, 1)
          m.submodules.intfudeps = intfudeps
          # Integer FU-Reg Dep Matrix
-        intregdeps = FURegDepMatrix(n_intfus, self.n_regs, 2, 1)
+        intregdeps = FURegDepMatrix(n_intfus, self.n_reg, 2, 1)
          m.submodules.intregdeps = intregdeps
  
          comb += self.g_int_rd_pend_o.eq(intregdeps.v_rd_rsel_o)
@@ -360,27 +398,26 @@ class FunctionUnits(Elaboratable):
          self.wr_pend_o = intregdeps.wr_pend_o  # also output for use in WaWGrid
  
          comb += intfudeps.issue_i.eq(self.fn_issue_i)
-        comb += intfudeps.go_rd_i[0].eq(self.go_rd_i)
-        comb += intfudeps.go_wr_i[0].eq(self.go_wr_i)
          comb += intfudeps.go_die_i.eq(self.go_die_i)
          comb += self.readable_o.eq(intfudeps.readable_o)
          comb += self.writable_o.eq(intfudeps.writable_o)
  
          # Connect function issue / arrays, and dest/src1/src2
-        comb += intregdeps.dest_i[0].eq(self.dest_i)
-        comb += intregdeps.src_i[0].eq(self.src1_i)
-        comb += intregdeps.src_i[1].eq(self.src2_i)
-
-        comb += intregdeps.go_rd_i[0].eq(self.go_rd_i)
-        comb += intregdeps.go_rd_i[1].eq(self.go_rd_i)
-        comb += intregdeps.go_wr_i[0].eq(self.go_wr_i)
+        for i in range(self.n_src):
+            print(i, self.go_rd_i, intfudeps.go_rd_i)
+            comb += intfudeps.go_rd_i[i].eq(self.go_rd_i[i])
+            comb += intregdeps.src_i[i].eq(self.src_i[i])
+            comb += intregdeps.go_rd_i[i].eq(self.go_rd_i[i])
+            comb += self.src_rsel_o[i].eq(intregdeps.src_rsel_o[i])
+        for i in range(self.n_dst):
+            print(i, self.go_wr_i, intfudeps.go_wr_i)
+            comb += intfudeps.go_wr_i[i].eq(self.go_wr_i[i])
+            comb += intregdeps.dest_i[i].eq(self.dest_i[i])
+            comb += intregdeps.go_wr_i[i].eq(self.go_wr_i[i])
+            comb += self.dst_rsel_o[i].eq(intregdeps.dest_rsel_o[i])
          comb += intregdeps.go_die_i.eq(self.go_die_i)
          comb += intregdeps.issue_i.eq(self.fn_issue_i)
  
-        comb += self.dest_rsel_o.eq(intregdeps.dest_rsel_o[0])
-        comb += self.src1_rsel_o.eq(intregdeps.src_rsel_o[0])
-        comb += self.src2_rsel_o.eq(intregdeps.src_rsel_o[1])
-
          return m
  
  
@@ -399,18 +436,17 @@ class Scoreboard(Elaboratable):
          self.fpregs = RegFileArray(rwid, n_regs)
  
          # Memory (test for now)
-        self.mem = TestMemory(self.rwid, 8)  # not too big, takes too long
+        self.l0 = TstL0CacheBuffer()
  
          # issue q needs to get at these
          self.aluissue = IssueUnitGroup(2)
          self.lsissue = IssueUnitGroup(2)
          self.brissue = IssueUnitGroup(1)
          # and these
-        self.alu_op = CompALUOpSubset("alu")
+        self.instr = Decode2ToExecute1Type("sc_instr")
          self.br_oper_i = Signal(4, reset_less=True)
          self.br_imm_i = Signal(rwid, reset_less=True)
          self.ls_oper_i = Signal(4, reset_less=True)
-        self.ls_imm_i = Signal(rwid, reset_less=True)
  
          # inputs
          self.int_dest_i = Signal(range(n_regs), reset_less=True)  # Dest R# in
@@ -437,7 +473,7 @@ class Scoreboard(Elaboratable):
  
          m.submodules.intregs = self.intregs
          m.submodules.fpregs = self.fpregs
-        m.submodules.mem = mem = self.mem
+        m.submodules.l0 = l0 = self.l0
  
          # register ports
          int_dest = self.intregs.write_port("dest")
@@ -455,7 +491,7 @@ class Scoreboard(Elaboratable):
  
          # LDST Comp Units
          n_ldsts = 2
-        cul = CompUnitLDSTs(self.rwid, 4, self.lsissue.n_insns, self.mem)
+        cul = CompUnitLDSTs(self.rwid, 4, self.lsissue.n_insns, l0)
  
          # Comp Units
          m.submodules.cu = cu = CompUnitsBase(self.rwid, [cua, cul, cub])
@@ -463,14 +499,17 @@ class Scoreboard(Elaboratable):
          br1 = cub.br1
  
          # Int FUs
-        m.submodules.intfus = intfus = FunctionUnits(self.n_regs, n_int_alus)
+        fu_n_src = 2
+        fu_n_dst = 1
+        m.submodules.intfus = intfus = FunctionUnits(self.n_regs, n_int_alus,
+                                                     fu_n_src, fu_n_dst)
  
          # Memory FUs
          m.submodules.memfus = memfus = MemFunctionUnits(n_ldsts, 5)
  
          # Memory Priority Picker 1: one gateway per memory port
          # picks 1 reader and 1 writer to intreg
-        mempick1 = GroupPicker(n_ldsts)
+        mempick1 = GroupPicker(n_ldsts, 1, 1)
          m.submodules.mempick1 = mempick1
  
          # Count of number of FUs
@@ -479,8 +518,8 @@ class Scoreboard(Elaboratable):
  
          # Integer Priority Picker 1: Adder + Subtractor (and LD/ST)
          # picks 1 reader and 1 writer to intreg
-        intpick1 = GroupPicker(n_intfus)
-        m.submodules.intpick1 = intpick1
+        ipick1 = GroupPicker(n_intfus, fu_n_src, fu_n_dst)
+        m.submodules.intpick1 = ipick1
  
          # INT/FP Issue Unit
          regdecode = RegDecode(self.n_regs)
@@ -519,18 +558,17 @@ class Scoreboard(Elaboratable):
                   ]
  
          # take these to outside (issue needs them)
-        comb += cua.op.eq(self.alu_op)
+        comb += cua.op.eq_from_execute1(self.instr)
          comb += cub.oper_i.eq(self.br_oper_i)
          comb += cub.imm_i.eq(self.br_imm_i)
-        comb += cul.oper_i.eq(self.ls_oper_i)
-        comb += cul.imm_i.eq(self.ls_imm_i)
+        comb += cul.op.eq_from_execute1(self.instr)
  
          # TODO: issueunit.f (FP)
  
          # and int function issue / busy arrays, and dest/src1/src2
-        comb += intfus.dest_i.eq(regdecode.dest_o)
-        comb += intfus.src1_i.eq(regdecode.src1_o)
-        comb += intfus.src2_i.eq(regdecode.src2_o)
+        comb += intfus.dest_i[0].eq(regdecode.dest_o)
+        comb += intfus.src_i[0].eq(regdecode.src1_o)
+        comb += intfus.src_i[1].eq(regdecode.src2_o)
  
          fn_issue_o = issueunit.fn_issue_o
  
@@ -542,7 +580,9 @@ class Scoreboard(Elaboratable):
          # Memory Function Unit
          # ---------
          reset_b = Signal(cul.n_units, reset_less=True)
-        sync += reset_b.eq(cul.go_st_i | cul.go_wr_i | cul.go_die_i)
+        # XXX was cul.go_wr_i, not cul.done_o
+        # sync += reset_b.eq(cul.go_st_i | cul.done_o | cul.go_die_i)
+        sync += reset_b.eq(cul.go_st_i | cul.done_o | cul.go_die_i)
  
          comb += memfus.fn_issue_i.eq(cul.issue_i)  # Comp Unit Issue -> Mem FUs
          comb += memfus.addr_en_i.eq(cul.adr_rel_o)  # Match enable on adr rel
@@ -560,7 +600,7 @@ class Scoreboard(Elaboratable):
  
          # TODO: adr_rel_o needs to go into L1 Cache.  for now,
          # just immediately activate go_adr
-        comb += cul.go_ad_i.eq(cul.adr_rel_o)
+        sync += cul.go_ad_i.eq(cul.adr_rel_o)
  
          # connect up address data
          comb += memfus.addrs_i[0].eq(cul.units[0].addr_o)
@@ -601,24 +641,31 @@ class Scoreboard(Elaboratable):
          # ---------
  
          # Group Picker... done manually for now.
-        go_rd_o = intpick1.go_rd_o
-        go_wr_o = intpick1.go_wr_o
+        go_rd_o = ipick1.go_rd_o
+        go_wr_o = ipick1.go_wr_o
          go_rd_i = intfus.go_rd_i
          go_wr_i = intfus.go_wr_i
          go_die_i = intfus.go_die_i
          # NOTE: connect to the shadowed versions so that they can "die" (reset)
-        comb += go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])  # rd
-        comb += go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus])  # wr
+        for i in range(fu_n_src):
+            comb += go_rd_i[i][0:n_intfus].eq(go_rd_o[i][0:n_intfus])  # rd
+        for i in range(fu_n_dst):
+            comb += go_wr_i[i][0:n_intfus].eq(go_wr_o[i][0:n_intfus])  # wr
          comb += go_die_i[0:n_intfus].eq(anydie[0:n_intfus])  # die
  
          # Connect Picker
          # ---------
-        comb += intpick1.rd_rel_i[0:n_intfus].eq(cu.rd_rel_o[0:n_intfus])
-        comb += intpick1.req_rel_i[0:n_intfus].eq(cu.done_o[0:n_intfus])
          int_rd_o = intfus.readable_o
+        rrel_o = cu.rd_rel_o
+        rqrl_o = cu.req_rel_o
+        for i in range(fu_n_src):
+            comb += ipick1.rd_rel_i[i][0:n_intfus].eq(rrel_o[i][0:n_intfus])
+            comb += ipick1.readable_i[i][0:n_intfus].eq(int_rd_o[0:n_intfus])
          int_wr_o = intfus.writable_o
-        comb += intpick1.readable_i[0:n_intfus].eq(int_rd_o[0:n_intfus])
-        comb += intpick1.writable_i[0:n_intfus].eq(int_wr_o[0:n_intfus])
+        for i in range(fu_n_dst):
+            # XXX FIXME: rqrl_o[i] here
+            comb += ipick1.req_rel_i[i][0:n_intfus].eq(rqrl_o[0:n_intfus])
+            comb += ipick1.writable_i[i][0:n_intfus].eq(int_wr_o[0:n_intfus])
  
          # ---------
          # Shadow Matrix
@@ -640,7 +687,10 @@ class Scoreboard(Elaboratable):
  
          # when written, the shadow can be cancelled (and was good)
          for i in range(n_intfus):
-            comb += shadows.s_good_i[i][0:n_intfus].eq(go_wr_o[0:n_intfus])
+            #comb += shadows.s_good_i[i][0:n_intfus].eq(go_wr_o[0:n_intfus])
+            # XXX experiment: use ~cu.busy_o instead.  *should* be good
+            # because the comp unit is only free once completed
+            comb += shadows.s_good_i[i][0:n_intfus].eq(~cu.busy_o[0:n_intfus])
  
          # *previous* instruction shadows *current* instruction, and, obviously,
          # if the previous is completed (!busy) don't cast the shadow!
@@ -694,9 +744,9 @@ class Scoreboard(Elaboratable):
          # ---------
          # Connect Register File(s)
          # ---------
-        comb += int_dest.wen.eq(intfus.dest_rsel_o)
-        comb += int_src1.ren.eq(intfus.src1_rsel_o)
-        comb += int_src2.ren.eq(intfus.src2_rsel_o)
+        comb += int_dest.wen.eq(intfus.dst_rsel_o[0])
+        comb += int_src1.ren.eq(intfus.src_rsel_o[0])
+        comb += int_src2.ren.eq(intfus.src_rsel_o[1])
  
          # connect ALUs to regfile
          comb += int_dest.data_i.eq(cu.data_o)
@@ -704,8 +754,10 @@ class Scoreboard(Elaboratable):
          comb += cu.src2_i.eq(int_src2.data_o)
  
          # connect ALU Computation Units
-        comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
-        comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus])
+        for i in range(fu_n_src):
+            comb += cu.go_rd_i[i][0:n_intfus].eq(go_rd_o[i][0:n_intfus])
+        for i in range(fu_n_dst):
+            comb += cu.go_wr_i[i][0:n_intfus].eq(go_wr_o[i][0:n_intfus])
          comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
  
          return m
@@ -808,28 +860,21 @@ class IssueToScoreboard(Elaboratable):
              comb += sc.int_src1_i.eq(src1)
              comb += sc.int_src2_i.eq(src2)
              comb += sc.reg_enable_i.eq(1)  # enable the regfile
+            comb += sc.instr.eq(instr)
  
              # choose a Function-Unit-Group
              with m.If(fu == Function.ALU):  # alu
-                comb += sc.alu_op.eq_from_execute1(instr)
-                comb += sc.aluissue.insn_i.eq(1)
+                comb += sc.aluissue.insn_i.eq(1)  # enable alu issue
                  comb += wait_issue_alu.eq(1)
+            with m.Elif(fu == Function.LDST):  # ld/st
+                comb += sc.lsissue.insn_i.eq(1)  # enable ldst issue
+                comb += wait_issue_ls.eq(1)
+
              with m.Elif((op & (0x3 << 2)) != 0):  # branch
                  comb += sc.br_oper_i.eq(Cat(op[0:2], opi))
                  comb += sc.br_imm_i.eq(imm)
                  comb += sc.brissue.insn_i.eq(1)
                  comb += wait_issue_br.eq(1)
-            with m.Elif((op & (0x3 << 4)) != 0):  # ld/st
-                # see compldst.py
-                # bit 0: ADD/SUB
-                # bit 1: immed
-                # bit 4: LD
-                # bit 5: ST
-                comb += sc.ls_oper_i.eq(Cat(op[0], opi[0], op[4:6]))
-                comb += sc.ls_imm_i.eq(imm)
-                comb += sc.lsissue.insn_i.eq(1)
-                comb += wait_issue_ls.eq(1)
-
              # XXX TODO
              # these indicate that the instruction is to be made
              # shadow-dependent on
@@ -871,7 +916,7 @@ def power_instr_q(dut, pdecode2, ins, code):
  def instr_q(dut, op, funit, op_imm, imm, src1, src2, dest,
              branch_success, branch_fail):
      instrs = [{'insn_type': op, 'fn_unit': funit, 'write_reg': dest,
-                'imm_data': (imm, op_imm),
+               'imm_data': (imm, op_imm),
                 'read_reg1': src1, 'read_reg2': src2}]
  
      sendlen = 1
@@ -885,11 +930,11 @@ def instr_q(dut, op, funit, op_imm, imm, src1, src2, dest,
          yield dut.data_i[idx].insn_type.eq(insn_type)
          yield dut.data_i[idx].fn_unit.eq(fn_unit)
          yield dut.data_i[idx].read_reg1.data.eq(reg1)
-        yield dut.data_i[idx].read_reg1.ok.eq(1) # XXX TODO
+        yield dut.data_i[idx].read_reg1.ok.eq(1)  # XXX TODO
          yield dut.data_i[idx].read_reg2.data.eq(reg2)
-        yield dut.data_i[idx].read_reg2.ok.eq(1) # XXX TODO
+        yield dut.data_i[idx].read_reg2.ok.eq(1)  # XXX TODO
          yield dut.data_i[idx].write_reg.data.eq(dest)
-        yield dut.data_i[idx].write_reg.ok.eq(1) # XXX TODO
+        yield dut.data_i[idx].write_reg.ok.eq(1)  # XXX TODO
          yield dut.data_i[idx].imm_data.data.eq(imm)
          yield dut.data_i[idx].imm_data.ok.eq(op_imm)
          di = yield dut.data_i[idx]
@@ -1108,21 +1153,30 @@ def power_sim(m, dut, pdecode2, instruction, alusim):
          for i in range(1, dut.n_regs):
              #val = randint(0, (1<<alusim.rwidth)-1)
              #val = 31+i*3
-            val = i # XXX actually, not random at all
+            val = i  # XXX actually, not random at all
              yield dut.intregs.regs[i].reg.eq(val)
              alusim.setval(i, val)
  
          # create some instructions
-        lst = [#"addi 3, 0, 0x1234",
-               #"addi 2, 0, 0x4321",
-               "add  1, 3, 2"]
+        lst = []
+        if False:
+            lst += ["addi 2, 0, 0x4321",
+                    "addi 3, 0, 0x1234",
+                    "add  1, 3, 2",
+                    "add  4, 3, 5"
+                    ]
+        if True:
+            lst += ["lbzu 6, 7(2)",
+
+                    ]
+
          with Program(lst) as program:
              gen = program.generate_instructions()
  
              # issue instruction(s), wait for issue to be free before proceeding
              for ins, code in zip(gen, program.assembly.splitlines()):
                  yield instruction.eq(ins)          # raw binary instr.
-                yield Delay(1e-6)
+                yield  # Delay(1e-6)
  
                  print("binary 0x{:X}".format(ins & 0xffffffff))
                  print("assembly", code)
@@ -1179,15 +1233,15 @@ def scoreboard_sim(dut, alusim):
              instrs.append((1, 7, 2, 2, 0, 0, (0, 0)))
  
          if True:
-            instrs.append((2, 3, 3, InternalOp.OP_ADD, Function.ALU,
+            instrs.append((2, 3, 3, MicrOp.OP_ADD, Function.ALU,
                             0, 0, (0, 0)))
-            instrs.append((5, 3, 3, InternalOp.OP_ADD, Function.ALU,
+            instrs.append((5, 3, 3, MicrOp.OP_ADD, Function.ALU,
                             0, 0, (0, 0)))
          if False:
-            instrs.append((3, 5, 5, InternalOp.OP_MUL_L64, Function.ALU,
+            instrs.append((3, 5, 5, MicrOp.OP_MUL_L64, Function.ALU,
                             1, 7, (0, 0)))
          if False:
-            instrs.append((2, 3, 3, InternalOp.OP_ADD, Function.ALU,
+            instrs.append((2, 3, 3, MicrOp.OP_ADD, Function.ALU,
                             0, 0, (0, 0)))
  
          if False:
@@ -1282,7 +1336,7 @@ def scoreboard_sim(dut, alusim):
  
          # issue instruction(s), wait for issue to be free before proceeding
          for i, instr in enumerate(instrs):
-            print (i, instr)
+            print(i, instr)
              src1, src2, dest, op, fn_unit, opi, imm, (br_ok, br_fail) = instr
  
              print("instr %d: (%d, %d, %d, %s, %s, %d, %d)" %
@@ -1335,7 +1389,7 @@ def test_scoreboard():
      run_simulation(m, power_sim(m, dut, pdecode2, instruction, alusim),
                     vcd_name='test_powerboard6600.vcd')
  
-    #run_simulation(dut, scoreboard_sim(dut, alusim),
+    # run_simulation(dut, scoreboard_sim(dut, alusim),
      #               vcd_name='test_scoreboard6600.vcd')
  
      # run_simulation(dut, scoreboard_branch_sim(dut, alusim),