From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Mon, 9 Mar 2020 13:42:31 +0000 (+0000)
Subject: move all source directories to soc so that "import soc.scoreboard" etc is used
X-Git-Tag: div_pipeline~1749
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=2d1027c7357d92b8cae4c15f55ad97b8fe81707b;p=soc.git

move all source directories to soc so that "import soc.scoreboard" etc is used
---

diff --git a/src/TLB/.gitignore b/src/TLB/.gitignore
deleted file mode 100644
index 3324664b..00000000
--- a/src/TLB/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*.wpr
-__pycache__
diff --git a/src/TLB/AddressEncoder.py b/src/TLB/AddressEncoder.py
deleted file mode 100644
index 128f2c97..00000000
--- a/src/TLB/AddressEncoder.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from nmigen import Module, Signal, Elaboratable
-from nmigen.lib.coding import Encoder, PriorityEncoder
-
-class AddressEncoder(Elaboratable):
-    """Address Encoder
-
-       The purpose of this module is to take in a vector and
-       encode the bits that are one hot into an address. This module
-       combines both nmigen's Encoder and PriorityEncoder and will state
-       whether the input line has a single bit hot, multiple bits hot,
-       or no bits hot. The output line will always have the lowest value
-       address output.
-
-       Usage:
-       The output is valid when either single or multiple match is high.
-       Otherwise output is 0.
-    """
-    def __init__(self, width):
-        """ Arguments:
-            * width: The desired length of the input vector
-        """
-        # Internal
-        self.encoder = Encoder(width)
-        self.p_encoder = PriorityEncoder(width)
-
-        # Input
-        self.i = Signal(width)
-
-        # Output
-        self.single_match = Signal(1)
-        self.multiple_match = Signal(1)
-        self.o = Signal(max=width)
-
-    def elaborate(self, platform=None):
-        m = Module()
-
-        # Add internal submodules
-        m.submodules.encoder = self.encoder
-        m.submodules.p_encoder = self.p_encoder
-
-        m.d.comb += [
-            self.encoder.i.eq(self.i),
-            self.p_encoder.i.eq(self.i)
-        ]
-
-        # Steps:
-        # 1. check if the input vector is non-zero
-        # 2. if non-zero, check if single match or multiple match
-        # 3. set output line to be lowest value address output
-
-        # If the priority encoder recieves an input of 0
-        # If n is 1 then the output is not valid
-        with m.If(self.p_encoder.n):
-            m.d.comb += [
-                self.single_match.eq(0),
-                self.multiple_match.eq(0),
-                self.o.eq(0)
-            ]
-        # If the priority encoder recieves an input > 0
-        with m.Else():
-            # Multiple Match if encoder n is invalid
-            with m.If(self.encoder.n):
-                m.d.comb += [
-                    self.single_match.eq(0),
-                    self.multiple_match.eq(1)
-                ]
-            # Single Match if encoder n is valid
-            with m.Else():
-                m.d.comb += [
-                    self.single_match.eq(1),
-                    self.multiple_match.eq(0)
-                ]
-            # Always set output based on priority encoder output
-            m.d.comb += self.o.eq(self.p_encoder.o)
-        return m
diff --git a/src/TLB/Cam.py b/src/TLB/Cam.py
deleted file mode 100644
index e7d901ff..00000000
--- a/src/TLB/Cam.py
+++ /dev/null
@@ -1,125 +0,0 @@
-from nmigen import Array, Cat, Module, Signal, Elaboratable
-from nmigen.lib.coding import Decoder
-from nmigen.cli import main #, verilog
-
-from .CamEntry import CamEntry
-from .AddressEncoder import AddressEncoder
-
-
-class Cam(Elaboratable):
-    """ Content Addressable Memory (CAM)
-
-        The purpose of this module is to quickly look up whether an
-        entry exists given a data key.
-        This module will search for the given data in all internal entries
-        and output whether a  single or multiple match was found.
-        If an single entry is found the address be returned and single_match
-        is set HIGH. If multiple entries are found the lowest address is
-        returned and multiple_match is set HIGH. If neither single_match or
-        multiple_match are HIGH this implies no match was found. To write
-        to the CAM set the address bus to the desired entry and set write_enable
-        HIGH. Entry managment should be performed one level above this block
-        as lookup is performed within.
-
-        Notes:
-        The read and write operations take one clock cycle to complete.
-        Currently the read_warning line is present for interfacing but
-        is not necessary for this design. This module is capable of writing
-        in the first cycle, reading on the second, and output the correct
-        address on the third.
-    """
-
-    def __init__(self, data_size, cam_size):
-        """ Arguments:
-            * data_size: (bits) The bit size of the data
-            * cam_size: (number) The number of entries in the CAM
-        """
-
-        # Internal
-        self.cam_size = cam_size
-        self.encoder = AddressEncoder(cam_size)
-        self.decoder = Decoder(cam_size)
-        self.entry_array = Array(CamEntry(data_size) for x in range(cam_size))
-
-        # Input
-        self.enable = Signal(1)
-        self.write_enable = Signal(1)
-        self.data_in = Signal(data_size) # The data to be written
-        self.data_mask = Signal(data_size) # mask for ternary writes
-        self.address_in = Signal(max=cam_size) # address of CAM Entry to write
-
-        # Output
-        self.read_warning = Signal(1) # High when a read interrupts a write
-        self.single_match = Signal(1) # High when there is only one match
-        self.multiple_match = Signal(1) # High when there at least two matches
-        self.match_address = Signal(max=cam_size) # The lowest address matched
-
-    def elaborate(self, platform=None):
-        m = Module()
-        # AddressEncoder for match types and output address
-        m.submodules.AddressEncoder = self.encoder
-        # Decoder is used to select which entry will be written to
-        m.submodules.Decoder = self.decoder
-        # CamEntry Array Submodules
-        # Note these area added anonymously
-        entry_array = self.entry_array
-        m.submodules += entry_array
-
-        # Decoder logic
-        m.d.comb += [
-            self.decoder.i.eq(self.address_in),
-            self.decoder.n.eq(0)
-        ]
-
-        encoder_vector = []
-        with m.If(self.enable):
-            # Set the key value for every CamEntry
-            for index in range(self.cam_size):
-
-                # Write Operation
-                with m.If(self.write_enable):
-                    with m.If(self.decoder.o[index]):
-                        m.d.comb += entry_array[index].command.eq(2)
-                    with m.Else():
-                        m.d.comb += entry_array[index].command.eq(0)
-
-                # Read Operation
-                with m.Else():
-                    m.d.comb += entry_array[index].command.eq(1)
-
-                # Send data input to all entries
-                m.d.comb += entry_array[index].data_in.eq(self.data_in)
-                # Send all entry matches to encoder
-                ematch = entry_array[index].match
-                encoder_vector.append(ematch)
-
-            # Give input to and accept output from encoder module
-            m.d.comb += [
-                self.encoder.i.eq(Cat(*encoder_vector)),
-                self.single_match.eq(self.encoder.single_match),
-                self.multiple_match.eq(self.encoder.multiple_match),
-                self.match_address.eq(self.encoder.o)
-            ]
-
-        # If the CAM is not enabled set all outputs to 0
-        with m.Else():
-            m.d.comb += [
-                    self.read_warning.eq(0),
-                    self.single_match.eq(0),
-                    self.multiple_match.eq(0),
-                    self.match_address.eq(0)
-            ]
-
-        return m
-
-    def ports(self):
-        return [self.enable, self.write_enable,
-                     self.data_in, self.data_mask,
-                     self.read_warning, self.single_match,
-                     self.multiple_match, self.match_address]
-
-
-if __name__ == '__main__':
-    cam = Cam(4, 4)
-    main(cam, ports=cam.ports())
-
diff --git a/src/TLB/CamEntry.py b/src/TLB/CamEntry.py
deleted file mode 100644
index b1d93082..00000000
--- a/src/TLB/CamEntry.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from nmigen import Module, Signal, Elaboratable
-
-
-class CamEntry(Elaboratable):
-    """ Content Addressable Memory (CAM) Entry
-
-        The purpose of this module is to represent an entry within a CAM.
-        This module when given a read command will compare  the given data
-        and output whether a match was found or not. When given a write
-        command it will write the given data into internal registers.
-    """
-
-    def __init__(self, data_size):
-        """ Arguments:
-            * data_size: (bit count) The size of the data
-        """
-        # Input
-        self.command = Signal(2) # 00 => NA 01 => Read 10 => Write 11 => Reset
-        self.data_in = Signal(data_size) # Data input when writing
-
-        # Output
-        self.match = Signal(1) # Result of the internal/input key comparison
-        self.data = Signal(data_size)
-
-    def elaborate(self, platform=None):
-        m = Module()
-        with m.Switch(self.command):
-            with m.Case("00"):
-                m.d.sync += self.match.eq(0)
-            with m.Case("01"):
-                with m.If(self.data == self.data_in):
-                    m.d.sync += self.match.eq(1)
-                with m.Else():
-                    m.d.sync += self.match.eq(0)
-            with m.Case("10"):
-                m.d.sync += [
-                    self.data.eq(self.data_in),
-                    self.match.eq(0)
-                ]
-            with m.Case():
-                m.d.sync += [
-                    self.match.eq(0),
-                    self.data.eq(0)
-                ]
-
-        return m
diff --git a/src/TLB/LFSR.py b/src/TLB/LFSR.py
deleted file mode 100644
index d8b606ec..00000000
--- a/src/TLB/LFSR.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# SPDX-License-Identifier: LGPL-2.1-or-later
-# See Notices.txt for copyright information
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-from nmigen.cli import verilog, rtlil
-
-
-class LFSRPolynomial(set):
-    """ implements a polynomial for use in LFSR
-    """
-    def __init__(self, exponents=()):
-        for e in exponents:
-            assert isinstance(e, int), TypeError("%s must be an int" % repr(e))
-            assert (e >= 0), ValueError("%d must not be negative" % e)
-        set.__init__(self, set(exponents).union({0})) # must contain zero
-
-    @property
-    def max_exponent(self):
-        return max(self) # derived from set, so this returns the max exponent
-
-    @property
-    def exponents(self):
-        exponents = list(self) # get elements of set as a list
-        exponents.sort(reverse=True)
-        return exponents
-
-    def __str__(self):
-        expd = {0: "1", 1: 'x', 2: "x^{}"} # case 2 isn't 2, it's min(i,2)
-        retval = map(lambda i: expd[min(i,2)].format(i), self.exponents)
-        return " + ".join(retval)
-
-    def __repr__(self):
-        return "LFSRPolynomial(%s)" % self.exponents
-
-
-# list of selected polynomials from https://web.archive.org/web/20190418121923/https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Some_polynomials_for_maximal_LFSRs  # noqa
-LFSR_POLY_2 = LFSRPolynomial([2, 1, 0])
-LFSR_POLY_3 = LFSRPolynomial([3, 2, 0])
-LFSR_POLY_4 = LFSRPolynomial([4, 3, 0])
-LFSR_POLY_5 = LFSRPolynomial([5, 3, 0])
-LFSR_POLY_6 = LFSRPolynomial([6, 5, 0])
-LFSR_POLY_7 = LFSRPolynomial([7, 6, 0])
-LFSR_POLY_8 = LFSRPolynomial([8, 6, 5, 4, 0])
-LFSR_POLY_9 = LFSRPolynomial([9, 5, 0])
-LFSR_POLY_10 = LFSRPolynomial([10, 7, 0])
-LFSR_POLY_11 = LFSRPolynomial([11, 9, 0])
-LFSR_POLY_12 = LFSRPolynomial([12, 11, 10, 4, 0])
-LFSR_POLY_13 = LFSRPolynomial([13, 12, 11, 8, 0])
-LFSR_POLY_14 = LFSRPolynomial([14, 13, 12, 2, 0])
-LFSR_POLY_15 = LFSRPolynomial([15, 14, 0])
-LFSR_POLY_16 = LFSRPolynomial([16, 15, 13, 4, 0])
-LFSR_POLY_17 = LFSRPolynomial([17, 14, 0])
-LFSR_POLY_18 = LFSRPolynomial([18, 11, 0])
-LFSR_POLY_19 = LFSRPolynomial([19, 18, 17, 14, 0])
-LFSR_POLY_20 = LFSRPolynomial([20, 17, 0])
-LFSR_POLY_21 = LFSRPolynomial([21, 19, 0])
-LFSR_POLY_22 = LFSRPolynomial([22, 21, 0])
-LFSR_POLY_23 = LFSRPolynomial([23, 18, 0])
-LFSR_POLY_24 = LFSRPolynomial([24, 23, 22, 17, 0])
-
-
-class LFSR(LFSRPolynomial, Elaboratable):
-    """ implements a Linear Feedback Shift Register
-    """
-    def __init__(self, polynomial):
-        """ Inputs:
-            ------
-            :polynomial: the polynomial to feedback on.  may be a LFSRPolynomial
-                         instance or an iterable of ints (list/tuple/generator)
-            :enable:     enable (set LO to disable.  NOTE: defaults to HI)
-
-            Outputs:
-            -------
-            :state: the LFSR state.  bitwidth is taken from the polynomial
-                    maximum exponent.
-
-            Note: if an LFSRPolynomial is passed in as the input, because
-            LFSRPolynomial is derived from set() it's ok:
-            LFSRPolynomial(LFSRPolynomial(p)) == LFSRPolynomial(p)
-        """
-        LFSRPolynomial.__init__(self, polynomial)
-        self.state = Signal(self.max_exponent, reset=1)
-        self.enable = Signal(reset=1)
-
-    def elaborate(self, platform):
-        m = Module()
-        # do absolutely nothing if the polynomial is empty (always has a zero)
-        if self.max_exponent <= 1:
-            return m
-
-        # create XOR-bunch, select bits from state based on exponent
-        feedback = Const(0) # doesn't do any harm starting from 0b0 (xor chain)
-        for exponent in self:
-            if exponent > 0: # don't have to skip, saves CPU cycles though
-                feedback ^= self.state[exponent - 1]
-
-        # if enabled, shift-and-feedback
-        with m.If(self.enable):
-            # shift up lower bits by Cat'ing in a new bit zero (feedback)
-            newstate = Cat(feedback, self.state[:-1])
-            m.d.sync += self.state.eq(newstate)
-
-        return m
-
-
-# example: Poly24
-if __name__ == '__main__':
-    p24 = rtlil.convert(LFSR(LFSR_POLY_24))
-    with open("lfsr2_p24.il", "w") as f:
-        f.write(p24)
diff --git a/src/TLB/LFSR.pyi b/src/TLB/LFSR.pyi
deleted file mode 100644
index 64eb9115..00000000
--- a/src/TLB/LFSR.pyi
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-License-Identifier: LGPL-2.1-or-later
-# See Notices.txt for copyright information
-from nmigen import Module
-from typing import Iterable, Optional, Iterator, Any, Union
-from typing_extensions import final
-
-
-@final
-class LFSRPolynomial(set):
-    def __init__(self, exponents: Iterable[int] = ()):
-        def elements() -> Iterable[int]: ...
-    @property
-    def exponents(self) -> list[int]: ...
-    def __str__(self) -> str: ...
-    def __repr__(self) -> str: ...
-
-
-@final
-class LFSR:
-    def __init__(self, polynomial: Union[Iterable[int], LFSRPolynomial]): ...
-    @property
-    def width(self) -> int: ...
-    def elaborate(self, platform: Any) -> Module: ...
diff --git a/src/TLB/Makefile b/src/TLB/Makefile
deleted file mode 100644
index 1eb67acc..00000000
--- a/src/TLB/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-verilog:
-	python3 Cam.py generate -t v > Cam.v
diff --git a/src/TLB/MemorySet.py b/src/TLB/MemorySet.py
deleted file mode 100644
index ea61bdf5..00000000
--- a/src/TLB/MemorySet.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from nmigen import Cat, Memory, Module, Signal, Elaboratable
-from nmigen.cli import main
-from nmigen.cli import verilog, rtlil
-
-
-class MemorySet(Elaboratable):
-    def __init__(self, data_size, tag_size, set_count, active):
-        self.active = active
-        input_size = tag_size + data_size # Size of the input data
-        memory_width = input_size + 1 # The width of the cache memory
-        self.active = active
-        self.data_size = data_size
-        self.tag_size = tag_size
-
-        # XXX TODO, use rd-enable and wr-enable?
-        self.mem = Memory(memory_width, set_count)
-        self.r = self.mem.read_port()
-        self.w = self.mem.write_port()
-
-        # inputs (address)
-        self.cset = Signal(max=set_count)  # The set to be checked
-        self.tag = Signal(tag_size)        # The tag to find
-        self.data_i = Signal(data_size)    # Incoming data
-
-        # outputs
-        self.valid = Signal()
-        self.data_o = Signal(data_size)    # Outgoing data (excludes tag)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.mem = self.mem
-        m.submodules.r = self.r
-        m.submodules.w = self.w
-
-        # temporaries
-        active_bit = Signal()
-        tag_valid = Signal()
-        data_start = self.active + 1
-        data_end = data_start + self.data_size
-        tag_start = data_end
-        tag_end = tag_start + self.tag_size
-
-        # connect the read port address to the set/entry
-        read_port = self.r
-        m.d.comb += read_port.addr.eq(self.cset)
-        # Pull out active bit from data
-        data = read_port.data
-        m.d.comb += active_bit.eq(data[self.active])
-        # Validate given tag vs stored tag
-        tag = data[tag_start:tag_end]
-        m.d.comb += tag_valid.eq(self.tag == tag)
-        # An entry is only valid if the tags match AND
-        # is marked as a valid entry
-        m.d.comb += self.valid.eq(tag_valid & active_bit)
-
-        # output data: TODO, check rd-enable?
-        m.d.comb += self.data_o.eq(data[data_start:data_end])
-
-        # connect the write port addr to the set/entry (only if write enabled)
-        # (which is only done on a match, see SAC.write_entry below)
-        write_port = self.w
-        with m.If(write_port.en):
-            m.d.comb += write_port.addr.eq(self.cset)
-            m.d.comb += write_port.data.eq(Cat(1, self.data_i, self.tag))
-
-        return m
diff --git a/src/TLB/PermissionValidator.py b/src/TLB/PermissionValidator.py
deleted file mode 100644
index 0107c0e9..00000000
--- a/src/TLB/PermissionValidator.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from nmigen import Module, Signal, Elaboratable
-from nmigen.cli import main
-
-from TLB.PteEntry import PteEntry
-
-
-class PermissionValidator(Elaboratable):
-    """ The purpose of this Module is to check the Permissions of a given PTE
-        against the requested access permissions.
-
-        This module will either validate (by setting the valid bit HIGH)
-        the request or find a permission fault and invalidate (by setting
-        the valid bit LOW) the request
-    """
-
-    def __init__(self, asid_size, pte_size):
-        """ Arguments:
-            * asid_size: (bit count) The size of the asid to be processed
-            * pte_size: (bit count) The size of the pte to be processed
-
-            Return:
-            * valid HIGH when permissions are correct
-        """
-        # Internal
-        self.pte_entry = PteEntry(asid_size, pte_size)
-
-        # Input
-        self.data = Signal(asid_size + pte_size);
-        self.xwr = Signal(3) # Execute, Write, Read
-        self.super_mode = Signal(1) # Supervisor Mode
-        self.super_access = Signal(1) # Supervisor Access
-        self.asid = Signal(15) # Address Space IDentifier (ASID)
-
-        # Output
-        self.valid = Signal(1) # Denotes if the permissions are correct
-
-    def elaborate(self, platform=None):
-        m = Module()
-
-        m.submodules.pte_entry = self.pte_entry
-
-        m.d.comb += self.pte_entry.i.eq(self.data)
-
-        # Check if the entry is valid
-        with m.If(self.pte_entry.v):
-            # ASID match or Global Permission
-            # Note that the MSB bound is exclusive
-            with m.If((self.pte_entry.asid == self.asid) | self.pte_entry.g):
-                # Check Execute, Write, Read (XWR) Permissions
-                with m.If(self.pte_entry.xwr == self.xwr):
-                    # Supervisor Logic
-                    with m.If(self.super_mode):
-                        # Valid if entry is not in user mode or supervisor
-                        # has Supervisor User Memory (SUM) access via the
-                        # SUM bit in the sstatus register
-                        m.d.comb += self.valid.eq((~self.pte_entry.u) \
-                                                  | self.super_access)
-                    # User logic
-                    with m.Else():
-                        # Valid if the entry is in user mode only
-                        m.d.comb += self.valid.eq(self.pte_entry.u)
-                with m.Else():
-                    m.d.comb += self.valid.eq(0)
-            with m.Else():
-                m.d.comb += self.valid.eq(0)
-        with m.Else():
-            m.d.comb += self.valid.eq(0)
-        return m
diff --git a/src/TLB/PteEntry.py b/src/TLB/PteEntry.py
deleted file mode 100644
index 73ea9220..00000000
--- a/src/TLB/PteEntry.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from nmigen import Module, Signal, Elaboratable
-from nmigen.cli import main
-
-
-class PteEntry(Elaboratable):
-    """ The purpose of this Module is to  centralize the parsing of Page
-        Table Entries (PTE) into one module to prevent common mistakes
-        and duplication of code. The control bits are parsed out for
-        ease of use.
-
-        This module parses according to the standard PTE given by the
-        Volume II: RISC-V Privileged Architectures V1.10 Pg 60.
-        The Address Space IDentifier (ASID) is appended to the MSB of the input
-        and is parsed out as such.
-
-        An valid input Signal would be:
-              ASID   PTE
-        Bits:[78-64][63-0]
-
-        The output PTE value will include the control bits.
-    """
-    def __init__(self, asid_size, pte_size):
-        """ Arguments:
-            * asid_size: (bit count) The size of the asid to be processed
-            * pte_size: (bit count) The size of the pte to be processed
-
-            Return:
-            * d The Dirty bit from the PTE portion of i
-            * a The Accessed bit from the PTE portion of i
-            * g The Global bit from the PTE portion of i
-            * u The User Mode bit from the PTE portion of i
-            * xwr The Execute/Write/Read bit from the PTE portion of i
-            * v The Valid bit from the PTE portion of i
-            * asid The asid portion of i
-            * pte The pte portion of i
-        """
-        # Internal
-        self.asid_start = pte_size
-        self.asid_end = pte_size + asid_size
-
-        # Input
-        self.i = Signal(asid_size + pte_size)
-
-        # Output
-        self.d = Signal(1) # Dirty bit (From pte)
-        self.a = Signal(1) # Accessed bit (From pte)
-        self.g = Signal(1) # Global Access (From pte)
-        self.u = Signal(1) # User Mode (From pte)
-        self.xwr = Signal(3) # Execute Read Write (From pte)
-        self.v = Signal(1) # Valid (From pte)
-        self.asid = Signal(asid_size) # Associated Address Space IDentifier
-        self.pte = Signal(pte_size) # Full Page Table Entry
-
-    def elaborate(self, platform=None):
-        m = Module()
-        # Pull out all control bites from PTE
-        m.d.comb += [
-            self.d.eq(self.i[7]),
-            self.a.eq(self.i[6]),
-            self.g.eq(self.i[5]),
-            self.u.eq(self.i[4]),
-            self.xwr.eq(self.i[1:4]),
-            self.v.eq(self.i[0])
-        ]
-        m.d.comb += self.asid.eq(self.i[self.asid_start:self.asid_end])
-        m.d.comb += self.pte.eq(self.i[0:self.asid_start])
-        return m
diff --git a/src/TLB/SetAssociativeCache.py b/src/TLB/SetAssociativeCache.py
deleted file mode 100644
index 70c075da..00000000
--- a/src/TLB/SetAssociativeCache.py
+++ /dev/null
@@ -1,272 +0,0 @@
-"""
-
-Online simulator of 4-way set-associative cache:
-http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/sa4.html
-
-Python simulator of a N-way set-associative cache:
-https://github.com/vaskevich/CacheSim/blob/master/cachesim.py
-"""
-
-from nmigen import Array, Cat, Memory, Module, Signal, Mux, Elaboratable
-from nmigen.compat.genlib import fsm
-from nmigen.cli import main
-from nmigen.cli import verilog, rtlil
-
-from .AddressEncoder import AddressEncoder
-from .MemorySet import MemorySet
-
-# TODO: use a LFSR that advances continuously and picking the bottom
-# few bits from it to select which cache line to replace, instead of PLRU
-# http://bugs.libre-riscv.org/show_bug.cgi?id=71
-from .ariane.plru import PLRU
-from .LFSR import LFSR, LFSR_POLY_24
-
-SA_NA = "00" # no action (none)
-SA_RD = "01" # read
-SA_WR = "10" # write
-
-
-class SetAssociativeCache(Elaboratable):
-    """ Set Associative Cache Memory
-
-        The purpose of this module is to generate a memory cache given the
-        constraints passed in. This will create a n-way set associative cache.
-        It is expected for the SV TLB that the VMA will provide the set number
-        while the ASID provides the tag (still to be decided).
-
-    """
-    def __init__(self, tag_size, data_size, set_count, way_count, lfsr=False):
-        """ Arguments
-            * tag_size (bits): The bit count of the tag
-            * data_size (bits): The bit count of the data to be stored
-            * set_count (number): The number of sets/entries in the cache
-            * way_count (number): The number of slots a data can be stored
-                                  in one set
-            * lfsr: if set, use an LFSR for (pseudo-randomly) selecting
-                    set/entry to write to.  otherwise, use a PLRU
-        """
-        # Internals
-        self.lfsr_mode = lfsr
-        self.way_count = way_count  # The number of slots in one set
-        self.tag_size = tag_size    # The bit count of the tag
-        self.data_size = data_size  # The bit count of the data to be stored
-
-        # set up Memory array
-        self.mem_array = Array() # memory array
-        for i in range(way_count):
-            ms = MemorySet(data_size, tag_size, set_count, active=0)
-            self.mem_array.append(ms)
-
-        # Finds valid entries
-        self.encoder = AddressEncoder(way_count)
-
-        # setup PLRU or LFSR
-        if lfsr:
-            # LFSR mode
-            self.lfsr = LFSR(LFSR_POLY_24)
-        else:
-            # PLRU mode
-            self.plru = PLRU(way_count) # One block to handle plru calculations
-            self.plru_array = Array() # PLRU data on each set
-            for i in range(set_count):
-                name="plru%d" % i
-                self.plru_array.append(Signal(self.plru.TLBSZ, name=name))
-
-        # Input
-        self.enable = Signal(1)   # Whether the cache is enabled
-        self.command = Signal(2)  # 00=None, 01=Read, 10=Write (see SA_XX)
-        self.cset = Signal(max=set_count)  # The set to be checked
-        self.tag = Signal(tag_size)        # The tag to find
-        self.data_i = Signal(data_size)    # The input data
-
-        # Output
-        self.ready = Signal(1) # 0 => Processing 1 => Ready for commands
-        self.hit = Signal(1)            # Tag matched one way in the given set
-        self.multiple_hit = Signal(1)   # Tag matched many ways in the given set
-        self.data_o = Signal(data_size) # The data linked to the matched tag
-
-    def check_tags(self, m):
-        """ Validate the tags in the selected set. If one and only one
-            tag matches set its state to zero and increment all others
-            by one. We only advance to next state if a single hit is found.
-        """
-        # Vector to store way valid results
-        # A zero denotes a way is invalid
-        valid_vector = []
-        # Loop through memory to prep read/write ports and set valid_vector
-        for i in range(self.way_count):
-            valid_vector.append(self.mem_array[i].valid)
-
-        # Pass encoder the valid vector
-        m.d.comb += self.encoder.i.eq(Cat(*valid_vector))
-
-        # Only one entry should be marked
-        # This is due to already verifying the tags
-        # matched and the valid bit is high
-        with m.If(self.hit):
-            m.next = "FINISHED_READ"
-            # Pull out data from the read port
-            data = self.mem_array[self.encoder.o].data_o
-            m.d.comb += self.data_o.eq(data)
-            if not self.lfsr_mode:
-                self.access_plru(m)
-
-        # Oh no! Seal the gates! Multiple tags matched?!? kasd;ljkafdsj;k
-        with m.Elif(self.multiple_hit):
-            # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
-            m.d.comb += self.data_o.eq(0)
-
-        # No tag matches means no data
-        with m.Else():
-            # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
-            m.d.comb += self.data_o.eq(0)
-
-    def access_plru(self, m):
-        """ An entry was accessed and the plru tree must now be updated
-        """
-        # Pull out the set's entry being edited
-        plru_entry = self.plru_array[self.cset]
-        m.d.comb += [
-            # Set the plru data to the current state
-            self.plru.plru_tree.eq(plru_entry),
-            # Set that the cache was accessed
-            self.plru.lu_access_i.eq(1)
-        ]
-
-    def read(self, m):
-        """ Go through the read process of the cache.
-            This takes two cycles to complete. First it checks for a valid tag
-            and secondly it updates the LRU values.
-        """
-        with m.FSM() as fsm_read:
-            with m.State("READY"):
-                m.d.comb += self.ready.eq(0)
-                # check_tags will set the state if the conditions are met
-                self.check_tags(m)
-            with m.State("FINISHED_READ"):
-                m.next = "READY"
-                m.d.comb += self.ready.eq(1)
-                if not self.lfsr_mode:
-                    plru_tree_o = self.plru.plru_tree_o
-                    m.d.sync += self.plru_array[self.cset].eq(plru_tree_o)
-
-    def write_entry(self, m):
-        if not self.lfsr_mode:
-            m.d.comb += [# set cset (mem address) into PLRU
-                         self.plru.plru_tree.eq(self.plru_array[self.cset]),
-                         # and connect plru to encoder for write
-                         self.encoder.i.eq(self.plru.replace_en_o)
-                        ]
-            write_port = self.mem_array[self.encoder.o].w
-        else:
-            # use the LFSR to generate a random(ish) one of the mem array
-            lfsr_output = Signal(max=self.way_count)
-            lfsr_random = Signal(max=self.way_count)
-            m.d.comb += lfsr_output.eq(self.lfsr.state) # lose some bits
-            # address too big, limit to range of array
-            m.d.comb += lfsr_random.eq(Mux(lfsr_output > self.way_count,
-                                           lfsr_output - self.way_count,
-                                           lfsr_output))
-            write_port = self.mem_array[lfsr_random].w
-
-        # then if there is a match from the encoder, enable the selected write
-        with m.If(self.encoder.single_match):
-            m.d.comb += write_port.en.eq(1)
-
-    def write(self, m):
-        """ Go through the write process of the cache.
-            This takes two cycles to complete. First it writes the entry,
-            and secondly it updates the PLRU (in plru mode)
-        """
-        with m.FSM() as fsm_write:
-            with m.State("READY"):
-                m.d.comb += self.ready.eq(0)
-                self.write_entry(m)
-                m.next ="FINISHED_WRITE"
-            with m.State("FINISHED_WRITE"):
-                m.d.comb += self.ready.eq(1)
-                if not self.lfsr_mode:
-                    plru_entry = self.plru_array[self.cset]
-                    m.d.sync += plru_entry.eq(self.plru.plru_tree_o)
-                m.next = "READY"
-
-
-    def elaborate(self, platform=None):
-        m = Module()
-
-        # ----
-        # set up Modules: AddressEncoder, LFSR/PLRU, Mem Array
-        # ----
-
-        m.submodules.AddressEncoder = self.encoder
-        if self.lfsr_mode:
-            m.submodules.LFSR = self.lfsr
-        else:
-            m.submodules.PLRU = self.plru
-
-        for i, mem in enumerate(self.mem_array):
-            setattr(m.submodules, "mem%d" % i, mem)
-
-        # ----
-        # select mode: PLRU connect to encoder, LFSR do... something
-        # ----
-
-        if not self.lfsr_mode:
-            # Set what entry was hit
-            m.d.comb += self.plru.lu_hit.eq(self.encoder.o)
-        else:
-            # enable LFSR
-            m.d.comb += self.lfsr.enable.eq(self.enable)
-
-        # ----
-        # connect hit/multiple hit to encoder output
-        # ----
-
-        m.d.comb += [
-            self.hit.eq(self.encoder.single_match),
-            self.multiple_hit.eq(self.encoder.multiple_match),
-        ]
-
-        # ----
-        # connect incoming data/tag/cset(addr) to mem_array
-        # ----
-
-        for mem in self.mem_array:
-            write_port = mem.w
-            m.d.comb += [mem.cset.eq(self.cset),
-                         mem.tag.eq(self.tag),
-                         mem.data_i.eq(self.data_i),
-                         write_port.en.eq(0), # default: disable write
-                        ]
-        # ----
-        # Commands: READ/WRITE/TODO
-        # ----
-
-        with m.If(self.enable):
-            with m.Switch(self.command):
-                # Search all sets at a particular tag
-                with m.Case(SA_RD):
-                    self.read(m)
-                with m.Case(SA_WR):
-                    self.write(m)
-                    # Maybe catch multiple tags write here?
-                    # TODO
-                # TODO: invalidate/flush, flush-all?
-
-        return m
-
-    def ports(self):
-        return [self.enable, self.command, self.cset, self.tag, self.data_i,
-                self.ready, self.hit, self.multiple_hit, self.data_o]
-
-
-if __name__ == '__main__':
-    sac = SetAssociativeCache(4, 8, 4, 6)
-    vl = rtlil.convert(sac, ports=sac.ports())
-    with open("SetAssociativeCache.il", "w") as f:
-        f.write(vl)
-
-    sac_lfsr = SetAssociativeCache(4, 8, 4, 6, True)
-    vl = rtlil.convert(sac_lfsr, ports=sac_lfsr.ports())
-    with open("SetAssociativeCacheLFSR.il", "w") as f:
-        f.write(vl)
diff --git a/src/TLB/TLB.py b/src/TLB/TLB.py
deleted file mode 100644
index 98c9af72..00000000
--- a/src/TLB/TLB.py
+++ /dev/null
@@ -1,175 +0,0 @@
-""" TLB Module
-
-    The expected form of the data is:
-    * Item (Bits)
-    * Tag (N - 79) / ASID (78 - 64) / PTE (63 - 0)
-"""
-
-from nmigen import Memory, Module, Signal, Cat, Elaboratable
-from nmigen.cli import main
-
-from .PermissionValidator import PermissionValidator
-from .Cam import Cam
-
-class TLB(Elaboratable):
-    def __init__(self, asid_size, vma_size, pte_size, L1_size):
-        """ Arguments
-            * asid_size: Address Space IDentifier (ASID) typically 15 bits
-            * vma_size: Virtual Memory Address (VMA) typically 36 bits
-            * pte_size: Page Table Entry (PTE) typically 64 bits
-
-            Notes:
-            These arguments should represent the largest possible size
-            defined by the MODE settings. See
-            Volume II: RISC-V Privileged Architectures V1.10 Page 57
-        """
-
-        # Internal
-        self.state = 0
-        # L1 Cache Modules
-        self.cam_L1 = Cam(vma_size, L1_size)
-        self.mem_L1 = Memory(asid_size + pte_size, L1_size)
-
-        # Permission Validator
-        self.perm_validator = PermissionValidator(asid_size, pte_size)
-
-        # Inputs
-        self.supermode = Signal(1) # Supervisor Mode
-        self.super_access = Signal(1) # Supervisor Access
-        self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2
-        self.xwr = Signal(3) # Execute, Write, Read
-        self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64
-        self.address_L1 = Signal(max=L1_size)
-        self.asid = Signal(asid_size) # Address Space IDentifier (ASID)
-        self.vma = Signal(vma_size) # Virtual Memory Address (VMA)
-        self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE)
-
-        # Outputs
-        self.hit = Signal(1) # Denotes if the VMA had a mapped PTE
-        self.perm_valid = Signal(1) # Denotes if the permissions are correct
-        self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA
-
-    def search(self, m, read_L1, write_L1):
-        """ searches the TLB
-        """
-        m.d.comb += [
-            write_L1.en.eq(0),
-            self.cam_L1.write_enable.eq(0),
-            self.cam_L1.data_in.eq(self.vma)
-        ]
-        # Match found in L1 CAM
-        match_found = Signal(reset_less=True)
-        m.d.comb += match_found.eq(self.cam_L1.single_match
-                              | self.cam_L1.multiple_match)
-        with m.If(match_found):
-            # Memory shortcut variables
-            mem_address = self.cam_L1.match_address
-            # Memory Logic
-            m.d.comb += read_L1.addr.eq(mem_address)
-            # Permission Validator Logic
-            m.d.comb += [
-                self.hit.eq(1),
-                # Set permission validator data to the correct
-                # register file data according to CAM match
-                # address
-                self.perm_validator.data.eq(read_L1.data),
-                # Execute, Read, Write
-                self.perm_validator.xwr.eq(self.xwr),
-                # Supervisor Mode
-                self.perm_validator.super_mode.eq(self.supermode),
-                # Supverisor Access
-                self.perm_validator.super_access.eq(self.super_access),
-                # Address Space IDentifier (ASID)
-                self.perm_validator.asid.eq(self.asid),
-                # Output result of permission validation
-                self.perm_valid.eq(self.perm_validator.valid)
-            ]
-            # Only output PTE if permissions are valid
-            with m.If(self.perm_validator.valid):
-                # XXX TODO - dummy for now
-                reg_data = Signal.like(self.pte_out)
-                m.d.comb += [
-                    self.pte_out.eq(reg_data)
-                ]
-            with m.Else():
-                m.d.comb += [
-                    self.pte_out.eq(0)
-                ]
-        # Miss Logic
-        with m.Else():
-            m.d.comb += [
-                self.hit.eq(0),
-                self.perm_valid.eq(0),
-                self.pte_out.eq(0)
-            ]
-
-    def write_l1(self, m, read_L1, write_L1):
-        """ writes to the L1 cache
-        """
-        # Memory_L1 Logic
-        m.d.comb += [
-            write_L1.en.eq(1),
-            write_L1.addr.eq(self.address_L1),
-            # The Cat places arguments from LSB -> MSB
-            write_L1.data.eq(Cat(self.pte_in, self.asid))
-        ]
-        # CAM_L1 Logic
-        m.d.comb += [
-            self.cam_L1.write_enable.eq(1),
-            self.cam_L1.data_in.eq(self.vma), #data_in is sent to all entries
-            # self.cam_L1.address_in.eq(todo) # a CAM entry needs to be selected
-            
-        ]
-
-    def elaborate(self, platform):
-        m = Module()
-        # Add submodules
-        # Submodules for L1 Cache
-        m.submodules.cam_L1 = self.cam_L1
-        m.submodules.read_L1 = read_L1 = self.mem_L1.read_port()
-        m.submodules.write_L1 = write_L1 = self.mem_L1.write_port()
-        
-        # Permission Validator Submodule
-        m.submodules.perm_valididator = self.perm_validator
-
-        # When MODE specifies translation
-        # TODO add in different bit length handling ie prefix 0s
-        tlb_enable = Signal(reset_less=True)
-        m.d.comb += tlb_enable.eq(self.mode != 0)
-
-        with m.If(tlb_enable):
-            m.d.comb += [
-                self.cam_L1.enable.eq(1)
-            ]
-            with m.Switch(self.command):
-                # Search
-                with m.Case("01"):
-                    self.search(m, read_L1, write_L1)
-
-                # Write L1
-                # Expected that the miss will be handled in software
-                with m.Case("10"):
-                    self.write_l1(m, read_L1, write_L1)
-
-                # TODO
-                #with m.Case("11"):
-
-        # When disabled
-        with m.Else():
-            m.d.comb += [
-                self.cam_L1.enable.eq(0),
-                # XXX TODO - self.reg_file.enable.eq(0),
-                self.hit.eq(0),
-                self.perm_valid.eq(0), # XXX TODO, check this
-                self.pte_out.eq(0)
-            ]
-        return m
-
-
-if __name__ == '__main__':
-    tlb = TLB(15, 36, 64, 4)
-    main(tlb, ports=[ tlb.supermode, tlb.super_access, tlb.command,
-        tlb.xwr, tlb.mode, tlb.address_L1, tlb.asid,
-        tlb.vma, tlb.pte_in,
-        tlb.hit, tlb.perm_valid, tlb.pte_out,
-        ] + tlb.cam_L1.ports())
diff --git a/src/TLB/__init__.py b/src/TLB/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/TLB/ariane/TreePLRU.cpp b/src/TLB/ariane/TreePLRU.cpp
deleted file mode 100644
index 2f6aeea5..00000000
--- a/src/TLB/ariane/TreePLRU.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-#include <cstdint>
-#include <iostream>
-#include <cmath>
-
-
-#define NWAY 4
-#define NLINE 256
-#define HIT 0
-#define MISS 1
-#define MS 1000
-/*
-Detailed TreePLRU inference see here: https://docs.google.com/spreadsheets/d/14zQpPYPwDAbCCjBT_a3KLaE5FEk-RNhI8Z7Qm_biW8g/edit?usp=sharing
-Ref: https://people.cs.clemson.edu/~mark/464/p_lru.txt
-four-way set associative - three bits
-   each bit represents one branch point in a binary decision tree; let 1
-   represent that the left side has been referenced more recently than the
-   right side, and 0 vice-versa
-              are all 4 lines valid?
-                   /       \
-                 yes        no, use an invalid line
-                  |
-                  |
-                  |
-             bit_0 == 0?            state | replace      ref to | next state
-              /       \             ------+--------      -------+-----------
-             y         n             00x  |  line_0      line_0 |    11_
-            /           \            01x  |  line_1      line_1 |    10_
-     bit_1 == 0?    bit_2 == 0?      1x0  |  line_2      line_2 |    0_1
-       /    \          /    \        1x1  |  line_3      line_3 |    0_0
-      y      n        y      n
-     /        \      /        \        ('x' means       ('_' means unchanged)
-   line_0  line_1  line_2  line_3      don't care)
- 8-way set associative - 7  = 1+2+4 bits
-16-way set associative - 15 = 1+2+4+8 bits
-32-way set associative - 31 = 1+2+4+8+16 bits
-64-way set associative - 63 = 1+2+4+8+16+32 bits
-*/
-using namespace std;
-struct AddressField {
-    uint64_t wd_idx : 2;//Unused
-    uint64_t offset : 4;//Unused
-    uint64_t index  : 8;//NLINE = 256 = 2^8
-    uint64_t tag    : 50;
-};
-
-union Address {
-    uint32_t* p;
-    AddressField fields;
-};
-
-struct Cell {
-    bool v;
-    uint64_t tag;
-
-    Cell() : v(false), tag(0) {}
-
-    bool isHit(uint64_t tag) {
-        return v && (tag == this->tag);
-    }
-
-    void fetch(uint32_t* address) {
-        Address addr;
-        addr.p = address;
-        addr.fields.offset = 0;
-        addr.fields.wd_idx = 0;
-        tag = addr.fields.tag;
-        v = true;
-    }
-};
-
-ostream& operator<<(ostream & out, const Cell& cell) {
-    out << " v:" << cell.v << " tag:" << hex << cell.tag;
-    return out;
-}
-
-struct Block {
-    Cell cell[NWAY];
-    uint32_t state;
-    uint64_t *mask;//Mask the state to get accurate value for specified 1 bit.
-    uint64_t *value;
-    uint64_t *next_value;
-
-    Block() : state(0) {
-        switch (NWAY) {
-            case 4:
-                mask = new uint64_t[4]{0b110, 0b110, 0b101, 0b101};
-                value = new uint64_t[4]{0b000, 0b010, 0b100, 0b101};
-                next_value = new uint64_t[4]{0b110, 0b100, 0b001, 0b000};
-                break;
-            case 8:
-                mask = new uint64_t[8]{0b1101000, 0b1101000, 0b1100100, 0b1100100, 0b1010010, 0b1010010, 0b1010001,
-                                       0b1010001};
-                value = new uint64_t[8]{0b0000000, 0b0001000, 0b0100000, 0b0100100, 0b1000000, 0b1000010, 0b1010000,
-                                        0b1010001};
-                next_value = new uint64_t[8]{0b1101000, 0b1100000, 0b1000100, 0b1000000, 0b0010010, 0b0010000,
-                                             0b0000001, 0b0000000};
-                break;
-                //TODO - more NWAY goes here.
-            default:
-                std::cout << "Error definition NWAY = " << NWAY << std::endl;
-        }
-    }
-
-    uint32_t *getByTag(uint64_t tag, uint32_t *pway) {
-        for (int i = 0; i < NWAY; ++i) {
-            if (cell[i].isHit(tag)) {
-                *pway = i;
-                return pway;
-            }
-        }
-        return NULL;
-    }
-
-    void setLRU(uint32_t *address) {
-        int way = 0;
-        uint32_t st = state;
-        for (int i = 0; i < NWAY; ++i) {
-            if ((state & mask[i]) == value[i]) {
-                state ^= mask[i];
-                way = i;
-                break;
-            }
-        }
-        cell[way].fetch(address);
-        cout << "MISS: way:" << way << " address:" << address << " state:" << st << "->" << state << endl;
-    }
-
-    uint32_t *get(uint32_t *address, uint32_t *pway) {
-        Address addr;
-        addr.p = address;
-        uint32_t *d = getByTag(addr.fields.tag, pway);
-        if (d != NULL) {
-            return &d[addr.fields.offset];
-        }
-        return d;
-    }
-
-    int set(uint32_t *address) {
-        uint32_t way = 0;
-        uint32_t *p = get(address, &way);
-        if (p != NULL) {
-            printf("HIT: address:%p ref_to way:%d state %X --> ", address, way, state);
-            state &= ~mask[way];
-            printf("%X --> ", state);
-            state |= next_value[way];
-            printf("%X\n", state);
-            // *p = *address; //skip since address is fake.
-            return HIT;
-        } else {
-            setLRU(address);
-            return MISS;
-        }
-    }
-};
-
-ostream& operator<<(ostream & out, const Block& block) {
-    out << "state:" << block.state << " ";
-    for (int i = 0; i<NWAY; i++) {
-        out << block.cell[i];
-    }
-    return out;
-}
-
-struct Cache {
-    Block block[NLINE];
-    uint32_t count[2];
-    Cache() { count[HIT] = 0; count[MISS] = 0; }
-
-    void access(uint32_t* address) {
-        Address addr;
-        addr.p = address;
-        Block& b = block[addr.fields.index];
-        ++count[b.set(address)];
-    }
-
-};
-ostream& operator<<(ostream & out, const Cache& cache) {
-    out << "\n==Summary==\n\tHit: " << cache.count[HIT] <<  " Miss: " << cache.count[MISS] << std::endl;
-    for (int i = 0; i < NLINE; i++) {
-        out << cache.block[i] << endl;
-    }
-    return out;
-}
-
-Cache cache;
-void multiply(uint32_t* m1, uint32_t* m2, uint32_t* res)
-{
-    int x, i, j;
-    for (i = 0; i < MS; i++) {
-        for (j = 0; j < MS; j++) {
-            cache.access(res + i*MS +j);
-            for (x = 0; x < MS; x++) {
-                cache.access(m1 + i*MS + x);
-                cache.access(m2 + x*MS + j);
-                cache.access(res + i*MS +j);
-                // res[i][j] += m1[i][x] * m2[x][j];
-                cache.access(res + i*MS +j);
-            }
-        }
-    }
-}
-
-int main()
-{
-    uint32_t* m1 = (uint32_t*) 0xFACE00A000000000LL;  // fake virtual address; don’t access it
-    uint32_t* m2 = (uint32_t*) 0xFACE00B000000000LL;  // fake virtual address; don’t access it
-    uint32_t* res =  (uint32_t*) 0xFACE00C000000000LL; // fake virtual address; don’t access it
-    multiply(m1, m2, res);
-    cout << cache << endl;
-    return 0;
-}
diff --git a/src/TLB/ariane/__init__.py b/src/TLB/ariane/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/TLB/ariane/exceptcause.py b/src/TLB/ariane/exceptcause.py
deleted file mode 100644
index 4c5cb2d5..00000000
--- a/src/TLB/ariane/exceptcause.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from nmigen import Const
-
-INSTR_ADDR_MISALIGNED = Const(0, 64)
-INSTR_ACCESS_FAULT    = Const(1, 64)
-ILLEGAL_INSTR         = Const(2, 64)
-BREAKPOINT            = Const(3, 64)
-LD_ADDR_MISALIGNED    = Const(4, 64)
-LD_ACCESS_FAULT       = Const(5, 64)
-ST_ADDR_MISALIGNED    = Const(6, 64)
-ST_ACCESS_FAULT       = Const(7, 64)
-ENV_CALL_UMODE        = Const(8, 64)  # environment call from user mode
-ENV_CALL_SMODE        = Const(9, 64)  # environment call from supervisor mode
-ENV_CALL_MMODE        = Const(11, 64) # environment call from machine mode
-INSTR_PAGE_FAULT      = Const(12, 64) # Instruction page fault
-LOAD_PAGE_FAULT       = Const(13, 64) # Load page fault
-STORE_PAGE_FAULT      = Const(15, 64) # Store page fault
diff --git a/src/TLB/ariane/miss_handler.py b/src/TLB/ariane/miss_handler.py
deleted file mode 100644
index 5ddc7255..00000000
--- a/src/TLB/ariane/miss_handler.py
+++ /dev/null
@@ -1,786 +0,0 @@
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License.  You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: Florian Zaruba, ETH Zurich
-# Date: 12.11.2017
-# Description: Handles cache misses.
-from nmigen.lib.coding import Encoder, PriorityEncoder
-
-
-# --------------
-# MISS Handler
-# --------------
-import ariane_pkg::*;
-import std_cache_pkg::*;
-
-unsigned NR_PORTS         = 3
-
-class MissReq(RecordObject):
-    def __init__(self, name=None):
-        Record.__init__(self, name)
-        self.valid = Signal()
-        self.addr = Signal(64)
-        self.be = Signal(8)
-        self.size = Signal(2)
-        self.we = Signal()
-        self.wdata = Signal(64)
-        bypass = Signal()
-
-class CacheLine:
-    def __init__(self):
-        self.tag = Signal(DCACHE_TAG_WIDTH) # tag array
-        self.data = Signal(DCACHE_LINE_WIDTH) # data array
-        self.valid = Signal() # state array
-        self.dirty = Signal() # state array
-
-# cache line byte enable
-class CLBE:
-    def __init__(self):
-        self.tag = Signal(DCACHE_TAG_WIDTH+7)//8) # byte enable into tag array
-        self.data = Signal(DCACHE_LINE_WIDTH+7)//8) # byte enable data array
-        # bit enable into state array (valid for a pair of dirty/valid bits)
-        self.vldrty = Signal(DCACHE_SET_ASSOC)
-    } cl_be_t;
-
-
-
-    # FSM states
-"""
-    enum logic [3:0] {
-        IDLE,               # 0
-        FLUSHING,           # 1
-        FLUSH,              # 2
-        WB_CACHELINE_FLUSH, # 3
-        FLUSH_REQ_STATUS,   # 4
-        WB_CACHELINE_MISS,  # 5
-        WAIT_GNT_SRAM,      # 6
-        MISS,               # 7
-        REQ_CACHELINE,      # 8
-        MISS_REPL,          # 9
-        SAVE_CACHELINE,     # A
-        INIT,               # B
-        AMO_LOAD,           # C
-        AMO_SAVE_LOAD,      # D
-        AMO_STORE           # E
-    } state_d, state_q;
-"""
-
-class MissHandler(Elaboratable):
-    def __init__(self, NR_PORTS):
-        self.NR_PORTS = NR_PORTS
-        self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
-        self.flush_i = Signal()      # flush request
-        self.flush_ack_o = Signal()  # acknowledge successful flush
-        self.miss_o = Signal()
-        self.busy_i = Signal()       # dcache is busy with something
-
-        # Bypass or miss
-        self.miss_req_i = Array(MissReq(name="missreq") for i in range(NR_PORTS))
-        # Bypass handling
-        self.bypass_gnt_o = Signal(NR_PORTS)
-        self.bypass_valid_o = Signal(NR_PORTS)
-        self.bypass_data_o = Array(Signal(name="bdata_o", 64) \
-                                    for i in range(NR_PORTS))
-
-        # AXI port
-        output ariane_axi::req_t                            axi_bypass_o,
-        input  ariane_axi::resp_t                           axi_bypass_i,
-
-        # Miss handling (~> cacheline refill)
-        self.miss_gnt_o = Signal(NR_PORTS)
-        self.active_serving_o = Signal(NR_PORTS)
-
-        self.critical_word_o = Signal(64)
-        self.critical_word_valid_o = Signal()
-        output ariane_axi::req_t                            axi_data_o,
-        input  ariane_axi::resp_t                           axi_data_i,
-
-        self.mshr_addr_i = Array(Signal(name="bdata_o", 56) \
-                                    for i in range(NR_PORTS))
-        self.mshr_addr_matches_o = Signal(NR_PORTS)
-        self.mshr_index_matches_o = Signal(NR_PORTS)
-
-        # AMO
-        self.amo_req_i = AMOReq()
-        self.amo_resp_o = AMOResp()
-        # Port to SRAMs, for refill and eviction
-        self.req_o = Signal(DCACHE_SET_ASSOC)
-        self.addr_o = Signal(DCACHE_INDEX_WIDTH) # address into cache array
-        self.data_o = CacheLine()
-        self.be_o = CLBE()
-        self.data_i = Array(CacheLine() \
-                                    for i in range(DCACHE_SET_ASSOC))
-        self.we_o = Signal()
-
-    def elaborate(self, platform):
-        # Registers
-        mshr_t                                  mshr_d, mshr_q;
-        logic [DCACHE_INDEX_WIDTH-1:0]          cnt_d, cnt_q;
-        logic [DCACHE_SET_ASSOC-1:0]            evict_way_d, evict_way_q;
-        # cache line to evict
-        cache_line_t                            evict_cl_d, evict_cl_q;
-
-        logic serve_amo_d, serve_amo_q;
-        # Request from one FSM
-        miss_req_valid = Signal(self.NR_PORTS)
-        miss_req_bypass = Signal(self.NR_PORTS)
-        miss_req_addr = Array(Signal(name="miss_req_addr", 64) \
-                                    for i in range(NR_PORTS))
-        miss_req_wdata = Array(Signal(name="miss_req_wdata", 64) \
-                                    for i in range(NR_PORTS))
-        miss_req_we = Signal(self.NR_PORTS)
-        miss_req_be = Array(Signal(name="miss_req_be", 8) \
-                                    for i in range(NR_PORTS))
-        miss_req_size = Array(Signal(name="miss_req_size", 2) \
-                                    for i in range(NR_PORTS))
-
-        # Cache Line Refill <-> AXI
-        req_fsm_miss_valid = Signal()
-        req_fsm_miss_addr = Signal(64)
-        req_fsm_miss_wdata = Signal(DCACHE_LINE_WIDTH)
-        req_fsm_miss_we = Signal()
-        req_fsm_miss_be = Signal(DCACHE_LINE_WIDTH//8)
-        ariane_axi::ad_req_t                     req_fsm_miss_req;
-        req_fsm_miss_size = Signal(2)
-
-        gnt_miss_fsm = Signal()
-        valid_miss_fsm = Signal()
-        nmiss = DCACHE_LINE_WIDTH//64
-        data_miss_fsm = Array(Signal(name="data_miss_fsm", 64) \
-                                    for i in range(nmiss))
-
-        # Cache Management <-> LFSR
-        lfsr_enable = Signal()
-        lfsr_oh = Signal(DCACHE_SET_ASSOC)
-        lfsr_bin = Signal($clog2(DCACHE_SET_ASSOC-1))
-        # AMOs
-        ariane_pkg::amo_t amo_op;
-        amo_operand_a = Signal(64)
-        amo_operand_b = Signal(64)
-        amo_result_o = Signal(64)
-
-        struct packed {
-            logic [63:3] address;
-            logic        valid;
-        } reservation_d, reservation_q;
-
-        # ------------------------------
-        # Cache Management
-        # ------------------------------
-        evict_way = Signal(DCACHE_SET_ASSOC)
-        valid_way = Signal(DCACHE_SET_ASSOC)
-
-        for (i in range(DCACHE_SET_ASSOC):
-            comb += evict_way[i].eq(data_i[i].valid & data_i[i].dirty)
-            comb += valid_way[i].eq(data_i[i].valid)
-
-        # ----------------------
-        # Default Assignments
-        # ----------------------
-        # to AXI refill
-        req_fsm_miss_req    = ariane_axi::CACHE_LINE_REQ;
-        req_fsm_miss_size   = Const(0b11, 2)
-        # core
-        serve_amo_d         = serve_amo_q;
-        # --------------------------------
-        # Flush and Miss operation
-        # --------------------------------
-        state_d      = state_q;
-        cnt_d        = cnt_q;
-        evict_way_d  = evict_way_q;
-        evict_cl_d   = evict_cl_q;
-        mshr_d       = mshr_q;
-        # communicate to the requester which unit we are currently serving
-        active_serving_o[mshr_q.id] = mshr_q.valid;
-        # AMOs
-        # silence the unit when not used
-        amo_op = amo_req_i.amo_op;
-
-        reservation_d = reservation_q;
-        with m.FSM() as state_q:
-
-            with m.Case("IDLE"):
-                # lowest priority are AMOs, wait until everything else
-                # is served before going for the AMOs
-                with m.If (amo_req_i.req & ~busy_i):
-                    # 1. Flush the cache
-                    with m.If(~serve_amo_q):
-                        m.next = "FLUSH_REQ_STATUS"
-                        serve_amo_d.eq(0b1
-                        cnt_d.eq(0
-                    # 2. Do the AMO
-                    with m.Else():
-                        m.next = "AMO_LOAD"
-                        serve_amo_d.eq(0b0
-
-                # check if we want to flush and can flush
-                # e.g.: we are not busy anymore
-                # TODO: Check that the busy flag is indeed needed
-                with m.If (flush_i & ~busy_i):
-                    m.next = "FLUSH_REQ_STATUS"
-                    cnt_d = 0
-
-                # check if one of the state machines missed
-                for i in range(NR_PORTS):
-                    # here comes the refill portion of code
-                    with m.If (miss_req_valid[i] & ~miss_req_bypass[i]):
-                        m.next = "MISS"
-                        # we are taking another request so don't
-                        # take the AMO
-                        serve_amo_d  = 0b0;
-                        # save to MSHR
-                        wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
-                        comb += [ mshr_d.valid.eq(0b1),
-                                  mshr_d.we.eq(miss_req_we[i]),
-                                  mshr_d.id.eq(i),
-                                  mshr_d.addr.eq(miss_req_addr[i][0:wid]),
-                                  mshr_d.wdata.eq(miss_req_wdata[i]),
-                                  mshr_d.be.eq(miss_req_be[i]),
-                                ]
-                        break
-
-            #  ~> we missed on the cache
-            with m.Case("MISS"):
-                # 1. Check if there is an empty cache-line
-                # 2. If not -> evict one
-                comb += req_o.eq(1)
-                sync += addr_o.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH]
-                m.next = "MISS_REPL"
-                comb += miss_o.eq(1)
-
-            # ~> second miss cycle
-            with m.Case("MISS_REPL"):
-                # if all are valid we need to evict one, 
-                # pseudo random from LFSR
-                with m.If(~(~valid_way).bool()):
-                    comb += lfsr_enable.eq(0b1)
-                    comb += evict_way_d.eq(lfsr_oh)
-                    # do we need to write back the cache line?
-                    with m.If(data_i[lfsr_bin].dirty):
-                        state_d = WB_CACHELINE_MISS;
-                        comb += evict_cl_d.tag.eq(data_i[lfsr_bin].tag)
-                        comb += evict_cl_d.data.eq(data_i[lfsr_bin].data)
-                        comb += cnt_d.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
-                    # no - we can request a cache line now
-                    with m.Else():
-                        m.next = "REQ_CACHELINE"
-                # we have at least one free way
-                with m.Else():
-                    # get victim cache-line by looking for the
-                    # first non-valid bit
-                    comb += evict_way_d.eq(get_victim_cl(~valid_way)
-                    m.next = "REQ_CACHELINE"
-
-            # ~> we can just load the cache-line,
-            # the way is store in evict_way_q
-            with m.Case("REQ_CACHELINE"):
-                comb += req_fsm_miss_valid .eq(1)
-                sync += req_fsm_miss_addr  .eq(mshr_q.addr)
-
-                with m.If (gnt_miss_fsm):
-                    m.next = "SAVE_CACHELINE"
-                    comb += miss_gnt_o[mshr_q.id].eq(1)
-
-            # ~> replace the cacheline
-            with m.Case("SAVE_CACHELINE"):
-                # calculate cacheline offset
-                automatic logic [$clog2(DCACHE_LINE_WIDTH)-1:0] cl_offset;
-                sync += cl_offset.eq(mshr_q.addr[3:DCACHE_BYTE_OFFSET] << 6)
-                # we've got a valid response from refill unit
-                with m.If (valid_miss_fsm):
-                    wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
-                    sync += addr_o      .eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
-                    sync += req_o       .eq(evict_way_q)
-                    comb += we_o        .eq(1)
-                    comb += be_o        .eq(1)
-                    sync += be_o.vldrty .eq(evict_way_q)
-                    sync += data_o.tag  .eq(mshr_q.addr[DCACHE_INDEX_WIDTH:wid]
-                    comb += data_o.data .eq(data_miss_fsm)
-                    comb += data_o.valid.eq(1)
-                    comb += data_o.dirty.eq(0)
-
-                    # is this a write?
-                    with m.If (mshr_q.we):
-                        # Yes, so safe the updated data now
-                        for i in range(8):
-                            # check if we really want to write
-                            # the corresponding byte
-                            with m.If (mshr_q.be[i]):
-                                sync += data_o.data[(cl_offset + i*8) +: 8].eq(mshr_q.wdata[i];
-                        # it's immediately dirty if we write
-                        comb += data_o.dirty.eq(1)
-
-                    # reset MSHR
-                    comb += mshr_d.valid.eq(0)
-                    # go back to idle
-                    m.next = 'IDLE'
-
-            # ------------------------------
-            # Write Back Operation
-            # ------------------------------
-            # ~> evict a cache line from way saved in evict_way_q
-            with m.Case("WB_CACHELINE_FLUSH"):
-            with m.Case("WB_CACHELINE_MISS"):
-
-                comb += req_fsm_miss_valid .eq(0b1)
-                sync += req_fsm_miss_addr  .eq({evict_cl_q.tag, cnt_q[DCACHE_INDEX_WIDTH-1:DCACHE_BYTE_OFFSET], {{DCACHE_BYTE_OFFSET}{0b0}}};
-                comb += req_fsm_miss_be    .eq(1)
-                comb += req_fsm_miss_we    .eq(0b1)
-                sync += req_fsm_miss_wdata .eq(evict_cl_q.data;
-
-                # we've got a grant --> this is timing critical, think about it
-                if (gnt_miss_fsm) begin
-                    # write status array
-                    sync += addr_o    .eq(cnt_q)
-                    comb += req_o     .eq(0b1)
-                    comb += we_o      .eq(0b1)
-                    comb += data_o.valid.eq(INVALIDATE_ON_FLUSH ? 0b0 : 0b1)
-                    # invalidate
-                    sync += be_o.vldrty.eq(evict_way_q)
-                    # go back to handling the miss or flushing,
-                    # depending on where we came from
-                    with m.If(state_q == WB_CACHELINE_MISS):
-                        m.next = "MISS"
-                    with m.Else():
-                        m.next = "FLUSH_REQ_STATUS"
-
-            # ------------------------------
-            # Flushing & Initialization
-            # ------------------------------
-            # ~> make another request to check the same
-            # cache-line if there are still some valid entries
-            with m.Case("FLUSH_REQ_STATUS"):
-                comb += req_o  .eq(1)
-                sync += addr_o .eq(cnt_q)
-                m.next = "FLUSHING"
-
-            with m.Case("FLUSHING"):
-                # this has priority
-                # at least one of the cache lines is dirty
-                with m.If(~evict_way):
-                    # evict cache line, look for the first
-                    # cache-line which is dirty
-                    comb += evict_way_d.eq(get_victim_cl(evict_way))
-                    comb += evict_cl_d .eq(data_i[one_hot_to_bin(evict_way)])
-                    state_d     = WB_CACHELINE_FLUSH;
-                # not dirty ~> increment and continue
-                with m.Else():
-                    # increment and re-request
-                    sync += cnt_d.eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
-                    m.next = "FLUSH_REQ_STATUS"
-                    sync += addr_o     .eq(cnt_q)
-                    comb += req_o      .eq(1)
-                    comb += be_o.vldrty.eq(INVALIDATE_ON_FLUSH ? 1 : 0)
-                    comb += we_o       .eq(1)
-                    # finished with flushing operation, go back to idle
-                    with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
-                               == DCACHE_NUM_WORDS-1):
-                        # only acknowledge if the flush wasn't
-                        # triggered by an atomic
-                        sync += flush_ack_o.eq(~serve_amo_q)
-                        m.next = "IDLE"
-
-            # ~> only called after reset
-            with m.Case("INIT"):
-                # initialize status array
-                sync += addr_o.eq(cnt_q)
-                comb += req_o .eq(1)
-                comb += we_o  .eq(1)
-                # only write the dirty array
-                comb += be_o.vldrty.eq(1)
-                sync += cnt_d      .eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
-                # finished initialization
-                with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
-                            == DCACHE_NUM_WORDS-1)
-                    m.next = "IDLE"
-
-            # ----------------------
-            # AMOs
-            # ----------------------
-            # TODO(zarubaf) Move this closer to memory
-            # ~> we are here because we need to do the AMO,
-            # the cache is clean at this point
-            # start by executing the load
-            with m.Case("AMO_LOAD"):
-                comb += req_fsm_miss_valid.eq(1)
-                # address is in operand a
-                comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
-                comb += req_fsm_miss_req.eq(ariane_axi::SINGLE_REQ)
-                comb += req_fsm_miss_size.eq(amo_req_i.size)
-                # the request has been granted
-                with m.If(gnt_miss_fsm):
-                    m.next = "AMO_SAVE_LOAD"
-            # save the load value
-            with m.Case("AMO_SAVE_LOAD"):
-                with m.If (valid_miss_fsm):
-                    # we are only concerned about the lower 64-bit
-                    comb += mshr_d.wdata.eq(data_miss_fsm[0])
-                    m.next = "AMO_STORE"
-            # and do the store
-            with m.Case("AMO_STORE"):
-                load_data = Signal(64)
-                # re-align load data
-                comb += load_data.eq(data_align(amo_req_i.operand_a[:3],
-                                                mshr_q.wdata))
-                # Sign-extend for word operation
-                with m.If (amo_req_i.size == 0b10):
-                    comb += amo_operand_a.eq(sext32(load_data[:32]))
-                    comb += amo_operand_b.eq(sext32(amo_req_i.operand_b[:32]))
-                with m.Else():
-                    comb += amo_operand_a.eq(load_data)
-                    comb += amo_operand_b.eq(amo_req_i.operand_b)
-
-                #  we do not need a store request for load reserved
-                # or a failing store conditional
-                #  we can bail-out without making any further requests
-                with m.If ((amo_req_i.amo_op == AMO_LR) | \
-                           ((amo_req_i.amo_op == AMO_SC) & \
-                           ((reservation_q.valid & \
-                            (reservation_q.address != \
-                             amo_req_i.operand_a[3:64])) | \
-                             ~reservation_q.valid))):
-                    comb += req_fsm_miss_valid.eq(0)
-                    m.next = "IDLE"
-                    comb += amo_resp_o.ack.eq(1)
-                    # write-back the result
-                    comb += amo_resp_o.result.eq(amo_operand_a)
-                    # we know that the SC failed
-                    with m.If (amo_req_i.amo_op == AMO_SC):
-                        comb += amo_resp_o.result.eq(1)
-                        # also clear the reservation
-                        comb += reservation_d.valid.eq(0)
-                with m.Else():
-                    comb += req_fsm_miss_valid.eq(1)
-
-                comb += req_fsm_miss_we  .eq(1)
-                comb += req_fsm_miss_req .eq(ariane_axi::SINGLE_REQ)
-                comb += req_fsm_miss_size.eq(amo_req_i.size)
-                comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
-
-                comb += req_fsm_miss_wdata.eq(
-                    data_align(amo_req_i.operand_a[0:3], amo_result_o))
-                comb += req_fsm_miss_be.eq(
-                    be_gen(amo_req_i.operand_a[0:3], amo_req_i.size))
-
-                # place a reservation on the memory
-                with m.If (amo_req_i.amo_op == AMO_LR):
-                    comb += reservation_d.address.eq(amo_req_i.operand_a[3:64])
-                    comb += reservation_d.valid.eq(1)
-
-                # the request is valid or we didn't need to go for another store
-                with m.If (valid_miss_fsm):
-                    m.next = "IDLE"
-                    comb += amo_resp_o.ack.eq(1)
-                    # write-back the result
-                    comb += amo_resp_o.result.eq(amo_operand_a;
-
-                    if (amo_req_i.amo_op == AMO_SC) begin
-                        comb += amo_resp_o.result.eq(0)
-                        # An SC must fail if there is another SC
-                        # (to any address) between the LR and the SC in
-                        # program order (even to the same address).
-                        # in any case destroy the reservation
-                        comb += reservation_d.valid.eq(0)
-
-        # check MSHR for aliasing
-
-        comb += mshr_addr_matches_o .eq(0)
-        comb += mshr_index_matches_o.eq()
-
-        for i in range(NR_PORTS):
-            # check mshr for potential matching of other units,
-            # exclude the unit currently being served
-            with m.If (mshr_q.valid & \
-                    (mshr_addr_i[i][DCACHE_BYTE_OFFSET:56] == \
-                     mshr_q.addr[DCACHE_BYTE_OFFSET:56])):
-                comb += mshr_addr_matches_o[i].eq(1)
-
-            # same as previous, but checking only the index
-            with m.If (mshr_q.valid & \
-                    (mshr_addr_i[i][DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] == \
-                     mshr_q.addr[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH])):
-                mshr_index_matches_o[i].eq(1)
-
-        # --------------------
-        # Sequential Process
-        # --------------------
-
-        """
-        #pragma translate_off
-        `ifndef VERILATOR
-        # assert that cache only hits on one way
-        assert property (
-          @(posedge clk_i) $onehot0(evict_way_q)) else $warning("Evict-way should be one-hot encoded");
-        `endif
-        #pragma translate_on
-        """
-
-        # ----------------------
-        # Bypass Arbiter
-        # ----------------------
-        # Connection Arbiter <-> AXI
-        req_fsm_bypass_valid = Signal()
-        req_fsm_bypass_addr = Signal(64)
-        req_fsm_bypass_wdata = Signal(64)
-        req_fsm_bypass_we = Signal()
-        req_fsm_bypass_be = Signal(8)
-        req_fsm_bypass_size = Signal(2)
-        gnt_bypass_fsm = Signal()
-        valid_bypass_fsm = Signal()
-        data_bypass_fsm = Signal(64)
-        logic [$clog2(NR_PORTS)-1:0] id_fsm_bypass;
-        logic [3:0]                  id_bypass_fsm;
-        logic [3:0]                  gnt_id_bypass_fsm;
-
-        i_bypass_arbiter = ib = AXIArbiter( NR_PORTS, 64)
-        comb += [
-            # Master Side
-            ib.data_req_i     .eq( miss_req_valid & miss_req_bypass         ),
-            ib.address_i      .eq( miss_req_addr                            ),
-            ib.data_wdata_i   .eq( miss_req_wdata                           ),
-            ib.data_we_i      .eq( miss_req_we                              ),
-            ib.data_be_i      .eq( miss_req_be                              ),
-            ib.data_size_i    .eq( miss_req_size                            ),
-            ib.data_gnt_o     .eq( bypass_gnt_o                             ),
-            ib.data_rvalid_o  .eq( bypass_valid_o                           ),
-            ib.data_rdata_o   .eq( bypass_data_o                            ),
-            # Slave Sid
-            ib.id_i           .eq( id_bypass_fsm[$clog2(NR_PORTS)-1:0]      ),
-            ib.id_o           .eq( id_fsm_bypass                            ),
-            ib.gnt_id_i       .eq( gnt_id_bypass_fsm[$clog2(NR_PORTS)-1:0]  ),
-            ib.address_o      .eq( req_fsm_bypass_addr                      ),
-            ib.data_wdata_o   .eq( req_fsm_bypass_wdata                     ),
-            ib.data_req_o     .eq( req_fsm_bypass_valid                     ),
-            ib.data_we_o      .eq( req_fsm_bypass_we                        ),
-            ib.data_be_o      .eq( req_fsm_bypass_be                        ),
-            ib.data_size_o    .eq( req_fsm_bypass_size                      ),
-            ib.data_gnt_i     .eq( gnt_bypass_fsm                           ),
-            ib.data_rvalid_i  .eq( valid_bypass_fsm                         ),
-            ib.data_rdata_i   .eq( data_bypass_fsm                          ),
-        ]
-
-        axi_adapter #(
-            .DATA_WIDTH            ( 64                 ),
-            .AXI_ID_WIDTH          ( 4                  ),
-            .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
-        ) i_bypass_axi_adapter (
-            .clk_i,
-            .rst_ni,
-            .req_i                 ( req_fsm_bypass_valid   ),
-            .type_i                ( ariane_axi::SINGLE_REQ ),
-            .gnt_o                 ( gnt_bypass_fsm         ),
-            .addr_i                ( req_fsm_bypass_addr    ),
-            .we_i                  ( req_fsm_bypass_we      ),
-            .wdata_i               ( req_fsm_bypass_wdata   ),
-            .be_i                  ( req_fsm_bypass_be      ),
-            .size_i                ( req_fsm_bypass_size    ),
-            .id_i                  ( Cat(id_fsm_bypass, 0, 0) ),
-            .valid_o               ( valid_bypass_fsm       ),
-            .rdata_o               ( data_bypass_fsm        ),
-            .gnt_id_o              ( gnt_id_bypass_fsm      ),
-            .id_o                  ( id_bypass_fsm          ),
-            .critical_word_o       (                        ), # not used for single requests
-            .critical_word_valid_o (                        ), # not used for single requests
-            .axi_req_o             ( axi_bypass_o           ),
-            .axi_resp_i            ( axi_bypass_i           )
-        );
-
-        # ----------------------
-        # Cache Line AXI Refill
-        # ----------------------
-        axi_adapter  #(
-            .DATA_WIDTH            ( DCACHE_LINE_WIDTH  ),
-            .AXI_ID_WIDTH          ( 4                  ),
-            .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
-        ) i_miss_axi_adapter (
-            .clk_i,
-            .rst_ni,
-            .req_i               ( req_fsm_miss_valid ),
-            .type_i              ( req_fsm_miss_req   ),
-            .gnt_o               ( gnt_miss_fsm       ),
-            .addr_i              ( req_fsm_miss_addr  ),
-            .we_i                ( req_fsm_miss_we    ),
-            .wdata_i             ( req_fsm_miss_wdata ),
-            .be_i                ( req_fsm_miss_be    ),
-            .size_i              ( req_fsm_miss_size  ),
-            .id_i                ( Const(0b1100, 4)   ),
-            .gnt_id_o            (                    ), # open
-            .valid_o             ( valid_miss_fsm     ),
-            .rdata_o             ( data_miss_fsm      ),
-            .id_o                (                    ),
-            .critical_word_o,
-            .critical_word_valid_o,
-            .axi_req_o           ( axi_data_o         ),
-            .axi_resp_i          ( axi_data_i         )
-        );
-
-        # -----------------
-        # Replacement LFSR
-        # -----------------
-        lfsr_8bit #(.WIDTH (DCACHE_SET_ASSOC)) i_lfsr (
-            .en_i           ( lfsr_enable ),
-            .refill_way_oh  ( lfsr_oh     ),
-            .refill_way_bin ( lfsr_bin    ),
-            .*
-        );
-
-        # -----------------
-        # AMO ALU
-        # -----------------
-        amo_alu i_amo_alu (
-            .amo_op_i        ( amo_op        ),
-            .amo_operand_a_i ( amo_operand_a ),
-            .amo_operand_b_i ( amo_operand_b ),
-            .amo_result_o    ( amo_result_o  )
-        );
-
-        # -----------------
-        # Struct Split
-        # -----------------
-
-        for i in range(NR_PORTS):
-            miss_req = MissReq()
-            comb += miss_req.eq(miss_req_i[i]);
-            comb += miss_req_valid  [i] .eq(miss_req.valid)
-            comb += miss_req_bypass [i] .eq(miss_req.bypass)
-            comb += miss_req_addr   [i] .eq(miss_req.addr)
-            comb += miss_req_wdata  [i] .eq(miss_req.wdata)
-            comb += miss_req_we     [i] .eq(miss_req.we)
-            comb += miss_req_be     [i] .eq(miss_req.be)
-            comb += miss_req_size   [i] .eq(miss_req.size)
-
-    # --------------
-    # AXI Arbiter
-    # --------------s
-    #
-    # Description: Arbitrates access to AXI refill/bypass
-    #
-class AXIArbiter:
-    def __init__(self, NR_PORTS   = 3, DATA_WIDTH = 64):
-        self.NR_PORTS = NR_PORTS
-        self.DATA_WIDTH = DATA_WIDTH
-        self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
-        rst_ni = ResetSignal() # Asynchronous reset active low
-        # master ports
-        self.data_req_i = Signal(NR_PORTS)
-        self.address_i = Array(Signal(name="address_i", 64) \
-                                    for i in range(NR_PORTS))
-        self.data_wdata_i = Array(Signal(name="data_wdata_i", 64) \
-                                    for i in range(NR_PORTS))
-        self.data_we_i = Signal(NR_PORTS)
-        self.data_be_i = Array(Signal(name="data_wdata_i", DATA_WIDTH/8) \
-                                    for i in range(NR_PORTS))
-        self.data_size_i = Array(Signal(name="data_size_i", 2) \
-                                    for i in range(NR_PORTS))
-        self.data_gnt_o = Signal(NR_PORTS)
-        self.data_rvalid_o = Signal(NR_PORTS)
-        self.data_rdata_o = Array(Signal(name="data_rdata_o", 64) \
-                                    for i in range(NR_PORTS))
-
-        # slave port
-        self.id_i = Signal(pwid)
-        self.id_o = Signal(pwid)
-        self.gnt_id_i = Signal(pwid)
-        self.data_req_o = Signal()
-        self.address_o = Signal(64)
-        self.data_wdata_o = Signal(DATA_WIDTH)
-        self.data_we_o = Signal()
-        self.data_be_o = Signal(DATA_WIDTH/8)
-        self.data_size_o = Signal(2)
-        self.data_gnt_i = Signal()
-        self.data_rvalid_i = Signal()
-        self.data_rdata_i = Signal(DATA_WIDTH)
-
-    def elaborate(self, platform):
-        #enum logic [1:0] { IDLE, REQ, SERVING } state_d, state_q;
-
-        class Packet:
-            def __init__(self, pwid, DATA_WIDTH):
-                self.id = Signal(pwid)
-                self.address = Signal(64)
-                self.data = Signal(64)
-                self.size = Signal(2)
-                self.be = Signal(DATA_WIDTH/8)
-                self.we = Signal()
-
-        request_index = Signal(self.pwid)
-        req_q = Packet(self.pwid, self.DATA_WIDTH)
-        req_d = Packet(self.pwid, self.DATA_WIDTH)
-
-        # request register
-        sync += req_q.eq(req_d)
-
-        # request port
-        comb += self.address_o             .eq(req_q.address)
-        comb += self.data_wdata_o          .eq(req_q.data)
-        comb += self.data_be_o             .eq(req_q.be)
-        comb += self.data_size_o           .eq(req_q.size)
-        comb += self.data_we_o             .eq(req_q.we)
-        comb += self.id_o                  .eq(req_q.id)
-        comb += self.data_gnt_o            .eq(0)
-        # read port
-        comb += self.data_rvalid_o         .eq(0)
-        comb += self.data_rdata_o          .eq(0)
-        comb += self.data_rdata_o[req_q.id].eq(data_rdata_i)
-
-        m.submodules.pp = pp = PriorityEncoder(self.NR_PORTS)
-        comb += pp.i.eq(self.data_req_i) # select one request (priority-based)
-        comb += request_index.eq(pp.o)
-
-        with m.Switch("state") as s:
-
-            with m.Case("IDLE"):
-                # wait for incoming requests (priority encoder data_req_i)
-                with m.If(~pp.n): # one output valid from encoder
-                    comb += self.data_req_o   .eq(self.data_req_i[i])
-                    comb += self.data_gnt_o[i].eq(self.data_req_i[i])
-                    # save the request
-                    comb += req_d.address.eq(self.address_i[i])
-                    comb += req_d.id.eq(request_index)
-                    comb += req_d.data.eq(self.data_wdata_i[i])
-                    comb += req_d.size.eq(self.data_size_i[i])
-                    comb += req_d.be.eq(self.data_be_i[i])
-                    comb += req_d.we.eq(self.data_we_i[i])
-                    m.next = "SERVING"
-
-                comb += self.address_o    .eq(self.address_i[request_index])
-                comb += self.data_wdata_o .eq(self.data_wdata_i[request_index])
-                comb += self.data_be_o    .eq(self.data_be_i[request_index])
-                comb += self.data_size_o  .eq(self.data_size_i[request_index])
-                comb += self.data_we_o    .eq(self.data_we_i[request_index])
-                comb += self.id_o         .eq(request_index)
-
-            with m.Case("SERVING"):
-                comb += self.data_req_o.eq(1)
-                with m.If (self.data_rvalid_i):
-                    comb += self.data_rvalid_o[req_q.id].eq(1)
-                    m.next = "IDLE"
-
-        # ------------
-        # Assertions
-        # ------------
-
-        """
-#pragma translate_off
-`ifndef VERILATOR
-# make sure that we eventually get an rvalid after we received a grant
-assert property (@(posedge clk_i) data_gnt_i |-> ##[1:$] data_rvalid_i )
-    else begin $error("There was a grant without a rvalid"); $stop(); end
-# assert that there is no grant without a request
-assert property (@(negedge clk_i) data_gnt_i |-> data_req_o)
-    else begin $error("There was a grant without a request."); $stop(); end
-# assert that the address does not contain X when request is sent
-assert property ( @(posedge clk_i) (data_req_o) |-> (!$isunknown(address_o)) )
-  else begin $error("address contains X when request is set"); $stop(); end
-
-`endif
-#pragma translate_on
-        """
-
diff --git a/src/TLB/ariane/mmu.py b/src/TLB/ariane/mmu.py
deleted file mode 100644
index a14862cd..00000000
--- a/src/TLB/ariane/mmu.py
+++ /dev/null
@@ -1,474 +0,0 @@
-"""
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License.  You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: Florian Zaruba, ETH Zurich
-# Date: 19/04/2017
-# Description: Memory Management Unit for Ariane, contains TLB and
-#              address translation unit. SV48 as defined in
-#              Volume II: RISC-V Privileged Architectures V1.10 Page 63
-
-import ariane_pkg::*;
-"""
-
-from nmigen import Const, Signal, Cat, Module, Mux
-from nmigen.cli import verilog, rtlil
-
-from ptw import DCacheReqI, DCacheReqO, TLBUpdate, PTE, PTW
-from tlb import TLB
-from exceptcause import (INSTR_ACCESS_FAULT, INSTR_PAGE_FAULT,
-                         LOAD_PAGE_FAULT, STORE_PAGE_FAULT)
-
-PRIV_LVL_M = Const(0b11, 2)
-PRIV_LVL_S = Const(0b01, 2)
-PRIV_LVL_U = Const(0b00, 2)
-
-
-class RVException:
-    def __init__(self):
-         self.cause = Signal(64) # cause of exception
-         self.tval = Signal(64) # more info of causing exception
-                                # (e.g.: instruction causing it),
-                                #        address of LD/ST fault
-         self.valid = Signal()
-
-    def eq(self, inp):
-        res = []
-        for (o, i) in zip(self.ports(), inp.ports()):
-            res.append(o.eq(i))
-        return res
-
-    def __iter__(self):
-        yield self.cause
-        yield self.tval
-        yield self.valid
-
-    def ports(self):
-        return list(self)
-
-
-class ICacheReqI:
-    def __init__(self):
-        self.fetch_valid = Signal()   # address translation valid
-        self.fetch_paddr = Signal(64) # physical address in
-        self.fetch_exception = RVException() # exception occurred during fetch
-
-    def __iter__(self):
-        yield self.fetch_valid
-        yield self.fetch_paddr
-        yield from self.fetch_exception
-
-    def ports(self):
-        return list(self)
-
-
-class ICacheReqO:
-    def __init__(self):
-        self.fetch_req = Signal()     # address translation request
-        self.fetch_vaddr = Signal(64) # virtual address out
-
-    def __iter__(self):
-        yield self.fetch_req
-        yield self.fetch_vaddr
-
-    def ports(self):
-        return list(self)
-
-
-class MMU:
-    def __init__(self, instr_tlb_entries = 4,
-                       data_tlb_entries  = 4,
-                       asid_width        = 1):
-        self.instr_tlb_entries = instr_tlb_entries
-        self.data_tlb_entries = data_tlb_entries
-        self.asid_width = asid_width
-
-        self.flush_i = Signal()
-        self.enable_translation_i = Signal()
-        self.en_ld_st_translation_i = Signal() # enable VM translation for LD/ST
-        # IF interface
-        self.icache_areq_i = ICacheReqO()
-        self.icache_areq_o = ICacheReqI()
-        # LSU interface
-        # this is a more minimalistic interface because the actual addressing
-        # logic is handled in the LSU as we distinguish load and stores,
-        # what we do here is simple address translation
-        self.misaligned_ex_i = RVException()
-        self.lsu_req_i = Signal()   # request address translation
-        self.lsu_vaddr_i = Signal(64) # virtual address in
-        self.lsu_is_store_i = Signal() # the translation is requested by a store
-        # if we need to walk the page table we can't grant in the same cycle
-
-        # Cycle 0
-        self.lsu_dtlb_hit_o = Signal() # sent in the same cycle as the request
-                                       # if translation hits in the DTLB
-        # Cycle 1
-        self.lsu_valid_o = Signal()  # translation is valid
-        self.lsu_paddr_o = Signal(64) # translated address
-        self.lsu_exception_o = RVException() # addr translate threw exception
-
-        # General control signals
-        self.priv_lvl_i = Signal(2)
-        self.ld_st_priv_lvl_i = Signal(2)
-        self.sum_i = Signal()
-        self.mxr_i = Signal()
-        # input logic flag_mprv_i,
-        self.satp_ppn_i = Signal(44)
-        self.asid_i = Signal(self.asid_width)
-        self.flush_tlb_i = Signal()
-        # Performance counters
-        self.itlb_miss_o = Signal()
-        self.dtlb_miss_o = Signal()
-        # PTW memory interface
-        self.req_port_i = DCacheReqO()
-        self.req_port_o = DCacheReqI()
-
-    def elaborate(self, platform):
-        m = Module()
-
-        iaccess_err = Signal()   # insufficient priv to access instr page
-        daccess_err = Signal()   # insufficient priv to access data page
-        ptw_active = Signal()    # PTW is currently walking a page table
-        walking_instr = Signal() # PTW is walking because of an ITLB miss
-        ptw_error = Signal()     # PTW threw an exception
-
-        update_vaddr = Signal(48)				  # guessed
-        uaddr64 = Cat(update_vaddr, Const(0, 25)) # extend to 64bit with zeros
-        update_ptw_itlb = TLBUpdate(self.asid_width)
-        update_ptw_dtlb = TLBUpdate(self.asid_width)
-
-        itlb_lu_access = Signal()
-        itlb_content = PTE()
-        itlb_is_2M = Signal()
-        itlb_is_1G = Signal()
-        itlb_is_512G = Signal()
-        itlb_lu_hit = Signal()
-
-        dtlb_lu_access = Signal()
-        dtlb_content = PTE()
-        dtlb_is_2M = Signal()
-        dtlb_is_1G = Signal()
-        dtlb_is_512G = Signal()
-        dtlb_lu_hit = Signal()
-
-        # Assignments
-        m.d.comb += [itlb_lu_access.eq(self.icache_areq_i.fetch_req),
-                     dtlb_lu_access.eq(self.lsu_req_i)
-                    ]
-
-        # ITLB
-        m.submodules.i_tlb = i_tlb = TLB(self.instr_tlb_entries,
-                                         self.asid_width)
-        m.d.comb += [i_tlb.flush_i.eq(self.flush_tlb_i),
-                     i_tlb.update_i.eq(update_ptw_itlb),
-                     i_tlb.lu_access_i.eq(itlb_lu_access),
-                     i_tlb.lu_asid_i.eq(self.asid_i),
-                     i_tlb.lu_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
-                     itlb_content.eq(i_tlb.lu_content_o),
-                     itlb_is_2M.eq(i_tlb.lu_is_2M_o),
-                     itlb_is_1G.eq(i_tlb.lu_is_1G_o),
-                     itlb_is_512G.eq(i_tlb.lu_is_512G_o),
-                     itlb_lu_hit.eq(i_tlb.lu_hit_o),
-                    ]
-
-        # DTLB
-        m.submodules.d_tlb = d_tlb = TLB(self.data_tlb_entries,
-                                         self.asid_width)
-        m.d.comb += [d_tlb.flush_i.eq(self.flush_tlb_i),
-                     d_tlb.update_i.eq(update_ptw_dtlb),
-                     d_tlb.lu_access_i.eq(dtlb_lu_access),
-                     d_tlb.lu_asid_i.eq(self.asid_i),
-                     d_tlb.lu_vaddr_i.eq(self.lsu_vaddr_i),
-                     dtlb_content.eq(d_tlb.lu_content_o),
-                     dtlb_is_2M.eq(d_tlb.lu_is_2M_o),
-                     dtlb_is_1G.eq(d_tlb.lu_is_1G_o),
-                     dtlb_is_512G.eq(d_tlb.lu_is_512G_o),
-                     dtlb_lu_hit.eq(d_tlb.lu_hit_o),
-                    ]
-
-        # PTW
-        m.submodules.ptw = ptw = PTW(self.asid_width)
-        m.d.comb += [ptw_active.eq(ptw.ptw_active_o),
-                     walking_instr.eq(ptw.walking_instr_o),
-                     ptw_error.eq(ptw.ptw_error_o),
-                     ptw.enable_translation_i.eq(self.enable_translation_i),
-
-                     update_vaddr.eq(ptw.update_vaddr_o),
-                     update_ptw_itlb.eq(ptw.itlb_update_o),
-                     update_ptw_dtlb.eq(ptw.dtlb_update_o),
-
-                     ptw.itlb_access_i.eq(itlb_lu_access),
-                     ptw.itlb_hit_i.eq(itlb_lu_hit),
-                     ptw.itlb_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
-
-                     ptw.dtlb_access_i.eq(dtlb_lu_access),
-                     ptw.dtlb_hit_i.eq(dtlb_lu_hit),
-                     ptw.dtlb_vaddr_i.eq(self.lsu_vaddr_i),
-
-                     ptw.req_port_i.eq(self.req_port_i),
-                     self.req_port_o.eq(ptw.req_port_o),
-                    ]
-
-        # ila_1 i_ila_1 (
-        #     .clk(clk_i), # input wire clk
-        #     .probe0({req_port_o.address_tag, req_port_o.address_index}),
-        #     .probe1(req_port_o.data_req), # input wire [63:0]  probe1
-        #     .probe2(req_port_i.data_gnt), # input wire [0:0]  probe2
-        #     .probe3(req_port_i.data_rdata), # input wire [0:0]  probe3
-        #     .probe4(req_port_i.data_rvalid), # input wire [0:0]  probe4
-        #     .probe5(ptw_error), # input wire [1:0]  probe5
-        #     .probe6(update_vaddr), # input wire [0:0]  probe6
-        #     .probe7(update_ptw_itlb.valid), # input wire [0:0]  probe7
-        #     .probe8(update_ptw_dtlb.valid), # input wire [0:0]  probe8
-        #     .probe9(dtlb_lu_access), # input wire [0:0]  probe9
-        #     .probe10(lsu_vaddr_i), # input wire [0:0]  probe10
-        #     .probe11(dtlb_lu_hit), # input wire [0:0]  probe11
-        #     .probe12(itlb_lu_access), # input wire [0:0]  probe12
-        #     .probe13(icache_areq_i.fetch_vaddr), # input wire [0:0]  probe13
-        #     .probe14(itlb_lu_hit) # input wire [0:0]  probe13
-        # );
-
-        #-----------------------
-        # Instruction Interface
-        #-----------------------
-        # The instruction interface is a simple request response interface
-
-        # MMU disabled: just pass through
-        m.d.comb += [self.icache_areq_o.fetch_valid.eq(
-                                                self.icache_areq_i.fetch_req),
-                     # play through in case we disabled address translation
-                     self.icache_areq_o.fetch_paddr.eq(
-                                                self.icache_areq_i.fetch_vaddr)
-                    ]
-        # two potential exception sources:
-        # 1. HPTW threw an exception -> signal with a page fault exception
-        # 2. We got an access error because of insufficient permissions ->
-        #    throw an access exception
-        m.d.comb += self.icache_areq_o.fetch_exception.valid.eq(0)
-        # Check whether we are allowed to access this memory region
-        # from a fetch perspective
-
-        # PLATEN TODO: use PermissionValidator instead [we like modules]
-        m.d.comb += iaccess_err.eq(self.icache_areq_i.fetch_req & \
-                                   (((self.priv_lvl_i == PRIV_LVL_U) & \
-                                      ~itlb_content.u) | \
-                                   ((self.priv_lvl_i == PRIV_LVL_S) & \
-                                    itlb_content.u)))
-
-        # MMU enabled: address from TLB, request delayed until hit.
-        # Error when TLB hit and no access right or TLB hit and
-        # translated address not valid (e.g.  AXI decode error),
-        # or when PTW performs walk due to ITLB miss and raises
-        # an error.
-        with m.If (self.enable_translation_i):
-            # we work with SV48, so if VM is enabled, check that
-            # all bits [47:38] are equal
-            with m.If (self.icache_areq_i.fetch_req & \
-                ~(((~self.icache_areq_i.fetch_vaddr[47:64]) == 0) | \
-                 (self.icache_areq_i.fetch_vaddr[47:64]) == 0)):
-                fe = self.icache_areq_o.fetch_exception
-                m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
-                             fe.tval.eq(self.icache_areq_i.fetch_vaddr),
-                             fe.valid.eq(1)
-                            ]
-
-            m.d.comb += self.icache_areq_o.fetch_valid.eq(0)
-
-            # 4K page
-            paddr = Signal.like(self.icache_areq_o.fetch_paddr)
-            paddr4k = Cat(self.icache_areq_i.fetch_vaddr[0:12],
-                          itlb_content.ppn)
-            m.d.comb += paddr.eq(paddr4k)
-            # Mega page
-            with m.If(itlb_is_2M):
-                m.d.comb += paddr[12:21].eq(
-                          self.icache_areq_i.fetch_vaddr[12:21])
-            # Giga page
-            with m.If(itlb_is_1G):
-                m.d.comb += paddr[12:30].eq(
-                          self.icache_areq_i.fetch_vaddr[12:30])
-            m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
-            # Tera page
-            with m.If(itlb_is_512G):
-                m.d.comb += paddr[12:39].eq(
-                          self.icache_areq_i.fetch_vaddr[12:39])
-            m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
-
-            # ---------
-            # ITLB Hit
-            # --------
-            # if we hit the ITLB output the request signal immediately
-            with m.If(itlb_lu_hit):
-                m.d.comb += self.icache_areq_o.fetch_valid.eq(
-                                          self.icache_areq_i.fetch_req)
-                # we got an access error
-                with m.If (iaccess_err):
-                    # throw a page fault
-                    fe = self.icache_areq_o.fetch_exception
-                    m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
-                                 fe.tval.eq(self.icache_areq_i.fetch_vaddr),
-                                 fe.valid.eq(1)
-                                ]
-            # ---------
-            # ITLB Miss
-            # ---------
-            # watch out for exceptions happening during walking the page table
-            with m.Elif(ptw_active & walking_instr):
-                m.d.comb += self.icache_areq_o.fetch_valid.eq(ptw_error)
-                fe = self.icache_areq_o.fetch_exception
-                m.d.comb += [fe.cause.eq(INSTR_PAGE_FAULT),
-                             fe.tval.eq(uaddr64),
-                             fe.valid.eq(1)
-                            ]
-
-        #-----------------------
-        # Data Interface
-        #-----------------------
-
-        lsu_vaddr = Signal(64)
-        dtlb_pte = PTE()
-        misaligned_ex = RVException()
-        lsu_req = Signal()
-        lsu_is_store = Signal()
-        dtlb_hit = Signal()
-        #dtlb_is_2M = Signal()
-        #dtlb_is_1G = Signal()
-        #dtlb_is_512 = Signal()
-
-        # check if we need to do translation or if we are always
-        # ready (e.g.: we are not translating anything)
-        m.d.comb += self.lsu_dtlb_hit_o.eq(Mux(self.en_ld_st_translation_i,
-                                          dtlb_lu_hit, 1))
-
-        # The data interface is simpler and only consists of a
-        # request/response interface
-        m.d.comb += [
-            # save request and DTLB response
-            lsu_vaddr.eq(self.lsu_vaddr_i),
-            lsu_req.eq(self.lsu_req_i),
-            misaligned_ex.eq(self.misaligned_ex_i),
-            dtlb_pte.eq(dtlb_content),
-            dtlb_hit.eq(dtlb_lu_hit),
-            lsu_is_store.eq(self.lsu_is_store_i),
-            #dtlb_is_2M.eq(dtlb_is_2M),
-            #dtlb_is_1G.eq(dtlb_is_1G),
-            ##dtlb_is_512.eq(self.dtlb_is_512G) #????
-        ]
-        m.d.sync += [
-            self.lsu_paddr_o.eq(lsu_vaddr),
-            self.lsu_valid_o.eq(lsu_req),
-            self.lsu_exception_o.eq(misaligned_ex),
-        ]
-
-        sverr = Signal()
-        usrerr = Signal()
-
-        m.d.comb += [
-            # mute misaligned exceptions if there is no request
-            # otherwise they will throw accidental exceptions
-            misaligned_ex.valid.eq(self.misaligned_ex_i.valid & self.lsu_req_i),
-
-            # SUM is not set and we are trying to access a user
-            # page in supervisor mode
-            sverr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_S & ~self.sum_i & \
-                       dtlb_pte.u),
-            # this is not a user page but we are in user mode and
-            # trying to access it
-            usrerr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_U & ~dtlb_pte.u),
-
-            # Check if the User flag is set, then we may only
-            # access it in supervisor mode if SUM is enabled
-            daccess_err.eq(sverr | usrerr),
-            ]
-
-        # translation is enabled and no misaligned exception occurred
-        with m.If(self.en_ld_st_translation_i & ~misaligned_ex.valid):
-            m.d.comb += lsu_req.eq(0)
-            # 4K page
-            paddr = Signal.like(lsu_vaddr)
-            paddr4k = Cat(lsu_vaddr[0:12], itlb_content.ppn)
-            m.d.comb += paddr.eq(paddr4k)
-            # Mega page
-            with m.If(dtlb_is_2M):
-                m.d.comb += paddr[12:21].eq(lsu_vaddr[12:21])
-            # Giga page
-            with m.If(dtlb_is_1G):
-                m.d.comb += paddr[12:30].eq(lsu_vaddr[12:30])
-            m.d.sync += self.lsu_paddr_o.eq(paddr)
-            # TODO platen tera_page
-
-            # ---------
-            # DTLB Hit
-            # --------
-            with m.If(dtlb_hit & lsu_req):
-                m.d.comb += lsu_req.eq(1)
-                # this is a store
-                with m.If (lsu_is_store):
-                    # check if the page is write-able and
-                    # we are not violating privileges
-                    # also check if the dirty flag is set
-                    with m.If(~dtlb_pte.w | daccess_err | ~dtlb_pte.d):
-                        le = self.lsu_exception_o
-                        m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
-                                     le.tval.eq(lsu_vaddr),
-                                     le.valid.eq(1)
-                                    ]
-
-                # this is a load, check for sufficient access
-                # privileges - throw a page fault if necessary
-                with m.Elif(daccess_err):
-                    le = self.lsu_exception_o
-                    m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
-                                 le.tval.eq(lsu_vaddr),
-                                 le.valid.eq(1)
-                                ]
-            # ---------
-            # DTLB Miss
-            # ---------
-            # watch out for exceptions
-            with m.Elif (ptw_active & ~walking_instr):
-                # page table walker threw an exception
-                with m.If (ptw_error):
-                    # an error makes the translation valid
-                    m.d.comb += lsu_req.eq(1)
-                    # the page table walker can only throw page faults
-                    with m.If (lsu_is_store):
-                        le = self.lsu_exception_o
-                        m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
-                                     le.tval.eq(uaddr64),
-                                     le.valid.eq(1)
-                                    ]
-                    with m.Else():
-                        m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
-                                     le.tval.eq(uaddr64),
-                                     le.valid.eq(1)
-                                    ]
-
-        return m
-
-    def ports(self):
-        return [self.flush_i, self.enable_translation_i,
-                self.en_ld_st_translation_i,
-                self.lsu_req_i,
-                self.lsu_vaddr_i, self.lsu_is_store_i, self.lsu_dtlb_hit_o,
-                self.lsu_valid_o, self.lsu_paddr_o,
-                self.priv_lvl_i, self.ld_st_priv_lvl_i, self.sum_i, self.mxr_i,
-                self.satp_ppn_i, self.asid_i, self.flush_tlb_i,
-                self.itlb_miss_o, self.dtlb_miss_o] + \
-                self.icache_areq_i.ports() + self.icache_areq_o.ports() + \
-                self.req_port_i.ports() + self.req_port_o.ports() + \
-                self.misaligned_ex_i.ports() + self.lsu_exception_o.ports()
-
-if __name__ == '__main__':
-    mmu = MMU()
-    vl = rtlil.convert(mmu, ports=mmu.ports())
-    with open("test_mmu.il", "w") as f:
-        f.write(vl)
-
diff --git a/src/TLB/ariane/p_lru.txt b/src/TLB/ariane/p_lru.txt
deleted file mode 100644
index 4bac7680..00000000
--- a/src/TLB/ariane/p_lru.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-pseudo-LRU
-
-two-way set associative - one bit
-
-   indicates which line of the two has been reference more recently
-
-
-four-way set associative - three bits
-
-   each bit represents one branch point in a binary decision tree; let 1
-   represent that the left side has been referenced more recently than the
-   right side, and 0 vice-versa
-
-              are all 4 lines valid?
-                   /       \
-                 yes        no, use an invalid line
-                  |
-                  |
-                  |
-             bit_0 == 0?            state | replace      ref to | next state
-              /       \             ------+--------      -------+-----------
-             y         n             00x  |  line_0      line_0 |    11_
-            /           \            01x  |  line_1      line_1 |    10_
-     bit_1 == 0?    bit_2 == 0?      1x0  |  line_2      line_2 |    0_1
-       /    \          /    \        1x1  |  line_3      line_3 |    0_0
-      y      n        y      n
-     /        \      /        \        ('x' means       ('_' means unchanged)
-   line_0  line_1  line_2  line_3      don't care)
-
-   (see Figure 3-7, p. 3-18, in Intel Embedded Pentium Processor Family Dev.
-    Manual, 1998, http://www.intel.com/design/intarch/manuals/273204.htm)
-
-
-note that there is a 6-bit encoding for true LRU for four-way set associative
-
-  bit 0: bank[1] more recently used than bank[0]
-  bit 1: bank[2] more recently used than bank[0]
-  bit 2: bank[2] more recently used than bank[1]
-  bit 3: bank[3] more recently used than bank[0]
-  bit 4: bank[3] more recently used than bank[1]
-  bit 5: bank[3] more recently used than bank[2]
-
-  this results in 24 valid bit patterns within the 64 possible bit patterns
-  (4! possible valid traces for bank references)
-
-  e.g., a trace of 0 1 2 3, where 0 is LRU and 3 is MRU, is encoded as 111111
-
-  you can implement a state machine with a 256x6 ROM (6-bit state encoding
-  appended with a 2-bit bank reference input will yield a new 6-bit state),
-  and you can implement an LRU bank indicator with a 64x2 ROM
-
diff --git a/src/TLB/ariane/plru.py b/src/TLB/ariane/plru.py
deleted file mode 100644
index a8db5c27..00000000
--- a/src/TLB/ariane/plru.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from nmigen import Signal, Module, Cat, Const
-from nmigen.hdl.ir import Elaboratable
-from math import log2
-
-
-class PLRU(Elaboratable):
-    """ PLRU - Pseudo Least Recently Used Replacement
-
-        PLRU-tree indexing:
-        lvl0        0
-                   / \
-                  /   \
-        lvl1     1     2
-                / \   / \
-        lvl2   3   4 5   6
-              / \ /\/\  /\
-             ... ... ... ...
-    """
-    def __init__(self, entries):
-        self.entries = entries
-        self.lu_hit = Signal(entries)
-        self.replace_en_o = Signal(entries)
-        self.lu_access_i = Signal()
-        # Tree (bit per entry)
-        self.TLBSZ = 2*(self.entries-1)
-        self.plru_tree = Signal(self.TLBSZ)
-        self.plru_tree_o = Signal(self.TLBSZ)
-
-    def elaborate(self, platform=None):
-        m = Module()
-
-        # Just predefine which nodes will be set/cleared
-        # E.g. for a TLB with 8 entries, the for-loop is semantically
-        # equivalent to the following pseudo-code:
-        # unique case (1'b1)
-        # lu_hit[7]: plru_tree[0, 2, 6] = {1, 1, 1};
-        # lu_hit[6]: plru_tree[0, 2, 6] = {1, 1, 0};
-        # lu_hit[5]: plru_tree[0, 2, 5] = {1, 0, 1};
-        # lu_hit[4]: plru_tree[0, 2, 5] = {1, 0, 0};
-        # lu_hit[3]: plru_tree[0, 1, 4] = {0, 1, 1};
-        # lu_hit[2]: plru_tree[0, 1, 4] = {0, 1, 0};
-        # lu_hit[1]: plru_tree[0, 1, 3] = {0, 0, 1};
-        # lu_hit[0]: plru_tree[0, 1, 3] = {0, 0, 0};
-        # default: begin /* No hit */ end
-        # endcase
-        LOG_TLB = int(log2(self.entries))
-        print(LOG_TLB)
-        for i in range(self.entries):
-            # we got a hit so update the pointer as it was least recently used
-            hit = Signal(reset_less=True)
-            m.d.comb += hit.eq(self.lu_hit[i] & self.lu_access_i)
-            with m.If(hit):
-                # Set the nodes to the values we would expect
-                for lvl in range(LOG_TLB):
-                    idx_base = (1<<lvl)-1
-                    # lvl0 <=> MSB, lvl1 <=> MSB-1, ...
-                    shift = LOG_TLB - lvl;
-                    new_idx = Const(~((i >> (shift-1)) & 1), (1, False))
-                    plru_idx = idx_base + (i >> shift)
-                    print ("plru", i, lvl, hex(idx_base),
-                                  plru_idx, shift, new_idx)
-                    m.d.comb += self.plru_tree_o[plru_idx].eq(new_idx)
-
-        # Decode tree to write enable signals
-        # Next for-loop basically creates the following logic for e.g.
-        # an 8 entry TLB (note: pseudo-code obviously):
-        # replace_en[7] = &plru_tree[ 6, 2, 0]; #plru_tree[0,2,6]=={1,1,1}
-        # replace_en[6] = &plru_tree[~6, 2, 0]; #plru_tree[0,2,6]=={1,1,0}
-        # replace_en[5] = &plru_tree[ 5,~2, 0]; #plru_tree[0,2,5]=={1,0,1}
-        # replace_en[4] = &plru_tree[~5,~2, 0]; #plru_tree[0,2,5]=={1,0,0}
-        # replace_en[3] = &plru_tree[ 4, 1,~0]; #plru_tree[0,1,4]=={0,1,1}
-        # replace_en[2] = &plru_tree[~4, 1,~0]; #plru_tree[0,1,4]=={0,1,0}
-        # replace_en[1] = &plru_tree[ 3,~1,~0]; #plru_tree[0,1,3]=={0,0,1}
-        # replace_en[0] = &plru_tree[~3,~1,~0]; #plru_tree[0,1,3]=={0,0,0}
-        # For each entry traverse the tree. If every tree-node matches
-        # the corresponding bit of the entry's index, this is
-        # the next entry to replace.
-        replace = []
-        for i in range(self.entries):
-            en = []
-            for lvl in range(LOG_TLB):
-                idx_base = (1<<lvl)-1
-                # lvl0 <=> MSB, lvl1 <=> MSB-1, ...
-                shift = LOG_TLB - lvl;
-                new_idx = (i >> (shift-1)) & 1;
-                plru_idx = idx_base + (i>>shift)
-                plru = Signal(reset_less=True,
-                              name="plru-%d-%d-%d" % (i, lvl, plru_idx))
-                m.d.comb += plru.eq(self.plru_tree[plru_idx])
-                # en &= plru_tree_q[idx_base + (i>>shift)] == new_idx;
-                if new_idx:
-                    en.append(~plru) # yes inverted (using bool())
-                else:
-                    en.append(plru)  # yes inverted (using bool())
-            print ("plru", i, en)
-            # boolean logic manipulation:
-            # plru0 & plru1 & plru2 == ~(~plru0 | ~plru1 | ~plru2)
-            replace.append(~Cat(*en).bool())
-        m.d.comb += self.replace_en_o.eq(Cat(*replace))
-
-        return m
-
-    def ports(self):
-        return [self.entries, self.lu_hit, self.replace_en_o,
-                self.lu_access_i, self.plru_tree, self.plru_tree_o]
diff --git a/src/TLB/ariane/ptw.py b/src/TLB/ariane/ptw.py
deleted file mode 100644
index 4046c711..00000000
--- a/src/TLB/ariane/ptw.py
+++ /dev/null
@@ -1,556 +0,0 @@
-"""
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License.  You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: David Schaffenrath, TU Graz
-# Author: Florian Zaruba, ETH Zurich
-# Date: 24.4.2017
-# Description: Hardware-PTW
-
-/* verilator lint_off WIDTH */
-import ariane_pkg::*;
-
-see linux kernel source:
-
-* "arch/riscv/include/asm/page.h"
-* "arch/riscv/include/asm/mmu_context.h"
-* "arch/riscv/Kconfig" (CONFIG_PAGE_OFFSET)
-
-"""
-
-from nmigen import Const, Signal, Cat, Module, Elaboratable
-from nmigen.hdl.ast import ArrayProxy
-from nmigen.cli import verilog, rtlil
-from math import log2
-
-
-DCACHE_SET_ASSOC = 8
-CONFIG_L1D_SIZE =  32*1024
-DCACHE_INDEX_WIDTH = int(log2(CONFIG_L1D_SIZE / DCACHE_SET_ASSOC))
-DCACHE_TAG_WIDTH = 56 - DCACHE_INDEX_WIDTH
-
-ASID_WIDTH = 8
-
-
-class DCacheReqI:
-    def __init__(self):
-        self.address_index = Signal(DCACHE_INDEX_WIDTH)
-        self.address_tag = Signal(DCACHE_TAG_WIDTH)
-        self.data_wdata = Signal(64)
-        self.data_req = Signal()
-        self.data_we = Signal()
-        self.data_be = Signal(8)
-        self.data_size = Signal(2)
-        self.kill_req = Signal()
-        self.tag_valid = Signal()
-
-    def eq(self, inp):
-        res = []
-        for (o, i) in zip(self.ports(), inp.ports()):
-            res.append(o.eq(i))
-        return res
-
-    def ports(self):
-        return [self.address_index, self.address_tag,
-                self.data_wdata, self.data_req,
-                self.data_we, self.data_be, self.data_size,
-                self.kill_req, self.tag_valid,
-            ]
-
-class DCacheReqO:
-    def __init__(self):
-        self.data_gnt = Signal()
-        self.data_rvalid = Signal()
-        self.data_rdata = Signal(64) # actually in PTE object format
-
-    def eq(self, inp):
-        res = []
-        for (o, i) in zip(self.ports(), inp.ports()):
-            res.append(o.eq(i))
-        return res
-
-    def ports(self):
-        return [self.data_gnt, self.data_rvalid, self.data_rdata]
-
-
-class PTE: #(RecordObject):
-    def __init__(self):
-        self.v = Signal()
-        self.r = Signal()
-        self.w = Signal()
-        self.x = Signal()
-        self.u = Signal()
-        self.g = Signal()
-        self.a = Signal()
-        self.d = Signal()
-        self.rsw = Signal(2)
-        self.ppn = Signal(44)
-        self.reserved = Signal(10)
-
-    def flatten(self):
-        return Cat(*self.ports())
-
-    def eq(self, x):
-        if isinstance(x, ArrayProxy):
-            res = []
-            for o in self.ports():
-                i = getattr(x, o.name)
-                res.append(i)
-            x = Cat(*res)
-        else:
-            x = x.flatten()
-        return self.flatten().eq(x)
-
-    def __iter__(self):
-        """ order is critical so that flatten creates LSB to MSB
-        """
-        yield self.v
-        yield self.r
-        yield self.w
-        yield self.x
-        yield self.u
-        yield self.g
-        yield self.a
-        yield self.d
-        yield self.rsw
-        yield self.ppn
-        yield self.reserved
-
-    def ports(self):
-        return list(self)
-
-
-class TLBUpdate:
-    def __init__(self, asid_width):
-        self.valid = Signal()      # valid flag
-        self.is_2M = Signal()
-        self.is_1G = Signal()
-        self.is_512G = Signal()
-        self.vpn = Signal(36)
-        self.asid = Signal(asid_width)
-        self.content = PTE()
-
-    def flatten(self):
-        return Cat(*self.ports())
-
-    def eq(self, x):
-        return self.flatten().eq(x.flatten())
-
-    def ports(self):
-        return [self.valid, self.is_2M, self.is_1G, self.vpn, self.asid] + \
-                self.content.ports()
-
-
-# SV48 defines four levels of page tables
-LVL1 = Const(0, 2) # defined to 0 so that ptw_lvl default-resets to LVL1
-LVL2 = Const(1, 2)
-LVL3 = Const(2, 2)
-LVL4 = Const(3, 2)
-
-
-class PTW(Elaboratable):
-    def __init__(self, asid_width=8):
-        self.asid_width = asid_width
-
-        self.flush_i = Signal() # flush everything, we need to do this because
-        # actually everything we do is speculative at this stage
-        # e.g.: there could be a CSR instruction that changes everything
-        self.ptw_active_o = Signal(reset=1)    # active if not IDLE
-        self.walking_instr_o = Signal()        # set when walking for TLB
-        self.ptw_error_o = Signal()            # set when an error occurred
-        self.enable_translation_i = Signal()   # CSRs indicate to enable SV48
-        self.en_ld_st_translation_i = Signal() # enable VM translation for ld/st
-
-        self.lsu_is_store_i = Signal()       # translation triggered by store
-        # PTW memory interface
-        self.req_port_i = DCacheReqO()
-        self.req_port_o = DCacheReqI()
-
-        # to TLBs, update logic
-        self.itlb_update_o = TLBUpdate(asid_width)
-        self.dtlb_update_o = TLBUpdate(asid_width)
-
-        self.update_vaddr_o = Signal(48)
-
-        self.asid_i = Signal(self.asid_width)
-        # from TLBs
-        # did we miss?
-        self.itlb_access_i = Signal()
-        self.itlb_hit_i = Signal()
-        self.itlb_vaddr_i = Signal(64)
-
-        self.dtlb_access_i = Signal()
-        self.dtlb_hit_i = Signal()
-        self.dtlb_vaddr_i = Signal(64)
-        # from CSR file
-        self.satp_ppn_i = Signal(44) # ppn from satp
-        self.mxr_i = Signal()
-        # Performance counters
-        self.itlb_miss_o = Signal()
-        self.dtlb_miss_o = Signal()
-
-    def ports(self):
-        return [self.ptw_active_o, self.walking_instr_o, self.ptw_error_o,
-                ]
-        return [
-                self.enable_translation_i, self.en_ld_st_translation_i,
-                self.lsu_is_store_i, self.req_port_i, self.req_port_o,
-                self.update_vaddr_o,
-                self.asid_i,
-                self.itlb_access_i, self.itlb_hit_i, self.itlb_vaddr_i,
-                self.dtlb_access_i, self.dtlb_hit_i, self.dtlb_vaddr_i,
-                self.satp_ppn_i, self.mxr_i,
-                self.itlb_miss_o, self.dtlb_miss_o
-            ] + self.itlb_update_o.ports() + self.dtlb_update_o.ports()
-
-    def elaborate(self, platform):
-        m = Module()
-
-        # input registers
-        data_rvalid = Signal()
-        data_rdata = Signal(64)
-
-        # NOTE: pte decodes the incoming bit-field (data_rdata). data_rdata
-        # is spec'd in 64-bit binary-format: better to spec as Record?
-        pte = PTE()
-        m.d.comb += pte.flatten().eq(data_rdata)
-
-        # SV48 defines four levels of page tables
-        ptw_lvl = Signal(2) # default=0=LVL1 on reset (see above)
-        ptw_lvl1 = Signal()
-        ptw_lvl2 = Signal()
-        ptw_lvl3 = Signal()
-        ptw_lvl4 = Signal()
-        m.d.comb += [ptw_lvl1.eq(ptw_lvl == LVL1),
-                     ptw_lvl2.eq(ptw_lvl == LVL2),
-                     ptw_lvl3.eq(ptw_lvl == LVL3),
-                     ptw_lvl4.eq(ptw_lvl == LVL4)
-                     ]
-
-        # is this an instruction page table walk?
-        is_instr_ptw = Signal()
-        global_mapping = Signal()
-        # latched tag signal
-        tag_valid = Signal()
-        # register the ASID
-        tlb_update_asid = Signal(self.asid_width)
-        # register VPN we need to walk, SV48 defines a 48 bit virtual addr
-        vaddr = Signal(64)
-        # 4 byte aligned physical pointer
-        ptw_pptr = Signal(56)
-
-        end = DCACHE_INDEX_WIDTH + DCACHE_TAG_WIDTH
-        m.d.sync += [
-            # Assignments
-            self.update_vaddr_o.eq(vaddr),
-
-            self.walking_instr_o.eq(is_instr_ptw),
-            # directly output the correct physical address
-            self.req_port_o.address_index.eq(ptw_pptr[0:DCACHE_INDEX_WIDTH]),
-            self.req_port_o.address_tag.eq(ptw_pptr[DCACHE_INDEX_WIDTH:end]),
-            # we are never going to kill this request
-            self.req_port_o.kill_req.eq(0),              # XXX assign comb?
-            # we are never going to write with the HPTW
-            self.req_port_o.data_wdata.eq(Const(0, 64)), # XXX assign comb?
-            # -----------
-            # TLB Update
-            # -----------
-            self.itlb_update_o.vpn.eq(vaddr[12:48]),
-            self.dtlb_update_o.vpn.eq(vaddr[12:48]),
-            # update the correct page table level
-            self.itlb_update_o.is_2M.eq(ptw_lvl3),
-            self.itlb_update_o.is_1G.eq(ptw_lvl2),
-            self.itlb_update_o.is_512G.eq(ptw_lvl1),
-            self.dtlb_update_o.is_2M.eq(ptw_lvl3),
-            self.dtlb_update_o.is_1G.eq(ptw_lvl2),
-            self.dtlb_update_o.is_512G.eq(ptw_lvl1),
-            
-            # output the correct ASID
-            self.itlb_update_o.asid.eq(tlb_update_asid),
-            self.dtlb_update_o.asid.eq(tlb_update_asid),
-            # set the global mapping bit
-            self.itlb_update_o.content.eq(pte),
-            self.itlb_update_o.content.g.eq(global_mapping),
-            self.dtlb_update_o.content.eq(pte),
-            self.dtlb_update_o.content.g.eq(global_mapping),
-
-            self.req_port_o.tag_valid.eq(tag_valid),
-        ]
-
-        #-------------------
-        # Page table walker   #needs update
-        #-------------------
-        # A virtual address va is translated into a physical address pa as
-        # follows:
-        # 1. Let a be sptbr.ppn × PAGESIZE, and let i = LEVELS-1. (For Sv48,
-        #    PAGESIZE=2^12 and LEVELS=4.)
-        # 2. Let pte be the value of the PTE at address a+va.vpn[i]×PTESIZE.
-        #    (For Sv32, PTESIZE=4.)
-        # 3. If pte.v = 0, or if pte.r = 0 and pte.w = 1, stop and raise an
-        #    access exception.
-        # 4. Otherwise, the PTE is valid. If pte.r = 1 or pte.x = 1, go to
-        #    step 5.  Otherwise, this PTE is a pointer to the next level of
-        #    the page table.
-        #    Let i=i-1. If i < 0, stop and raise an access exception.
-        #    Otherwise, let a = pte.ppn × PAGESIZE and go to step 2.
-        # 5. A leaf PTE has been found. Determine if the requested memory
-        #    access is allowed by the pte.r, pte.w, and pte.x bits. If not,
-        #    stop and raise an access exception. Otherwise, the translation is
-        #    successful.  Set pte.a to 1, and, if the memory access is a
-        #    store, set pte.d to 1.
-        #    The translated physical address is given as follows:
-        #      - pa.pgoff = va.pgoff.
-        #      - If i > 0, then this is a superpage translation and
-        #        pa.ppn[i-1:0] = va.vpn[i-1:0].
-        #      - pa.ppn[LEVELS-1:i] = pte.ppn[LEVELS-1:i].
-        # 6. If i > 0 and pa.ppn[i − 1 : 0] != 0, this is a misaligned
-        #    superpage stop and raise a page-fault exception.
-
-        m.d.sync += tag_valid.eq(0)
-
-        # default assignments
-        m.d.comb += [
-            # PTW memory interface
-            self.req_port_o.data_req.eq(0),
-            self.req_port_o.data_be.eq(Const(0xFF, 8)),
-            self.req_port_o.data_size.eq(Const(0b11, 2)),
-            self.req_port_o.data_we.eq(0),
-            self.ptw_error_o.eq(0),
-            self.itlb_update_o.valid.eq(0),
-            self.dtlb_update_o.valid.eq(0),
-
-            self.itlb_miss_o.eq(0),
-            self.dtlb_miss_o.eq(0),
-        ]
-
-        # ------------
-        # State Machine
-        # ------------
-
-        with m.FSM() as fsm:
-
-            with m.State("IDLE"):
-                self.idle(m, is_instr_ptw, ptw_lvl, global_mapping,
-                          ptw_pptr, vaddr, tlb_update_asid)
-
-            with m.State("WAIT_GRANT"):
-                self.grant(m, tag_valid, data_rvalid)
-
-            with m.State("PTE_LOOKUP"):
-                # we wait for the valid signal
-                with m.If(data_rvalid):
-                    self.lookup(m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4,
-                                data_rvalid, global_mapping,
-                                is_instr_ptw, ptw_pptr)
-
-            # Propagate error to MMU/LSU
-            with m.State("PROPAGATE_ERROR"):
-                m.next = "IDLE"
-                m.d.comb += self.ptw_error_o.eq(1)
-
-            # wait for the rvalid before going back to IDLE
-            with m.State("WAIT_RVALID"):
-                with m.If(data_rvalid):
-                    m.next = "IDLE"
-
-        m.d.sync += [data_rdata.eq(self.req_port_i.data_rdata),
-                     data_rvalid.eq(self.req_port_i.data_rvalid)
-                    ]
-
-        return m
-
-    def set_grant_state(self, m):
-        # should we have flushed before we got an rvalid,
-        # wait for it until going back to IDLE
-        with m.If(self.flush_i):
-            with m.If (self.req_port_i.data_gnt):
-                m.next = "WAIT_RVALID"
-            with m.Else():
-                m.next = "IDLE"
-        with m.Else():
-            m.next = "WAIT_GRANT"
-
-    def idle(self, m, is_instr_ptw, ptw_lvl, global_mapping,
-                          ptw_pptr, vaddr, tlb_update_asid):
-        # by default we start with the top-most page table
-        m.d.sync += [is_instr_ptw.eq(0),
-                     ptw_lvl.eq(LVL1),
-                     global_mapping.eq(0),
-                     self.ptw_active_o.eq(0), # deactive (IDLE)
-                    ]
-        # work out itlb/dtlb miss
-        m.d.comb += self.itlb_miss_o.eq(self.enable_translation_i & \
-                                 self.itlb_access_i & \
-                                 ~self.itlb_hit_i & \
-                                 ~self.dtlb_access_i)
-        m.d.comb += self.dtlb_miss_o.eq(self.en_ld_st_translation_i & \
-                                        self.dtlb_access_i & \
-                                        ~self.dtlb_hit_i)
-        # we got an ITLB miss?
-        with m.If(self.itlb_miss_o):
-            pptr = Cat(Const(0, 3), self.itlb_vaddr_i[30:48],
-                       self.satp_ppn_i)
-            m.d.sync += [ptw_pptr.eq(pptr),
-                         is_instr_ptw.eq(1),
-                         vaddr.eq(self.itlb_vaddr_i),
-                         tlb_update_asid.eq(self.asid_i),
-                        ]
-            self.set_grant_state(m)
-
-        # we got a DTLB miss?
-        with m.Elif(self.dtlb_miss_o):
-            pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:48],
-                       self.satp_ppn_i)
-            m.d.sync += [ptw_pptr.eq(pptr),
-                         vaddr.eq(self.dtlb_vaddr_i),
-                         tlb_update_asid.eq(self.asid_i),
-                        ]
-            self.set_grant_state(m)
-
-    def grant(self, m, tag_valid, data_rvalid):
-        # we've got a data WAIT_GRANT so tell the
-        # cache that the tag is valid
-
-        # send a request out
-        m.d.comb += self.req_port_o.data_req.eq(1)
-        # wait for the WAIT_GRANT
-        with m.If(self.req_port_i.data_gnt):
-            # send the tag valid signal one cycle later
-            m.d.sync += tag_valid.eq(1)
-            # should we have flushed before we got an rvalid,
-            # wait for it until going back to IDLE
-            with m.If(self.flush_i):
-                with m.If (~data_rvalid):
-                    m.next = "WAIT_RVALID"
-                with m.Else():
-                    m.next = "IDLE"
-            with m.Else():
-                m.next = "PTE_LOOKUP"
-
-    def lookup(self, m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4, 
-                            data_rvalid, global_mapping,
-                            is_instr_ptw, ptw_pptr):
-        # temporaries
-        pte_rx = Signal(reset_less=True)
-        pte_exe = Signal(reset_less=True)
-        pte_inv = Signal(reset_less=True)
-        pte_a = Signal(reset_less=True)
-        st_wd = Signal(reset_less=True)
-        m.d.comb += [pte_rx.eq(pte.r | pte.x),
-                    pte_exe.eq(~pte.x | ~pte.a),
-                    pte_inv.eq(~pte.v | (~pte.r & pte.w)),
-                    pte_a.eq(pte.a & (pte.r | (pte.x & self.mxr_i))),
-                    st_wd.eq(self.lsu_is_store_i & (~pte.w | ~pte.d))]
-
-        l1err = Signal(reset_less=True)
-        l2err = Signal(reset_less=True)
-        l3err = Signal(reset_less=True)
-        m.d.comb += [l3err.eq((ptw_lvl3) & pte.ppn[0:9] != Const(0,0)),
-                     l2err.eq((ptw_lvl2) & pte.ppn[0:18] != Const(0, 18)),
-                     l1err.eq((ptw_lvl1) & pte.ppn[0:27] != Const(0, 27))]
-
-        # check if the global mapping bit is set
-        with m.If (pte.g):
-            m.d.sync += global_mapping.eq(1)
-
-        m.next = "IDLE"
-
-        # -------------
-        # Invalid PTE
-        # -------------
-        # If pte.v = 0, or if pte.r = 0 and pte.w = 1,
-        # stop and raise a page-fault exception.
-        with m.If (pte_inv):
-            m.next = "PROPAGATE_ERROR"
-
-        # -----------
-        # Valid PTE
-        # -----------
-
-        # it is a valid PTE
-        # if pte.r = 1 or pte.x = 1 it is a valid PTE
-        with m.Elif (pte_rx):
-            # Valid translation found (either 1G, 2M or 4K)
-            with m.If(is_instr_ptw):
-                # ------------
-                # Update ITLB
-                # ------------
-                # If page not executable, we can directly raise error.
-                # This doesn't put a useless entry into the TLB.
-                # The same idea applies to the access flag since we let
-                # the access flag be managed by SW.
-                with m.If (pte_exe):
-                    m.next = "IDLE"
-                with m.Else():
-                    m.d.comb += self.itlb_update_o.valid.eq(1)
-
-            with m.Else():
-                # ------------
-                # Update DTLB
-                # ------------
-                # Check if the access flag has been set, otherwise
-                # throw page-fault and let software handle those bits.
-                # If page not readable (there are no write-only pages)
-                # directly raise an error. This doesn't put a useless
-                # entry into the TLB.
-                with m.If(pte_a):
-                    m.d.comb += self.dtlb_update_o.valid.eq(1)
-                with m.Else():
-                    m.next = "PROPAGATE_ERROR"
-                # Request is a store: perform additional checks
-                # If the request was a store and the page not
-                # write-able, raise an error
-                # the same applies if the dirty flag is not set
-                with m.If (st_wd):
-                    m.d.comb += self.dtlb_update_o.valid.eq(0)
-                    m.next = "PROPAGATE_ERROR"
-
-            # check if the ppn is correctly aligned: Case (6)
-            with m.If(l1err | l2err | l3err):
-                m.next = "PROPAGATE_ERROR"
-                m.d.comb += [self.dtlb_update_o.valid.eq(0),
-                             self.itlb_update_o.valid.eq(0)]
-
-        # this is a pointer to the next TLB level
-        with m.Else():
-            # pointer to next level of page table
-            with m.If (ptw_lvl1):
-                # we are in the second level now
-                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:39], pte.ppn)
-                m.d.sync += [ptw_pptr.eq(pptr),
-                             ptw_lvl.eq(LVL2)
-                            ]
-            with m.If(ptw_lvl2):
-                # here we received a pointer to the third level
-                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[21:30], pte.ppn)
-                m.d.sync += [ptw_pptr.eq(pptr),
-                             ptw_lvl.eq(LVL3)
-                            ]
-            with m.If(ptw_lvl3): #guess: shift page levels by one
-                # here we received a pointer to the fourth level
-                # the last one is near the page offset
-                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[12:21], pte.ppn)
-                m.d.sync += [ptw_pptr.eq(pptr),
-                             ptw_lvl.eq(LVL4)
-                            ]
-            self.set_grant_state(m)
-
-            with m.If (ptw_lvl4):
-                # Should already be the last level
-                # page table => Error
-                m.d.sync += ptw_lvl.eq(LVL4)
-                m.next = "PROPAGATE_ERROR"
-
-
-if __name__ == '__main__':
-    ptw = PTW()
-    vl = rtlil.convert(ptw, ports=ptw.ports())
-    with open("test_ptw.il", "w") as f:
-        f.write(vl)
diff --git a/src/TLB/ariane/test/test_plru.py b/src/TLB/ariane/test/test_plru.py
deleted file mode 100644
index 68dcfa58..00000000
--- a/src/TLB/ariane/test/test_plru.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import sys
-sys.path.append("../src")
-sys.path.append("../../../TestUtil")
-
-from TLB.ariane.plru import PLRU
-
-from nmigen.compat.sim import run_simulation
-
-def tbench(dut):
-    yield
-
-if __name__ == "__main__":
-    dut = PLRU(4)
-    run_simulation(dut, tbench(dut), vcd_name="test_plru.vcd")
-    print("PLRU Unit Test Success")
diff --git a/src/TLB/ariane/test/test_ptw.py b/src/TLB/ariane/test/test_ptw.py
deleted file mode 100644
index b5deb28b..00000000
--- a/src/TLB/ariane/test/test_ptw.py
+++ /dev/null
@@ -1,130 +0,0 @@
-import sys
-sys.path.append("../src")
-sys.path.append("../../../TestUtil")
-
-from nmigen.compat.sim import run_simulation
-
-from TLB.ariane.ptw import PTW, PTE
-
-# unit was changed, test needs to be changed
-
-def tbench(dut):
-
-    addr = 0x8000000
-
-    #pte = PTE()
-    #yield pte.v.eq(1)
-    #yield pte.r.eq(1)
-
-    yield dut.req_port_i.data_gnt.eq(1)
-    yield dut.req_port_i.data_rvalid.eq(1)
-    yield dut.req_port_i.data_rdata.eq(0x43)#pte.flatten())
-
-    # data lookup
-    yield dut.en_ld_st_translation_i.eq(1)
-    yield dut.asid_i.eq(1)
-
-    yield dut.dtlb_access_i.eq(1)
-    yield dut.dtlb_hit_i.eq(0)
-    yield dut.dtlb_vaddr_i.eq(0x400000000)
-
-    yield
-    yield
-    yield
-
-    yield dut.dtlb_access_i.eq(1)
-    yield dut.dtlb_hit_i.eq(0)
-    yield dut.dtlb_vaddr_i.eq(0x200000)
-
-    yield
-    yield
-    yield
-
-    yield dut.req_port_i.data_gnt.eq(0)
-    yield dut.dtlb_access_i.eq(1)
-    yield dut.dtlb_hit_i.eq(0)
-    yield dut.dtlb_vaddr_i.eq(0x400000011)
-
-    yield
-    yield dut.req_port_i.data_gnt.eq(1)
-    yield
-    yield
-
-    # data lookup, PTW levels 1-2-3
-    addr = 0x4000000
-    yield dut.dtlb_vaddr_i.eq(addr)
-    yield dut.mxr_i.eq(0x1)
-    yield dut.req_port_i.data_gnt.eq(1)
-    yield dut.req_port_i.data_rvalid.eq(1)
-    yield dut.req_port_i.data_rdata.eq(0x41 | (addr>>12)<<10)#pte.flatten())
-
-    yield dut.en_ld_st_translation_i.eq(1)
-    yield dut.asid_i.eq(1)
-
-    yield dut.dtlb_access_i.eq(1)
-    yield dut.dtlb_hit_i.eq(0)
-    yield dut.dtlb_vaddr_i.eq(addr)
-
-    yield
-    yield
-    yield
-    yield
-    yield
-    yield
-    yield
-    yield
-
-    yield dut.req_port_i.data_gnt.eq(0)
-    yield dut.dtlb_access_i.eq(1)
-    yield dut.dtlb_hit_i.eq(0)
-    yield dut.dtlb_vaddr_i.eq(0x400000011)
-
-    yield
-    yield dut.req_port_i.data_gnt.eq(1)
-    yield
-    yield
-    yield
-    yield
-
-
-    # instruction lookup
-    yield dut.en_ld_st_translation_i.eq(0)
-    yield dut.enable_translation_i.eq(1)
-    yield dut.asid_i.eq(1)
-
-    yield dut.itlb_access_i.eq(1)
-    yield dut.itlb_hit_i.eq(0)
-    yield dut.itlb_vaddr_i.eq(0x800000)
-
-    yield
-    yield
-    yield
-
-    yield dut.itlb_access_i.eq(1)
-    yield dut.itlb_hit_i.eq(0)
-    yield dut.itlb_vaddr_i.eq(0x200000)
-
-    yield
-    yield
-    yield
-
-    yield dut.req_port_i.data_gnt.eq(0)
-    yield dut.itlb_access_i.eq(1)
-    yield dut.itlb_hit_i.eq(0)
-    yield dut.itlb_vaddr_i.eq(0x800011)
-
-    yield
-    yield dut.req_port_i.data_gnt.eq(1)
-    yield
-    yield
-
-    yield
-
-
-def test_ptw():
-    dut = PTW()
-    run_simulation(dut, tbench(dut), vcd_name="test_ptw.vcd")
-    print("PTW Unit Test Success")
-
-if __name__ == "__main__":
-    test_ptw()
diff --git a/src/TLB/ariane/test/test_tlb.py b/src/TLB/ariane/test/test_tlb.py
deleted file mode 100644
index b94438ff..00000000
--- a/src/TLB/ariane/test/test_tlb.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import sys
-sys.path.append("../src")
-sys.path.append("../../../TestUtil")
-
-from nmigen.compat.sim import run_simulation
-
-from TLB.ariane.tlb import TLB
-
-def set_vaddr(addr):
-    yield dut.lu_vaddr_i.eq(addr)
-    yield dut.update_i.vpn.eq(addr>>12)
-
-
-def tbench(dut):
-    yield dut.lu_access_i.eq(1)
-    yield dut.lu_asid_i.eq(1)
-    yield dut.update_i.valid.eq(1)
-    yield dut.update_i.is_1G.eq(0)
-    yield dut.update_i.is_2M.eq(0)
-    yield dut.update_i.asid.eq(1)
-    yield dut.update_i.content.ppn.eq(0)
-    yield dut.update_i.content.rsw.eq(0)
-    yield dut.update_i.content.r.eq(1)
-
-    yield
-
-    addr = 0x80000
-    yield from set_vaddr(addr)
-    yield
-
-    addr = 0x90001
-    yield from set_vaddr(addr)
-    yield
-
-    addr = 0x28000000
-    yield from set_vaddr(addr)
-    yield
-
-    addr = 0x28000001
-    yield from set_vaddr(addr)
-
-    addr = 0x28000001
-    yield from set_vaddr(addr)
-    yield
-
-    addr = 0x1000040000
-    yield from set_vaddr(addr)
-    yield
-
-    addr = 0x1000040001
-    yield from set_vaddr(addr)
-    yield
-
-    yield dut.update_i.is_1G.eq(1)
-    addr = 0x2040000
-    yield from set_vaddr(addr)
-    yield
-
-    yield dut.update_i.is_1G.eq(1)
-    addr = 0x2040001
-    yield from set_vaddr(addr)
-    yield
-
-    yield
-
-
-if __name__ == "__main__":
-    dut = TLB()
-    run_simulation(dut, tbench(dut), vcd_name="test_tlb.vcd")
-    print("TLB Unit Test Success")
diff --git a/src/TLB/ariane/test/test_tlb_content.py b/src/TLB/ariane/test/test_tlb_content.py
deleted file mode 100644
index 145ded7d..00000000
--- a/src/TLB/ariane/test/test_tlb_content.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import sys
-sys.path.append("../src")
-sys.path.append("../../../TestUtil")
-
-from nmigen.compat.sim import run_simulation
-
-from TLB.ariane.tlb_content import TLBContent
-from TestUtil.test_helper import assert_op, assert_eq
-
-def update(dut,a,t,g,m):
-    yield dut.replace_en_i.eq(1)
-    yield dut.update_i.valid.eq(1)
-    yield dut.update_i.is_512G.eq(t)
-    yield dut.update_i.is_1G.eq(g)
-    yield dut.update_i.is_2M.eq(m)
-    yield dut.update_i.vpn.eq(a)
-    yield
-    yield
-
-def check_hit(dut,hit,pagesize):
-    hit_d = yield dut.lu_hit_o
-    assert_eq("hit", hit_d, hit)
-
-    if(hit):
-        if(pagesize=="t"):
-            hitp = yield dut.lu_is_512G_o
-            assert_eq("lu_is_512G_o", hitp, 1)
-        elif(pagesize=="g"):
-            hitp = yield dut.lu_is_1G_o
-            assert_eq("lu_is_1G_o", hitp, 1)
-        elif(pagesize=="m"):
-            hitp = yield dut.lu_is_2M_o
-            assert_eq("lu_is_2M_o", hitp, 1)
-
-def addr(a,b,c,d):
-    return a | b << 9 | c << 18 | d << 27  
-    
-def tbench(dut):
-    yield dut.vpn0.eq(0x0A)
-    yield dut.vpn1.eq(0x0B)
-    yield dut.vpn2.eq(0x0C)
-    yield dut.vpn3.eq(0x0D)
-    yield from update(dut,addr(0xFF,0xFF,0xFF,0x0D),1,0,0)
-    yield from check_hit(dut,1,"t")
-    
-    yield from update(dut,addr(0xFF,0xFF,0x0C,0x0D),0,1,0)
-    yield from check_hit(dut,1,"g")
-    
-    yield from update(dut,addr(0xFF,0x0B,0x0C,0x0D),0,0,1)
-    yield from check_hit(dut,1,"m")
-    
-    yield from update(dut,addr(0x0A,0x0B,0x0C,0x0D),0,0,0)
-    yield from check_hit(dut,1,"")
-
-    yield from update(dut,addr(0xAA,0xBB,0xCC,0xDD),0,0,0)
-    yield from check_hit(dut,0,"miss")
-    
-
-if __name__ == "__main__":
-    dut = TLBContent(4,4)
-    #
-    run_simulation(dut, tbench(dut), vcd_name="test_tlb_content.vcd")
-    print("TLBContent Unit Test Success")
diff --git a/src/TLB/ariane/tlb.py b/src/TLB/ariane/tlb.py
deleted file mode 100644
index cf4af57a..00000000
--- a/src/TLB/ariane/tlb.py
+++ /dev/null
@@ -1,175 +0,0 @@
-"""
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License.  You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: David Schaffenrath, TU Graz
-# Author: Florian Zaruba, ETH Zurich
-# Date: 21.4.2017
-# Description: Translation Lookaside Buffer, SV48
-#              fully set-associative
-
-Implementation in c++:
-https://raw.githubusercontent.com/Tony-Hu/TreePLRU/master/TreePLRU.cpp
-
-Text description:
-https://people.cs.clemson.edu/~mark/464/p_lru.txt
-
-Online simulator:
-http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/vm.html
-"""
-from math import log2
-from nmigen import Signal, Module, Cat, Const, Array, Elaboratable
-from nmigen.cli import verilog, rtlil
-from nmigen.lib.coding import Encoder
-
-from TLB.ariane.ptw import TLBUpdate, PTE, ASID_WIDTH
-from TLB.ariane.plru import PLRU
-from TLB.ariane.tlb_content import TLBContent
-
-TLB_ENTRIES = 8
-
-class TLB(Elaboratable):
-    def __init__(self, tlb_entries=8, asid_width=8):
-        self.tlb_entries = tlb_entries
-        self.asid_width = asid_width
-
-        self.flush_i = Signal()  # Flush signal
-        # Lookup signals
-        self.lu_access_i = Signal()
-        self.lu_asid_i = Signal(self.asid_width)
-        self.lu_vaddr_i = Signal(64)
-        self.lu_content_o = PTE()
-        self.lu_is_2M_o = Signal()
-        self.lu_is_1G_o = Signal()
-        self.lu_is_512G_o = Signal()
-        self.lu_hit_o = Signal()
-        # Update TLB
-        self.pte_width = len(self.lu_content_o.flatten())
-        self.update_i = TLBUpdate(asid_width)
-
-    def elaborate(self, platform):
-        m = Module()
-
-        vpn3 = Signal(9) #FIXME unused signal
-        vpn2 = Signal(9)
-        vpn1 = Signal(9)
-        vpn0 = Signal(9)
-
-        #-------------
-        # Translation
-        #-------------
-
-        # SV48 defines four levels of page tables
-        m.d.comb += [ vpn0.eq(self.lu_vaddr_i[12:21]),
-                      vpn1.eq(self.lu_vaddr_i[21:30]),
-                      vpn2.eq(self.lu_vaddr_i[30:39]),
-                      vpn3.eq(self.lu_vaddr_i[39:48]),      ### FIXME
-                    ]
-
-        tc = []
-        for i in range(self.tlb_entries):
-            tlc = TLBContent(self.pte_width, self.asid_width)
-            setattr(m.submodules, "tc%d" % i, tlc)
-            tc.append(tlc)
-            # connect inputs
-            tlc.update_i = self.update_i     # saves a lot of graphviz links
-            m.d.comb += [tlc.vpn0.eq(vpn0),
-                         tlc.vpn1.eq(vpn1),
-                         tlc.vpn2.eq(vpn2),
-                         # TODO 4th
-                         tlc.flush_i.eq(self.flush_i),
-                         #tlc.update_i.eq(self.update_i),
-                         tlc.lu_asid_i.eq(self.lu_asid_i)]
-        tc = Array(tc)
-
-        #--------------
-        # Select hit
-        #--------------
-
-        # use Encoder to select hit index
-        # XXX TODO: assert that there's only one valid entry (one lu_hit)
-        hitsel = Encoder(self.tlb_entries)
-        m.submodules.hitsel = hitsel
-
-        hits = []
-        for i in range(self.tlb_entries):
-            hits.append(tc[i].lu_hit_o)
-        m.d.comb += hitsel.i.eq(Cat(*hits)) # (goes into plru as well)
-        idx = hitsel.o
-
-        active = Signal(reset_less=True)
-        m.d.comb += active.eq(~hitsel.n)
-        with m.If(active):
-            # active hit, send selected as output
-            m.d.comb += [ self.lu_is_512G_o.eq(tc[idx].lu_is_512G_o),
-                          self.lu_is_1G_o.eq(tc[idx].lu_is_1G_o),
-                          self.lu_is_2M_o.eq(tc[idx].lu_is_2M_o),
-                          self.lu_hit_o.eq(1),
-                          self.lu_content_o.flatten().eq(tc[idx].lu_content_o),
-                        ]
-
-        #--------------
-        # PLRU.
-        #--------------
-
-        p = PLRU(self.tlb_entries)
-        plru_tree = Signal(p.TLBSZ)
-        m.submodules.plru = p
-
-        # connect PLRU inputs/outputs
-        # XXX TODO: assert that there's only one valid entry (one replace_en)
-        en = []
-        for i in range(self.tlb_entries):
-            en.append(tc[i].replace_en_i)
-        m.d.comb += [Cat(*en).eq(p.replace_en_o), # output from PLRU into tags
-                     p.lu_hit.eq(hitsel.i),
-                     p.lu_access_i.eq(self.lu_access_i),
-                     p.plru_tree.eq(plru_tree)]
-        m.d.sync += plru_tree.eq(p.plru_tree_o)
-
-        #--------------
-        # Sanity checks
-        #--------------
-
-        assert (self.tlb_entries % 2 == 0) and (self.tlb_entries > 1), \
-            "TLB size must be a multiple of 2 and greater than 1"
-        assert (self.asid_width >= 1), \
-            "ASID width must be at least 1"
-
-        return m
-
-        """
-        # Just for checking
-        function int countSetBits(logic[self.tlb_entries-1:0] vector);
-          automatic int count = 0;
-          foreach (vector[idx]) begin
-            count += vector[idx];
-          end
-          return count;
-        endfunction
-
-        assert property (@(posedge clk_i)(countSetBits(lu_hit) <= 1))
-          else $error("More then one hit in TLB!"); $stop(); end
-        assert property (@(posedge clk_i)(countSetBits(replace_en) <= 1))
-          else $error("More then one TLB entry selected for next replace!");
-        """
-
-    def ports(self):
-        return [self.flush_i, self.lu_access_i,
-                 self.lu_asid_i, self.lu_vaddr_i,
-                 self.lu_is_2M_o, self.lu_1G_o, self.lu_is_512G_o, self.lu_hit_o
-                ] + self.lu_content_o.ports() + self.update_i.ports()
-
-if __name__ == '__main__':
-    tlb = TLB()
-    vl = rtlil.convert(tlb, ports=tlb.ports())
-    with open("test_tlb.il", "w") as f:
-        f.write(vl)
-
diff --git a/src/TLB/ariane/tlb_content.py b/src/TLB/ariane/tlb_content.py
deleted file mode 100644
index 3384c885..00000000
--- a/src/TLB/ariane/tlb_content.py
+++ /dev/null
@@ -1,145 +0,0 @@
-from nmigen import Signal, Module, Cat, Const, Elaboratable
-
-from TLB.ariane.ptw import TLBUpdate, PTE
-
-
-class TLBEntry:
-    def __init__(self, asid_width):
-        self.asid = Signal(asid_width,name="ent_asid")
-        # SV48 defines four levels of page tables
-        self.vpn0 = Signal(9,name="ent_vpn0")
-        self.vpn1 = Signal(9,name="ent_vpn1")
-        self.vpn2 = Signal(9,name="ent_vpn2")
-        self.vpn3 = Signal(9,name="ent_vpn3")
-        self.is_2M = Signal(name="ent_is_2M")
-        self.is_1G = Signal(name="ent_is_1G")
-        self.is_512G = Signal(name="ent_is_512G")
-        self.valid = Signal(name="ent_valid")
-        
-    def flatten(self):
-        return Cat(*self.ports())
-
-    def eq(self, x):
-        return self.flatten().eq(x.flatten())
-
-    def ports(self):
-        return [self.asid, self.vpn0, self.vpn1, self.vpn2,
-                self.is_2M, self.is_1G, self.valid]
-        
-
-class TLBContent(Elaboratable):
-    def __init__(self, pte_width, asid_width):
-        self.asid_width = asid_width
-        self.pte_width = pte_width
-        self.flush_i = Signal()  # Flush signal
-        # Update TLB
-        self.update_i = TLBUpdate(asid_width)
-        self.vpn3 = Signal(9)
-        self.vpn2 = Signal(9)
-        self.vpn1 = Signal(9)
-        self.vpn0 = Signal(9)
-        self.replace_en_i = Signal() # replace the following entry,
-                                     # set by replacement strategy
-        # Lookup signals
-        self.lu_asid_i = Signal(asid_width)
-        self.lu_content_o = Signal(pte_width)
-        self.lu_is_512G_o = Signal()
-        self.lu_is_2M_o = Signal()
-        self.lu_is_1G_o = Signal()
-        self.lu_hit_o = Signal()
-
-    def elaborate(self, platform):
-        m = Module()
-
-        tags = TLBEntry(self.asid_width)
-        
-        
-        content = Signal(self.pte_width)
-
-        m.d.comb += [self.lu_hit_o.eq(0),
-                     self.lu_is_512G_o.eq(0),
-                     self.lu_is_2M_o.eq(0),
-                     self.lu_is_1G_o.eq(0)]
-
-        # temporaries for lookup
-        asid_ok = Signal(reset_less=True)
-        # tags_ok = Signal(reset_less=True)
-
-        vpn3_ok = Signal(reset_less=True)
-        vpn2_ok = Signal(reset_less=True)
-        vpn1_ok = Signal(reset_less=True)
-        vpn0_ok = Signal(reset_less=True)
-
-        #tags_2M = Signal(reset_less=True)
-        vpn0_or_2M = Signal(reset_less=True)
-    
-        m.d.comb += [
-                     #compare asid and vpn*
-                     asid_ok.eq(tags.asid == self.lu_asid_i),
-                     vpn3_ok.eq(tags.vpn3 == self.vpn3),
-                     vpn2_ok.eq(tags.vpn2 == self.vpn2),
-                     vpn1_ok.eq(tags.vpn1 == self.vpn1),
-                     vpn0_ok.eq(tags.vpn0 == self.vpn0),
-                     vpn0_or_2M.eq(tags.is_2M | vpn0_ok)
-        ]
-        
-        
-        with m.If(asid_ok & tags.valid):
-            # first level, only vpn3 needs to match
-            with m.If (tags.is_512G & vpn3_ok):
-                m.d.comb += [ self.lu_content_o.eq(content),
-                              self.lu_is_512G_o.eq(1),
-                              self.lu_hit_o.eq(1),
-                            ]
-            # second level , second level vpn2 and vpn3 need to match
-            with m.Elif (tags.is_1G & vpn2_ok & vpn3_ok):
-                m.d.comb += [ self.lu_content_o.eq(content),
-                              self.lu_is_1G_o.eq(1),
-                              self.lu_hit_o.eq(1),
-                            ]
-            # not a giga page hit nor a tera page hit so check further
-            with m.Elif(vpn1_ok):
-                # this could be a 2 mega page hit or a 4 kB hit
-                # output accordingly
-                with m.If(vpn0_or_2M):
-                    m.d.comb += [ self.lu_content_o.eq(content),
-                                  self.lu_is_2M_o.eq(tags.is_2M),
-                                  self.lu_hit_o.eq(1),
-                                ]
-        # ------------------
-        # Update or Flush
-        # ------------------
-
-        # temporaries
-        replace_valid = Signal(reset_less=True)
-        m.d.comb += replace_valid.eq(self.update_i.valid & self.replace_en_i)
-
-        # flush
-        with m.If (self.flush_i):
-            # invalidate (flush) conditions: all if zero or just this ASID
-            with m.If (self.lu_asid_i == Const(0, self.asid_width) |
-                      (self.lu_asid_i == tags.asid)):
-                m.d.sync += tags.valid.eq(0)
-
-        # normal replacement
-        with m.Elif(replace_valid):
-            m.d.sync += [ # update tag array
-                          tags.asid.eq(self.update_i.asid),
-                          tags.vpn3.eq(self.update_i.vpn[27:36]),
-                          tags.vpn2.eq(self.update_i.vpn[18:27]),
-                          tags.vpn1.eq(self.update_i.vpn[9:18]),
-                          tags.vpn0.eq(self.update_i.vpn[0:9]),
-                          tags.is_512G.eq(self.update_i.is_512G),
-                          tags.is_1G.eq(self.update_i.is_1G),
-                          tags.is_2M.eq(self.update_i.is_2M),
-                          tags.valid.eq(1),
-                          # and content as well
-                          content.eq(self.update_i.content.flatten())
-                        ]
-        return m
-
-    def ports(self):
-        return [self.flush_i,
-                 self.lu_asid_i,
-                 self.lu_is_2M_o, self.lu_is_1G_o,self.lu_is_512G_o, self.lu_hit_o,
-                ] + self.update_i.content.ports() + self.update_i.ports()
diff --git a/src/TLB/test/__init__.py b/src/TLB/test/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/TLB/test/test_LFSR2.py b/src/TLB/test/test_LFSR2.py
deleted file mode 100644
index c05f55b7..00000000
--- a/src/TLB/test/test_LFSR2.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# SPDX-License-Identifier: LGPL-2.1-or-later
-# See Notices.txt for copyright information
-from TLB.LFSR import LFSR, LFSRPolynomial, LFSR_POLY_3
-
-from nmigen.back.pysim import Simulator, Delay, Tick
-import unittest
-
-
-class TestLFSR(unittest.TestCase):
-    def test_poly(self):
-        v = LFSRPolynomial()
-        self.assertEqual(repr(v), "LFSRPolynomial([0])")
-        self.assertEqual(str(v), "1")
-        v = LFSRPolynomial([1])
-        self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
-        self.assertEqual(str(v), "x + 1")
-        v = LFSRPolynomial([0, 1])
-        self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
-        self.assertEqual(str(v), "x + 1")
-        v = LFSRPolynomial([1, 2])
-        self.assertEqual(repr(v), "LFSRPolynomial([2, 1, 0])")
-        self.assertEqual(str(v), "x^2 + x + 1")
-        v = LFSRPolynomial([2])
-        self.assertEqual(repr(v), "LFSRPolynomial([2, 0])")
-        self.assertEqual(str(v), "x^2 + 1")
-        self.assertEqual(str(LFSR_POLY_3), "x^3 + x^2 + 1")
-
-    def test_lfsr_3(self):
-        module = LFSR(LFSR_POLY_3)
-        traces = [module.state, module.enable]
-        with Simulator(module,
-                       vcd_file=open("Waveforms/test_LFSR2.vcd", "w"),
-                       gtkw_file=open("Waveforms/test_LFSR2.gtkw", "w"),
-                       traces=traces) as sim:
-            sim.add_clock(1e-6, 0.25e-6)
-            delay = Delay(1e-7)
-
-            def async_process():
-                yield module.enable.eq(0)
-                yield Tick()
-                self.assertEqual((yield module.state), 0x1)
-                yield Tick()
-                self.assertEqual((yield module.state), 0x1)
-                yield module.enable.eq(1)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x2)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x5)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x3)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x7)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x6)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x4)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x1)
-                yield Tick()
-
-            sim.add_process(async_process)
-            sim.run()
-
diff --git a/src/TLB/test/test_address_encoder.py b/src/TLB/test/test_address_encoder.py
deleted file mode 100644
index 0aad35b4..00000000
--- a/src/TLB/test/test_address_encoder.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from TLB.AddressEncoder import AddressEncoder
-from TestUtil.test_helper import assert_eq, assert_ne, assert_op
-
-
-# This function allows for the easy setting of values to the AddressEncoder
-# Arguments:
-#   dut: The AddressEncoder being tested
-#   i (Input): The array of single bits to be written
-def set_encoder(dut, i):
-    yield dut.i.eq(i)
-    yield
-
-# Checks the single match of the AddressEncoder
-# Arguments:
-#   dut: The AddressEncoder being tested
-#   sm (Single Match): The expected match result
-#   op (Operation): (0 => ==), (1 => !=)
-def check_single_match(dut, sm, op):
-    out_sm = yield dut.single_match
-    assert_op("Single Match", out_sm, sm, op)
-
-# Checks the multiple match of the AddressEncoder
-# Arguments:
-#   dut: The AddressEncoder being tested
-#   mm (Multiple Match): The expected match result
-#   op (Operation): (0 => ==), (1 => !=)
-def check_multiple_match(dut, mm, op):
-    out_mm = yield dut.multiple_match
-    assert_op("Multiple Match", out_mm, mm, op)
-
-# Checks the output of the AddressEncoder
-# Arguments:
-#   dut: The AddressEncoder being tested
-#   o (Output): The expected output
-#   op (Operation): (0 => ==), (1 => !=)
-def check_output(dut, o, op):
-    out_o = yield dut.o
-    assert_op("Output", out_o, o, op)
-
-# Checks the state of the AddressEncoder
-# Arguments:
-#   dut: The AddressEncoder being tested
-#   sm (Single Match): The expected match result
-#   mm (Multiple Match): The expected match result
-#   o (Output): The expected output
-#   ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-#   mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-#   o_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-def check_all(dut, sm, mm, o, sm_op, mm_op, o_op):
-    yield from check_single_match(dut, sm, sm_op)
-    yield from check_multiple_match(dut, mm, mm_op)
-    yield from check_output(dut, o, o_op)
-
-def tbench(dut):
-    # Check invalid input
-    in_val = 0b000
-    single_match = 0
-    multiple_match = 0
-    output = 0
-    yield from set_encoder(dut, in_val)
-    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
-    # Check single bit
-    in_val = 0b001
-    single_match = 1
-    multiple_match = 0
-    output = 0
-    yield from set_encoder(dut, in_val)
-    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
-    # Check another single bit
-    in_val = 0b100
-    single_match = 1
-    multiple_match = 0
-    output = 2
-    yield from set_encoder(dut, in_val)
-    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
-    # Check multiple match
-    # We expected the lowest bit to be returned which is address 0
-    in_val = 0b101
-    single_match = 0
-    multiple_match = 1
-    output = 0
-    yield from set_encoder(dut, in_val)
-    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
-    # Check another multiple match
-    # We expected the lowest bit to be returned which is address 1
-    in_val = 0b110
-    single_match = 0
-    multiple_match = 1
-    output = 1
-    yield from set_encoder(dut, in_val)
-    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
-def test_addr():
-    dut = AddressEncoder(4)
-    run_simulation(dut, tbench(dut), 
-                   vcd_name="Waveforms/test_address_encoder.vcd")
-    print("AddressEncoder Unit Test Success")
-
-if __name__ == "__main__":
-    test_addr()
diff --git a/src/TLB/test/test_cam.py b/src/TLB/test/test_cam.py
deleted file mode 100644
index f11c48ad..00000000
--- a/src/TLB/test/test_cam.py
+++ /dev/null
@@ -1,206 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from TLB.Cam import Cam
-
-from TestUtil.test_helper import assert_eq, assert_ne, assert_op
-
-# This function allows for the easy setting of values to the Cam
-# Arguments:
-#   dut: The Cam being tested
-#   e (Enable): Whether the block is going to be enabled
-#   we (Write Enable): Whether the Cam will write on the next cycle
-#   a (Address): Where the data will be written if write enable is high
-#   d (Data): Either what we are looking for or will write to the address
-def set_cam(dut, e, we, a, d):
-    yield dut.enable.eq(e)
-    yield dut.write_enable.eq(we)
-    yield dut.address_in.eq(a)
-    yield dut.data_in.eq(d)
-    yield
-
-# Checks the multiple match of the Cam
-# Arguments:
-#   dut: The Cam being tested
-#   mm (Multiple Match): The expected match result
-#   op (Operation): (0 => ==), (1 => !=)
-def check_multiple_match(dut, mm, op):
-    out_mm = yield dut.multiple_match
-    assert_op("Multiple Match", out_mm, mm, op)
-
-# Checks the single match of the Cam
-# Arguments:
-#   dut: The Cam being tested
-#   sm (Single Match): The expected match result
-#   op (Operation): (0 => ==), (1 => !=)
-def check_single_match(dut, sm, op):
-    out_sm = yield dut.single_match
-    assert_op("Single Match", out_sm, sm, op)
-
-# Checks the address output of the Cam
-# Arguments:
-#   dut: The Cam being tested
-#   ma (Match Address): The expected match result
-#   op (Operation): (0 => ==), (1 => !=)
-def check_match_address(dut, ma, op):
-    out_ma = yield dut.match_address
-    assert_op("Match Address", out_ma, ma, op)
-
-# Checks the state of the Cam
-# Arguments:
-#   dut: The Cam being tested
-#   sm (Single Match): The expected match result
-#   mm (Multiple Match): The expected match result
-#   ma: (Match Address): The expected address output
-#   ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-#   mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-#   ma_op (Operation): Operation for the address assertion (0 => ==), (1 => !=)
-def check_all(dut, mm, sm, ma, mm_op, sm_op, ma_op):
-    yield from check_multiple_match(dut, mm, mm_op)
-    yield from check_single_match(dut, sm, sm_op)
-    yield from check_match_address(dut, ma, ma_op)
-
-def tbench(dut):
-    # NA
-    enable = 0
-    write_enable = 0
-    address = 0
-    data = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Read Miss Multiple
-    # Note that the default starting entry data bits are all 0
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 0
-    multiple_match = 1
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_multiple_match(dut, multiple_match, 0)
-
-    # Read Miss
-    # Note that the default starting entry data bits are all 0
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 1
-    multiple_match = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Write Entry 0
-    enable = 1
-    write_enable = 1
-    address = 0
-    data = 4
-    multiple_match = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Read Hit Entry 0
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 4
-    multiple_match = 0
-    single_match = 1
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
-
-    # Search Hit
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 4
-    multiple_match = 0
-    single_match = 1
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
-
-    # Search Miss
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 5
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Multiple Match test
-    # Write Entry 1
-    enable = 1
-    write_enable = 1
-    address = 1
-    data = 5
-    multiple_match = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Write Entry 2
-    # Same data as Entry 1
-    enable = 1
-    write_enable = 1
-    address = 2
-    data = 5
-    multiple_match = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Read Hit Data 5
-    enable = 1
-    write_enable = 0
-    address = 1
-    data = 5
-    multiple_match = 1
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_all(dut, multiple_match, single_match, address,0,0,0)
-
-    # Verify read_warning is not caused
-    # Write Entry 0
-    enable = 1
-    write_enable = 1
-    address = 0
-    data = 7
-    multiple_match = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    # Note there is no yield we immediately attempt to read in the next cycle
-
-    # Read Hit Data 7
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 7
-    multiple_match = 0
-    single_match = 1
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    yield
-
-
-def test_cam():
-    dut = Cam(4, 4)
-    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam.vcd")
-    print("Cam Unit Test Success")
-
-if __name__ == "__main__":
-    test_cam()
diff --git a/src/TLB/test/test_cam_entry.py b/src/TLB/test/test_cam_entry.py
deleted file mode 100644
index 43b699d2..00000000
--- a/src/TLB/test/test_cam_entry.py
+++ /dev/null
@@ -1,110 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from TestUtil.test_helper import assert_eq, assert_ne, assert_op
-from TLB.CamEntry import CamEntry
-
-# This function allows for the easy setting of values to the Cam Entry
-# Arguments:
-#   dut: The CamEntry being tested
-#   c (command): NA (0), Read (1), Write (2), Reserve (3)
-#   d (data): The data to be set
-def set_cam_entry(dut, c, d):
-    # Write desired values
-    yield dut.command.eq(c)
-    yield dut.data_in.eq(d)
-    yield
-    # Reset all lines
-    yield dut.command.eq(0)
-    yield dut.data_in.eq(0)
-    yield
-
-# Checks the data state of the CAM entry
-# Arguments:
-#   dut: The CamEntry being tested
-#   d (Data): The expected data
-#   op (Operation): (0 => ==), (1 => !=)
-def check_data(dut, d, op):
-    out_d = yield dut.data
-    assert_op("Data", out_d, d, op)
-
-# Checks the match state of the CAM entry
-# Arguments:
-#   dut: The CamEntry being tested
-#   m (Match): The expected match
-#   op (Operation): (0 => ==), (1 => !=)
-def check_match(dut, m, op):
-    out_m = yield dut.match
-    assert_op("Match", out_m, m, op)
-
-# Checks the state of the CAM entry
-# Arguments:
-#   dut: The CamEntry being tested
-#   d (data): The expected data
-#   m (match): The expected match
-#   d_op (Operation): Operation for the data assertion (0 => ==), (1 => !=)
-#   m_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-def check_all(dut, d, m, d_op, m_op):
-    yield from check_data(dut, d, d_op)
-    yield from check_match(dut, m, m_op)
-
-# This tbench goes through the paces of testing the CamEntry module
-# It is done by writing and then reading various combinations of key/data pairs
-# and reading the results with varying keys to verify the resulting stored
-# data is correct.
-def tbench(dut):
-    # Check write
-    command = 2
-    data = 1
-    match = 0
-    yield from set_cam_entry(dut, command, data)
-    yield from check_all(dut, data, match, 0, 0)
-
-    # Check read miss
-    command = 1
-    data = 2
-    match = 0
-    yield from set_cam_entry(dut, command, data)
-    yield from check_all(dut, data, match, 1, 0)
-
-    # Check read hit
-    command = 1
-    data = 1
-    match = 1
-    yield from set_cam_entry(dut, command, data)
-    yield from check_all(dut, data, match, 0, 0)
-
-    # Check overwrite
-    command = 2
-    data = 5
-    match = 0
-    yield from set_cam_entry(dut, command, data)
-    yield
-    yield from check_all(dut, data, match, 0, 0)
-
-    # Check read hit
-    command = 1
-    data = 5
-    match = 1
-    yield from set_cam_entry(dut, command, data)
-    yield from check_all(dut, data, match, 0, 0)
-
-    # Check reset
-    command = 3
-    data = 0
-    match = 0
-    yield from set_cam_entry(dut, command, data)
-    yield from check_all(dut, data, match, 0, 0)
-
-    # Extra clock cycle for waveform
-    yield
-
-
-def test_camentry():
-    dut = CamEntry(4)
-    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam_entry.vcd")
-    print("CamEntry Unit Test Success")
-
-
-if __name__ == "__main__":
-    test_camentry()
-
diff --git a/src/TLB/test/test_permission_validator.py b/src/TLB/test/test_permission_validator.py
deleted file mode 100644
index 81873d79..00000000
--- a/src/TLB/test/test_permission_validator.py
+++ /dev/null
@@ -1,146 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from TLB.PermissionValidator import PermissionValidator
-
-from TestUtil.test_helper import assert_op
-
-
-def set_validator(dut, d, xwr, sm, sa, asid):
-    yield dut.data.eq(d)
-    yield dut.xwr.eq(xwr)
-    yield dut.super_mode.eq(sm)
-    yield dut.super_access.eq(sa)
-    yield dut.asid.eq(asid)
-    yield
-
-def check_valid(dut, v, op):
-    out_v = yield dut.valid
-    assert_op("Valid", out_v, v, op)
-
-def tbench(dut):
-    # 80 bits represented. Ignore the MSB as it will be truncated
-    # ASID is bits first 4 hex values (bits 64 - 78)
-
-    # Test user mode entry valid
-    # Global Bit matching ASID
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000031
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 0
-    super_access = 0
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test user mode entry valid
-    # Global Bit nonmatching ASID
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000031
-    # Ignore MSB it will be truncated
-    asid = 0x7FF6
-    super_mode = 0
-    super_access = 0
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test user mode entry invalid
-    # Global Bit nonmatching ASID
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000021
-    # Ignore MSB it will be truncated
-    asid = 0x7FF6
-    super_mode = 0
-    super_access = 0
-    xwr = 0
-    valid = 0
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test user mode entry valid
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000011
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 0
-    super_access = 0
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test user mode entry invalid
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000011
-    # Ignore MSB it will be truncated
-    asid = 0x7FF6
-    super_mode = 0
-    super_access = 0
-    xwr = 0
-    valid = 0
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test supervisor mode entry valid
-    # The entry is NOT in user mode
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000001
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 1
-    super_access = 0
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test supervisor mode entry invalid
-    # The entry is in user mode
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000011
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 1
-    super_access = 0
-    xwr = 0
-    valid = 0
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test supervisor mode entry valid
-    # The entry is NOT in user mode with access
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000001
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 1
-    super_access = 1
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test supervisor mode entry valid
-    # The entry is in user mode with access
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000011
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 1
-    super_access = 1
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-
-def test_permv():
-    dut = PermissionValidator(15, 64);
-    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_permission_validator.vcd")
-    print("PermissionValidator Unit Test Success")
-
-if __name__ == "__main__":
-    test_permv()
diff --git a/src/TLB/test/test_pte_entry.py b/src/TLB/test/test_pte_entry.py
deleted file mode 100644
index 5c0c34dc..00000000
--- a/src/TLB/test/test_pte_entry.py
+++ /dev/null
@@ -1,102 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from TLB.PteEntry import PteEntry
-
-from TestUtil.test_helper import assert_op
-
-def set_entry(dut, i):
-    yield dut.i.eq(i)
-    yield
-
-def check_dirty(dut, d, op):
-    out_d = yield dut.d
-    assert_op("Dirty", out_d, d, op)
-
-def check_accessed(dut, a, op):
-    out_a = yield dut.a
-    assert_op("Accessed", out_a, a, op)
-
-def check_global(dut, o, op):
-    out = yield dut.g
-    assert_op("Global", out, o, op)
-
-def check_user(dut, o, op):
-    out = yield dut.u
-    assert_op("User Mode", out, o, op)
-
-def check_xwr(dut, o, op):
-    out = yield dut.xwr
-    assert_op("XWR", out, o, op)
-
-def check_asid(dut, o, op):
-    out = yield dut.asid
-    assert_op("ASID", out, o, op)
-
-def check_pte(dut, o, op):
-    out = yield dut.pte
-    assert_op("ASID", out, o, op)
-
-def check_valid(dut, v, op):
-    out_v = yield dut.v
-    assert_op("Valid", out_v, v, op)
-
-def check_all(dut, d, a, g, u, xwr, v, asid, pte):
-    yield from check_dirty(dut, d, 0)
-    yield from check_accessed(dut, a, 0)
-    yield from check_global(dut, g, 0)
-    yield from check_user(dut, u, 0)
-    yield from check_xwr(dut, xwr, 0)
-    yield from check_asid(dut, asid, 0)
-    yield from check_pte(dut, pte, 0)
-    yield from check_valid(dut, v, 0)
-
-def tbench(dut):
-    # 80 bits represented. Ignore the MSB as it will be truncated
-    # ASID is bits first 4 hex values (bits 64 - 78)
-
-    i = 0x7FFF0000000000000031
-    dirty = 0
-    access = 0
-    glob = 1
-    user = 1
-    xwr = 0
-    valid = 1
-    asid = 0x7FFF
-    pte = 0x0000000000000031
-    yield from set_entry(dut, i)
-    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
-
-    i = 0x0FFF00000000000000FF
-    dirty = 1
-    access = 1
-    glob = 1
-    user = 1
-    xwr = 7
-    valid = 1
-    asid = 0x0FFF
-    pte = 0x00000000000000FF
-    yield from set_entry(dut, i)
-    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
-
-    i = 0x0721000000001100001F
-    dirty = 0
-    access = 0
-    glob = 0
-    user = 1
-    xwr = 7
-    valid = 1
-    asid = 0x0721
-    pte = 0x000000001100001F
-    yield from set_entry(dut, i)
-    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
-
-    yield
-
-
-def test_pteentry():
-    dut = PteEntry(15, 64);
-    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_pte_entry.vcd")
-    print("PteEntry Unit Test Success")
-
-if __name__ == "__main__":
-    test_pteentry()
diff --git a/src/TLB/test/test_set_associative_cache.py b/src/TLB/test/test_set_associative_cache.py
deleted file mode 100644
index 0641b556..00000000
--- a/src/TLB/test/test_set_associative_cache.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from TLB.SetAssociativeCache import SetAssociativeCache
-
-from TestUtil.test_helper import assert_eq, assert_ne, assert_op
-
-def set_sac(dut, e, c, s, t, d):
-    yield dut.enable.eq(e)
-    yield dut.command.eq(c)
-    yield dut.cset.eq(s)
-    yield dut.tag.eq(t)
-    yield dut.data_i.eq(d)
-    yield
-
-def tbench(dut):
-    enable = 1
-    command = 2
-    cset = 1
-    tag = 2
-    data = 3
-    yield from set_sac(dut, enable, command, cset, tag, data)
-    yield
-
-    enable = 1
-    command = 2
-    cset = 1
-    tag = 5
-    data = 8
-    yield from set_sac(dut, enable, command, cset, tag, data)
-    yield
-
-def test_assoc_cache():
-    dut = SetAssociativeCache(4, 4, 4, 4)
-    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_set_associative_cache.vcd")
-    print("Set Associative Cache Unit Test Success")
-
-if __name__ == "__main__":
-    test_assoc_cache()
diff --git a/src/TLB/test/test_tlb.py b/src/TLB/test/test_tlb.py
deleted file mode 100644
index e9cc9d69..00000000
--- a/src/TLB/test/test_tlb.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#import tracemalloc
-#tracemalloc.start()
-
-from nmigen.compat.sim import run_simulation
-
-from TLB.TLB import TLB
-
-from TestUtil.test_helper import assert_op, assert_eq
-
-#self.supermode = Signal(1) # Supervisor Mode
-#self.super_access = Signal(1) # Supervisor Access
-#self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2
-#self.xwr = Signal(3) # Execute, Write, Read
-#self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64
-#self.address_L1 = Signal(max=L1_size)
-#self.asid = Signal(asid_size) # Address Space IDentifier (ASID)
-#self.vma = Signal(vma_size) # Virtual Memory Address (VMA)
-#self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE)
-#
-#self.hit = Signal(1) # Denotes if the VMA had a mapped PTE
-#self.perm_valid = Signal(1) # Denotes if the permissions are correct
-#self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA
-
-COMMAND_READ=1
-COMMAND_WRITE_L1=2
-
-# Checks the data state of the CAM entry
-# Arguments:
-#   dut: The CamEntry being tested
-#   d (Data): The expected data
-#   op (Operation): (0 => ==), (1 => !=)
-def check_hit(dut, d):
-    hit_d = yield dut.hit
-    #assert_eq("hit", hit_d, d)
-
-def test_command(dut,cmd,xwr,cycles):
-    yield dut.command.eq(cmd)
-    yield dut.xwr.eq(xwr)
-    for i in range(0,cycles):
-        yield
-
-def test_write_L1(dut,vma,address_L1,asid,pte_in):
-    yield dut.address_L1.eq(address_L1)
-    yield dut.asid.eq(asid)
-    yield dut.vma.eq(vma)
-    yield dut.pte_in.eq(pte_in)
-    yield from test_command(dut,COMMAND_WRITE_L1,7,2)
-
-def test_search(dut,vma,found):
-    yield dut.vma.eq(vma)
-    yield from test_command(dut,COMMAND_READ,7,1)
-    yield from check_hit(dut,found)
-
-def zero(dut):
-    yield dut.supermode.eq(0)
-    yield dut.super_access.eq(0)
-    yield dut.mode.eq(0)
-    yield dut.address_L1.eq(0)
-    yield dut.asid.eq(0)
-    yield dut.vma.eq(0)
-    yield dut.pte_in.eq(0)
-
-def tbench(dut):
-    yield from zero(dut)
-    yield dut.mode.eq(0xF) # enable TLB
-    #test hit
-    yield from test_write_L1(dut,0xFEEDFACE,0,0xFFFF,0xF0F0)
-    yield from test_search(dut,0xFEEDFACE,1)
-    yield from test_search(dut,0xFACEFEED,0)
-    
-
-    
-
-def test_tlb():
-    dut = TLB(15,36,64,8)
-    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_tlb.vcd")
-    print("TLB Unit Test Success")
-
-if __name__ == "__main__":
-    test_tlb()
diff --git a/src/TestUtil/test_helper.py b/src/TestUtil/test_helper.py
deleted file mode 100644
index c42990d6..00000000
--- a/src/TestUtil/test_helper.py
+++ /dev/null
@@ -1,30 +0,0 @@
-def assert_op(pre, o, e, op):
-    """ Verifies the given values given the particular operand
-        Arguments:
-            p (Prefix): Appended to the front of the assert statement
-            e (Expected): The expected value
-            o (Output): The output result
-            op (Operation): (0 => ==), (1 => !=)
-    """
-    if op == 0:
-        assert_eq(pre, o, e)
-    else:
-        assert_ne(pre, o, e)    
-
-def assert_eq(p, o, e):
-    """ Verifies the given values are equal
-        Arguments:
-           p (Prefix): Appended to the front of the assert statement
-           e (Expected): The expected value
-           o (Output): The output result
-    """
-    assert o == e, p + " Output " + str(o) + " Expected " + str(e)
-    
-def assert_ne(p, o, e):
-    """ Verifies the given values are not equal
-        Arguments:
-           p (Prefix): Appended to the front of the assert statement
-           e (Expected): The expected value
-           o (Output): The output result
-    """
-    assert o != e, p + " Output " + str(o) + " Not Expecting " + str(e) 
diff --git a/src/decoder/.gitignore b/src/decoder/.gitignore
deleted file mode 100644
index afed0735..00000000
--- a/src/decoder/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*.csv
diff --git a/src/decoder/power_decoder.py b/src/decoder/power_decoder.py
deleted file mode 100644
index 5b5e7103..00000000
--- a/src/decoder/power_decoder.py
+++ /dev/null
@@ -1,275 +0,0 @@
-"""Cascading Power ISA Decoder
-
-This module uses CSV tables in a hierarchical/peer cascading fashion,
-to create a multi-level instruction decoder by recognising appropriate
-patterns.  The output is a flattened (1-level) series of fields suitable
-for a simple RISC engine.
-
-This is based on Anton Blanchard's excellent microwatt work:
-https://github.com/antonblanchard/microwatt/blob/master/decode1.vhdl
-
-The basic principle is that the python code does the heavy lifting
-(reading the CSV files, constructing the hierarchy), creating the HDL
-AST with for-loops generating switch-case statements.
-
-PowerDecoder takes a *list* of CSV files with an associated bit-range
-that it is requested to match against the "opcode" row of the CSV file.
-This pattern can be either an integer, a binary number, *or* a wildcard
-nmigen Case pattern of the form "001--1-100".
-
-Subdecoders are *additional* cases with further decoding.  The "pattern"
-argument is specified as one of the Case statements (a peer of the opcode
-row in the CSV file), and thus further fields of the opcode may be decoded
-giving increasing levels of detail.
-
-Top Level:
-
-    [ (extra.csv: bit-fields entire 32-bit range
-        opcode                           -> matches
-        000000---------------01000000000 -> ILLEGAL instruction
-        01100000000000000000000000000000 -> SIM_CONFIG instruction
-        ................................ ->
-      ),
-      (major.csv: first 6 bits ONLY
-        opcode                           -> matches
-        001100                           -> ALU,OP_ADD (add)
-        001101                           -> ALU,OP_ADD (another type of add)
-        ......                           -> ...
-        ......                           -> ...
-        subdecoders:
-        001011 this must match *MAJOR*.CSV
-            [ (minor_19.csv: bits 21 through 30 inclusive:
-                opcode                  -> matches
-                0b0000000000            -> ALU,OP_MCRF
-                ............            -> ....
-              ),
-              (minor_19_00000.csv: bits 21 through 25 inclusive:
-                opcode                  -> matches
-                0b00010                 -> ALU,add_pcis
-              )
-            ]
-      ),
-    ]
-
-"""
-
-from nmigen import Module, Elaboratable, Signal
-from nmigen.cli import rtlil
-from power_enums import (Function, Form, InternalOp, In1Sel, In2Sel, In3Sel,
-                         OutSel, RC, LdstLen, CryIn, get_csv, single_bit_flags,
-                         get_signal_name, default_values)
-from collections import namedtuple
-from power_fields import DecodeFields
-from power_fieldsn import SigDecode, SignalBitRange
-
-Subdecoder = namedtuple("Subdecoder", ["pattern", "opcodes", "opint",
-                                       "bitsel", "suffix", "subdecoders"])
-
-
-class PowerOp:
-    """PowerOp: spec for execution.  op type (ADD etc.) reg specs etc.
-    """
-
-    def __init__(self):
-        self.function_unit = Signal(Function, reset_less=True)
-        self.internal_op = Signal(InternalOp, reset_less=True)
-        self.form = Signal(Form, reset_less=True)
-        self.in1_sel = Signal(In1Sel, reset_less=True)
-        self.in2_sel = Signal(In2Sel, reset_less=True)
-        self.in3_sel = Signal(In3Sel, reset_less=True)
-        self.out_sel = Signal(OutSel, reset_less=True)
-        self.ldst_len = Signal(LdstLen, reset_less=True)
-        self.rc_sel = Signal(RC, reset_less=True)
-        self.cry_in = Signal(CryIn, reset_less=True)
-        for bit in single_bit_flags:
-            name = get_signal_name(bit)
-            setattr(self, name, Signal(reset_less=True, name=name))
-
-    def _eq(self, row=None):
-        if row is None:
-            row = default_values
-        res = [self.function_unit.eq(Function[row['unit']]),
-               self.form.eq(Form[row['form']]),
-               self.internal_op.eq(InternalOp[row['internal op']]),
-               self.in1_sel.eq(In1Sel[row['in1']]),
-               self.in2_sel.eq(In2Sel[row['in2']]),
-               self.in3_sel.eq(In3Sel[row['in3']]),
-               self.out_sel.eq(OutSel[row['out']]),
-               self.ldst_len.eq(LdstLen[row['ldst len']]),
-               self.rc_sel.eq(RC[row['rc']]),
-               self.cry_in.eq(CryIn[row['cry in']]),
-               ]
-        for bit in single_bit_flags:
-            sig = getattr(self, get_signal_name(bit))
-            res.append(sig.eq(int(row.get(bit, 0))))
-        return res
-
-    def eq(self, otherop):
-        res = [self.function_unit.eq(otherop.function_unit),
-               self.form.eq(otherop.form),
-               self.internal_op.eq(otherop.internal_op),
-               self.in1_sel.eq(otherop.in1_sel),
-               self.in2_sel.eq(otherop.in2_sel),
-               self.in3_sel.eq(otherop.in3_sel),
-               self.out_sel.eq(otherop.out_sel),
-               self.rc_sel.eq(otherop.rc_sel),
-               self.ldst_len.eq(otherop.ldst_len),
-               self.cry_in.eq(otherop.cry_in)]
-        for bit in single_bit_flags:
-            sig = getattr(self, get_signal_name(bit))
-            res.append(sig.eq(getattr(otherop, get_signal_name(bit))))
-        return res
-
-    def ports(self):
-        regular = [self.function_unit,
-                   self.in1_sel,
-                   self.in2_sel,
-                   self.in3_sel,
-                   self.out_sel,
-                   self.ldst_len,
-                   self.rc_sel,
-                   self.internal_op,
-                   self.form]
-        single_bit_ports = [getattr(self, get_signal_name(x))
-                            for x in single_bit_flags]
-        return regular + single_bit_ports
-
-
-class PowerDecoder(Elaboratable):
-    """PowerDecoder - decodes an incoming opcode into the type of operation
-    """
-
-    def __init__(self, width, dec):
-        if not isinstance(dec, list):
-            dec = [dec]
-        self.dec = dec
-        self.opcode_in = Signal(width, reset_less=True)
-
-        self.op = PowerOp()
-        for d in dec:
-            if d.suffix is not None and d.suffix >= width:
-                d.suffix = None
-        self.width = width
-
-    def suffix_mask(self, d):
-        return ((1 << d.suffix) - 1)
-
-    def divide_opcodes(self, d):
-        divided = {}
-        mask = self.suffix_mask(d)
-        print("mask", hex(mask))
-        for row in d.opcodes:
-            opcode = row['opcode']
-            if d.opint and '-' not in opcode:
-                opcode = int(opcode, 0)
-            key = opcode & mask
-            opcode = opcode >> d.suffix
-            if key not in divided:
-                divided[key] = []
-            r = row.copy()
-            r['opcode'] = opcode
-            divided[key].append(r)
-        return divided
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-
-        # note: default opcode is "illegal" as this is a combinatorial block
-
-        # go through the list of CSV decoders first
-        for d in self.dec:
-            opcode_switch = Signal(d.bitsel[1] - d.bitsel[0],
-                                   reset_less=True)
-            comb += opcode_switch.eq(self.opcode_in[d.bitsel[0]:d.bitsel[1]])
-            if d.suffix:
-                opcodes = self.divide_opcodes(d)
-                opc_in = Signal(d.suffix, reset_less=True)
-                comb += opc_in.eq(opcode_switch[:d.suffix])
-                with m.Switch(opc_in):
-                    for key, row in opcodes.items():
-                        bitsel = (d.suffix+d.bitsel[0], d.bitsel[1])
-                        sd = Subdecoder(pattern=None, opcodes=row,
-                                        bitsel=bitsel, suffix=None,
-                                        opint=False, subdecoders=[])
-                        subdecoder = PowerDecoder(width=32, dec=sd)
-                        setattr(m.submodules, "dec_sub%d" % key, subdecoder)
-                        comb += subdecoder.opcode_in.eq(self.opcode_in)
-                        with m.Case(key):
-                            comb += self.op.eq(subdecoder.op)
-            else:
-                # TODO: arguments, here (all of them) need to be a list.
-                # a for-loop around the *list* of decoder args.
-                with m.Switch(opcode_switch):
-                    self.handle_subdecoders(m, d)
-                    for row in d.opcodes:
-                        opcode = row['opcode']
-                        if d.opint and '-' not in opcode:
-                            opcode = int(opcode, 0)
-                        if not row['unit']:
-                            continue
-                        with m.Case(opcode):
-                            comb += self.op._eq(row)
-        return m
-
-    def handle_subdecoders(self, m, d):
-        for dec in d.subdecoders:
-            subdecoder = PowerDecoder(self.width, dec)
-            if isinstance(dec, list): # XXX HACK: take first pattern
-                dec = dec[0]
-            setattr(m.submodules, "dec%d" % dec.pattern, subdecoder)
-            m.d.comb += subdecoder.opcode_in.eq(self.opcode_in)
-            with m.Case(dec.pattern):
-                m.d.comb += self.op.eq(subdecoder.op)
-
-    def ports(self):
-        return [self.opcode_in] + self.op.ports()
-
-
-class TopPowerDecoder(PowerDecoder, DecodeFields):
-
-    def __init__(self, width, dec):
-        PowerDecoder.__init__(self, width, dec)
-        DecodeFields.__init__(self, SignalBitRange, [self.opcode_in])
-        self.create_specs()
-
-
-def create_pdecode():
-
-    # minor 19 has extra patterns
-    m19 = []
-    m19.append(Subdecoder(pattern=19, opcodes=get_csv("minor_19.csv"),
-                   opint=True, bitsel=(1, 11), suffix=None, subdecoders=[]))
-    m19.append(Subdecoder(pattern=19, opcodes=get_csv("minor_19_00000.csv"),
-                   opint=True, bitsel=(1, 6), suffix=None, subdecoders=[]))
-
-    # minor opcodes.
-    pminor = [
-        m19,
-        Subdecoder(pattern=30, opcodes=get_csv("minor_30.csv"),
-                   opint=True, bitsel=(1, 6), suffix=None, subdecoders=[]),
-        Subdecoder(pattern=31, opcodes=get_csv("minor_31.csv"),
-                   opint=True, bitsel=(1, 11), suffix=0b00101, subdecoders=[]),
-        Subdecoder(pattern=58, opcodes=get_csv("minor_58.csv"),
-                   opint=True, bitsel=(0, 2), suffix=None, subdecoders=[]),
-        Subdecoder(pattern=62, opcodes=get_csv("minor_62.csv"),
-                   opint=True, bitsel=(0, 2), suffix=None, subdecoders=[]),
-    ]
-
-    # top level: extra merged with major
-    dec = []
-    opcodes = get_csv("major.csv")
-    dec.append(Subdecoder(pattern=None, opint=True, opcodes=opcodes,
-                     bitsel=(26, 32), suffix=None, subdecoders=pminor))
-    opcodes = get_csv("extra.csv")
-    dec.append(Subdecoder(pattern=None, opint=False, opcodes=opcodes,
-                     bitsel=(0, 32), suffix=None, subdecoders=[]))
-
-    return TopPowerDecoder(32, dec)
-
-
-if __name__ == '__main__':
-    pdecode = create_pdecode()
-    vl = rtlil.convert(pdecode, ports=pdecode.ports())
-    with open("decoder.il", "w") as f:
-        f.write(vl)
diff --git a/src/decoder/power_decoder2.py b/src/decoder/power_decoder2.py
deleted file mode 100644
index 1b7435a0..00000000
--- a/src/decoder/power_decoder2.py
+++ /dev/null
@@ -1,429 +0,0 @@
-"""Power ISA Decoder second stage
-
-based on Anton Blanchard microwatt decode2.vhdl
-
-"""
-from nmigen import Module, Elaboratable, Signal, Mux, Const
-from nmigen.cli import rtlil
-
-from power_decoder import create_pdecode
-from power_enums import (InternalOp, CryIn, Function, LdstLen,
-                         In1Sel, In2Sel, In3Sel, OutSel, SPR, RC)
-
-
-class DecodeA(Elaboratable):
-    """DecodeA from instruction
-
-    decodes register RA, whether immediate-zero, implicit and
-    explicit CSRs
-    """
-
-    def __init__(self, dec):
-        self.dec = dec
-        self.sel_in = Signal(In1Sel, reset_less=True)
-        self.insn_in = Signal(32, reset_less=True)
-        self.reg_out = Data(5, name="reg_a")
-        self.immz_out = Signal(reset_less=True)
-        self.spr_out = Data(10, "spr_a")
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-
-        # select Register A field
-        with m.If((self.sel_in == In1Sel.RA) |
-                  ((self.sel_in == In1Sel.RA_OR_ZERO) &
-                   (self.reg_out.data != Const(0, 5)))):
-            comb += self.reg_out.data.eq(self.dec.RA[0:-1])
-            comb += self.reg_out.ok.eq(1)
-
-        # zero immediate requested
-        with m.If((self.sel_in == In1Sel.RA_OR_ZERO) &
-                   (self.reg_out.data == Const(0, 5))):
-            comb += self.immz_out.eq(1)
-
-        # decode SPR1 based on instruction type
-        op = self.dec.op
-        # BC or BCREG: potential implicit register (CTR)
-        with m.If((op.internal_op == InternalOp.OP_BC) |
-                  (op.internal_op == InternalOp.OP_BCREG)):
-            with m.If(~self.dec.BO[2]): # 3.0B p38 BO2=0, use CTR reg
-                comb += self.spr_out.data.eq(SPR.CTR) # constant: CTR
-                comb += self.spr_out.ok.eq(1)
-        # MFSPR or MTSPR: move-from / move-to SPRs
-        with m.If((op.internal_op == InternalOp.OP_MFSPR) |
-                  (op.internal_op == InternalOp.OP_MTSPR)):
-            comb += self.spr_out.data.eq(self.dec.SPR[0:-1]) # SPR field, XFX
-            comb += self.spr_out.ok.eq(1)
-
-        return m
-
-class Data:
-
-    def __init__(self, width, name):
-
-        self.data = Signal(width, name=name, reset_less=True)
-        self.ok = Signal(name="%s_ok" % name, reset_less=True)
-
-    def eq(self, rhs):
-        return [self.data.eq(rhs.data),
-                self.ok.eq(rhs.ok)]
-
-    def ports(self):
-        return [self.data, self.ok]
-
-
-class DecodeB(Elaboratable):
-    """DecodeB from instruction
-
-    decodes register RB, different forms of immediate (signed, unsigned),
-    and implicit SPRs
-    """
-
-    def __init__(self, dec):
-        self.dec = dec
-        self.sel_in = Signal(In2Sel, reset_less=True)
-        self.insn_in = Signal(32, reset_less=True)
-        self.reg_out = Data(5, "reg_b")
-        self.imm_out = Data(64, "imm_b")
-        self.spr_out = Data(10, "spr_b")
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-
-        # select Register B field
-        with m.Switch(self.sel_in):
-            with m.Case(In2Sel.RB):
-                comb += self.reg_out.data.eq(self.dec.RB[0:-1])
-                comb += self.reg_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_UI):
-                comb += self.imm_out.data.eq(self.dec.UI[0:-1])
-                comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_SI): # TODO: sign-extend here?
-                comb += self.imm_out.data.eq(self.dec.SI[0:-1])
-                comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_UI_HI):
-                comb += self.imm_out.data.eq(self.dec.UI[0:-1]<<4)
-                comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_SI_HI): # TODO: sign-extend here?
-                comb += self.imm_out.data.eq(self.dec.SI[0:-1]<<4)
-                comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_LI):
-                comb += self.imm_out.data.eq(self.dec.LI[0:-1]<<2)
-                comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_BD):
-                comb += self.imm_out.data.eq(self.dec.BD[0:-1]<<2)
-                comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_DS):
-                comb += self.imm_out.data.eq(self.dec.DS[0:-1]<<2)
-                comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_M1):
-                comb += self.imm_out.data.eq(~Const(0, 64)) # all 1s
-                comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_SH):
-                comb += self.imm_out.data.eq(self.dec.sh[0:-1])
-                comb += self.imm_out.ok.eq(1)
-            with m.Case(In2Sel.CONST_SH32):
-                comb += self.imm_out.data.eq(self.dec.SH32[0:-1])
-                comb += self.imm_out.ok.eq(1)
-
-        # decode SPR2 based on instruction type
-        op = self.dec.op
-        # BCREG implicitly uses CTR or LR for 2nd reg
-        with m.If(op.internal_op == InternalOp.OP_BCREG):
-            with m.If(self.dec.FormXL.XO[9]): # 3.0B p38 top bit of XO
-                comb += self.spr_out.data.eq(SPR.CTR)
-            with m.Else():
-                comb += self.spr_out.data.eq(SPR.LR)
-            comb += self.spr_out.ok.eq(1)
-
-        return m
-
-
-class DecodeC(Elaboratable):
-    """DecodeC from instruction
-
-    decodes register RC
-    """
-
-    def __init__(self, dec):
-        self.dec = dec
-        self.sel_in = Signal(In3Sel, reset_less=True)
-        self.insn_in = Signal(32, reset_less=True)
-        self.reg_out = Data(5, "reg_c")
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-
-        # select Register C field
-        with m.If(self.sel_in == In3Sel.RS):
-            comb += self.reg_out.data.eq(self.dec.RS[0:-1])
-            comb += self.reg_out.ok.eq(1)
-
-        return m
-
-
-class DecodeOut(Elaboratable):
-    """DecodeOut from instruction
-
-    decodes output register RA, RT or SPR
-    """
-
-    def __init__(self, dec):
-        self.dec = dec
-        self.sel_in = Signal(OutSel, reset_less=True)
-        self.insn_in = Signal(32, reset_less=True)
-        self.reg_out = Data(5, "reg_o")
-        self.spr_out = Data(10, "spr_o")
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-
-        # select Register out field
-        with m.Switch(self.sel_in):
-            with m.Case(OutSel.RT):
-                comb += self.reg_out.data.eq(self.dec.RT[0:-1])
-                comb += self.reg_out.ok.eq(1)
-            with m.Case(OutSel.RA):
-                comb += self.reg_out.data.eq(self.dec.RA[0:-1])
-                comb += self.reg_out.ok.eq(1)
-            with m.Case(OutSel.SPR):
-                comb += self.spr_out.data.eq(self.dec.SPR[0:-1]) # from XFX
-                comb += self.spr_out.ok.eq(1)
-
-        return m
-
-
-class DecodeRC(Elaboratable):
-    """DecodeRc from instruction
-
-    decodes Record bit Rc
-    """
-    def __init__(self, dec):
-        self.dec = dec
-        self.sel_in = Signal(RC, reset_less=True)
-        self.insn_in = Signal(32, reset_less=True)
-        self.rc_out = Data(1, "rc")
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-
-        # select Record bit out field
-        with m.Switch(self.sel_in):
-            with m.Case(RC.RC):
-                comb += self.rc_out.data.eq(self.dec.Rc[0:-1])
-                comb += self.rc_out.ok.eq(1)
-            with m.Case(RC.ONE):
-                comb += self.rc_out.data.eq(1)
-                comb += self.rc_out.ok.eq(1)
-            with m.Case(RC.NONE):
-                comb += self.rc_out.data.eq(0)
-                comb += self.rc_out.ok.eq(1)
-
-        return m
-
-
-class DecodeOE(Elaboratable):
-    """DecodeOE from instruction
-
-    decodes OE field: uses RC decode detection which might not be good
-
-    -- For now, use "rc" in the decode table to decide whether oe exists.
-    -- This is not entirely correct architecturally: For mulhd and
-    -- mulhdu, the OE field is reserved. It remains to be seen what an
-    -- actual POWER9 does if we set it on those instructions, for now we
-    -- test that further down when assigning to the multiplier oe input.
-    """
-    def __init__(self, dec):
-        self.dec = dec
-        self.sel_in = Signal(RC, reset_less=True)
-        self.insn_in = Signal(32, reset_less=True)
-        self.oe_out = Data(1, "oe")
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-
-        # select OE bit out field
-        with m.Switch(self.sel_in):
-            with m.Case(RC.RC):
-                comb += self.oe_out.data.eq(self.dec.OE[0:-1])
-                comb += self.oe_out.ok.eq(1)
-
-        return m
-
-
-class XerBits:
-    def __init__(self):
-        self.ca = Signal(reset_less=True)
-        self.ca32 = Signal(reset_less=True)
-        self.ov = Signal(reset_less=True)
-        self.ov32 = Signal(reset_less=True)
-        self.so = Signal(reset_less=True)
-
-    def ports(self):
-        return [self.ca, self.ca32, self.ov, self.ov32, self.so, ]
-
-
-class Decode2ToExecute1Type:
-
-    def __init__(self):
-
-        self.valid = Signal(reset_less=True)
-        self.insn_type = Signal(InternalOp, reset_less=True)
-        self.nia = Signal(64, reset_less=True)
-        self.write_reg = Data(5, name="rego")
-        self.read_reg1 = Data(5, name="reg1")
-        self.read_reg2 = Data(5, name="reg2")
-        self.read_reg3 = Data(5, name="reg3")
-        self.imm_data = Data(64, name="imm")
-        self.write_spr = Data(10, name="spro")
-        self.read_spr1 = Data(10, name="spr1")
-        self.read_spr2 = Data(10, name="spr2")
-        #self.read_data1 = Signal(64, reset_less=True)
-        #self.read_data2 = Signal(64, reset_less=True)
-        #self.read_data3 = Signal(64, reset_less=True)
-        #self.cr = Signal(32, reset_less=True) # NO: this is from the CR SPR
-        #self.xerc = XerBits() # NO: this is from the XER SPR
-        self.lk = Signal(reset_less=True)
-        self.rc = Data(1, "rc")
-        self.oe = Data(1, "oe")
-        self.invert_a = Signal(reset_less=True)
-        self.invert_out = Signal(reset_less=True)
-        self.input_carry = Signal(CryIn, reset_less=True)
-        self.output_carry = Signal(reset_less=True)
-        self.input_cr = Signal(reset_less=True)
-        self.output_cr = Signal(reset_less=True)
-        self.is_32bit = Signal(reset_less=True)
-        self.is_signed = Signal(reset_less=True)
-        self.insn = Signal(32, reset_less=True)
-        self.data_len = Signal(4, reset_less=True) # bytes
-        self.byte_reverse  = Signal(reset_less=True)
-        self.sign_extend  = Signal(reset_less=True)# do we need this?
-        self.update  = Signal(reset_less=True) # is this an update instruction?
-
-    def ports(self):
-        return [self.valid, self.insn_type, self.nia,
-                #self.read_data1, self.read_data2, self.read_data3,
-                #self.cr,
-                self.lk,
-                self.invert_a, self.invert_out,
-                self.input_carry, self.output_carry,
-                self.input_cr, self.output_cr,
-                self.is_32bit, self.is_signed,
-                self.insn,
-                self.data_len, self.byte_reverse , self.sign_extend ,
-                self.update] + \
-                self.oe.ports() + \
-                self.rc.ports() + \
-                self.write_spr.ports() + \
-                self.read_spr1.ports() + \
-                self.read_spr2.ports() + \
-                self.write_reg.ports() + \
-                self.read_reg1.ports() + \
-                self.read_reg2.ports() + \
-                self.read_reg3.ports() + \
-                self.imm_data.ports()
-                # + self.xerc.ports()
-
-class PowerDecode2(Elaboratable):
-
-    def __init__(self, dec):
-
-        self.dec = dec
-        self.e = Decode2ToExecute1Type()
-
-    def ports(self):
-        return self.dec.ports() + self.e.ports()
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-
-        # set up submodule decoders
-        m.submodules.dec = self.dec
-        m.submodules.dec_a = dec_a = DecodeA(self.dec)
-        m.submodules.dec_b = dec_b = DecodeB(self.dec)
-        m.submodules.dec_c = dec_c = DecodeC(self.dec)
-        m.submodules.dec_o = dec_o = DecodeOut(self.dec)
-        m.submodules.dec_rc = dec_rc = DecodeRC(self.dec)
-        m.submodules.dec_oe = dec_oe = DecodeOE(self.dec)
-
-        # copy instruction through...
-        for i in [self.e.insn, dec_a.insn_in, dec_b.insn_in,
-                  dec_c.insn_in, dec_o.insn_in, dec_rc.insn_in,
-                  dec_oe.insn_in]:
-            comb += i.eq(self.dec.opcode_in)
-
-        # ...and subdecoders' input fields
-        comb += dec_a.sel_in.eq(self.dec.op.in1_sel)
-        comb += dec_b.sel_in.eq(self.dec.op.in2_sel)
-        comb += dec_c.sel_in.eq(self.dec.op.in3_sel)
-        comb += dec_o.sel_in.eq(self.dec.op.out_sel)
-        comb += dec_rc.sel_in.eq(self.dec.op.rc_sel)
-        comb += dec_oe.sel_in.eq(self.dec.op.rc_sel) # XXX should be OE sel
-
-        # decode LD/ST length
-        with m.Switch(self.dec.op.ldst_len):
-            with m.Case(LdstLen.is1B):
-                comb += self.e.data_len.eq(1)
-            with m.Case(LdstLen.is2B):
-                comb += self.e.data_len.eq(2)
-            with m.Case(LdstLen.is4B):
-                comb += self.e.data_len.eq(4)
-            with m.Case(LdstLen.is8B):
-                comb += self.e.data_len.eq(8)
-
-        #comb += self.e.nia.eq(self.dec.nia) # XXX TODO
-        itype = Mux(self.dec.op.function_unit == Function.NONE,
-                    InternalOp.OP_ILLEGAL,
-                    self.dec.op.internal_op)
-        comb += self.e.insn_type.eq(itype)
-
-        # registers a, b, c and out
-        comb += self.e.read_reg1.eq(dec_a.reg_out)
-        comb += self.e.read_reg2.eq(dec_b.reg_out)
-        comb += self.e.read_reg3.eq(dec_c.reg_out)
-        comb += self.e.write_reg.eq(dec_o.reg_out)
-        comb += self.e.imm_data.eq(dec_b.imm_out)
-
-        # rc and oe out
-        comb += self.e.rc.eq(dec_rc.rc_out)
-        comb += self.e.oe.eq(dec_oe.oe_out)
-
-        # SPRs out
-        comb += self.e.read_spr1.eq(dec_a.spr_out)
-        comb += self.e.read_spr2.eq(dec_b.spr_out)
-        comb += self.e.write_spr.eq(dec_o.spr_out)
-
-        # decoded/selected instruction flags
-        comb += self.e.invert_a.eq(self.dec.op.inv_a)
-        comb += self.e.invert_out.eq(self.dec.op.inv_out)
-        comb += self.e.input_carry.eq(self.dec.op.cry_in)
-        comb += self.e.output_carry.eq(self.dec.op.cry_out)
-        comb += self.e.is_32bit.eq(self.dec.op.is_32b)
-        comb += self.e.is_signed.eq(self.dec.op.sgn)
-        with m.If(self.dec.op.lk):
-            comb += self.e.lk.eq(self.dec.LK[0:-1]) # XXX TODO: accessor
-
-        comb += self.e.byte_reverse.eq(self.dec.op.br)
-        comb += self.e.sign_extend.eq(self.dec.op.sgn_ext)
-        comb += self.e.update.eq(self.dec.op.upd)
-
-        comb += self.e.input_cr.eq(self.dec.op.cr_in)
-        comb += self.e.output_cr.eq(self.dec.op.cr_out)
-
-        return m
-
-
-if __name__ == '__main__':
-    pdecode = create_pdecode()
-    dec2 = PowerDecode2(pdecode)
-    vl = rtlil.convert(dec2, ports=dec2.ports() + pdecode.ports())
-    with open("dec2.il", "w") as f:
-        f.write(vl)
-
diff --git a/src/decoder/power_enums.py b/src/decoder/power_enums.py
deleted file mode 100644
index dcf5cad2..00000000
--- a/src/decoder/power_enums.py
+++ /dev/null
@@ -1,229 +0,0 @@
-from enum import Enum, unique
-import csv
-import os
-import requests
-
-
-def get_csv(name):
-    file_dir = os.path.dirname(os.path.realpath(__file__))
-    file_path = os.path.join(file_dir, name)
-    if not os.path.isfile(file_path):
-        url = 'https://libre-riscv.org/openpower/isatables/' + name
-        r = requests.get(url, allow_redirects=True)
-        with open(file_path, 'w') as outfile:
-            outfile.write(r.content.decode("utf-8"))
-    with open(file_path, 'r') as csvfile:
-        reader = csv.DictReader(csvfile)
-        return list(reader)
-
-
-# names of the fields in the tables that don't correspond to an enum
-single_bit_flags = ['CR in', 'CR out', 'inv A', 'inv out',
-                    'cry out', 'BR', 'sgn ext', 'upd', 'rsrv', '32b',
-                    'sgn', 'lk', 'sgl pipe']
-
-# default values for fields in the table
-default_values = {'unit': "NONE", 'internal op': "OP_ILLEGAL",
-                   'in1': "RA", 'in2': 'NONE', 'in3': 'NONE', 'out': 'NONE',
-                   'ldst len': 'NONE',
-                   'rc' : 'NONE', 'cry in' : 'ZERO', 'form': 'NONE'}
-
-def get_signal_name(name):
-    if name[0].isdigit():
-        name = "is_" + name
-    return name.lower().replace(' ', '_')
-
-
-@unique
-class Function(Enum):
-    NONE = 0
-    ALU = 1
-    LDST = 2
-
-
-@unique
-class Form(Enum):
-    NONE = 0
-    I = 1
-    B = 2
-    SC = 3
-    D = 4
-    DS = 5
-    DQ = 6
-    DX = 7
-    X = 8
-    XL = 9
-    XFX = 10
-    XFL = 11
-    XX1 = 12
-    XX2 = 13
-    XX3 = 14
-    XX4 = 15
-    XS = 16
-    XO = 17
-    A = 18
-    M = 19
-    MD = 20
-    MDS = 21
-    VA = 22
-    VC = 23
-    VX = 24
-    EVX = 25
-    EVS = 26
-    Z22 = 27
-    Z23 = 28
-
-
-
-@unique
-class InternalOp(Enum):
-    OP_ILLEGAL = 0
-    OP_NOP = 1
-    OP_ADD = 2
-    OP_ADDPCIS = 3
-    OP_AND = 4
-    OP_ATTN = 5
-    OP_B = 6
-    OP_BC = 7
-    OP_BCREG = 8
-    OP_BPERM = 9
-    OP_CMP = 10
-    OP_CMPB = 11
-    OP_CMPEQB = 12
-    OP_CMPRB = 13
-    OP_CNTZ = 14
-    OP_CRAND = 15
-    OP_CRANDC = 16
-    OP_CREQV = 17
-    OP_CRNAND = 18
-    OP_CRNOR = 19
-    OP_CROR = 20
-    OP_CRORC = 21
-    OP_CRXOR = 22
-    OP_DARN = 23
-    OP_DCBF = 24
-    OP_DCBST = 25
-    OP_DCBT = 26
-    OP_DCBTST = 27
-    OP_DCBZ = 28
-    OP_DIV = 29
-    OP_DIVE = 30
-    OP_EXTS = 31
-    OP_EXTSWSLI = 32
-    OP_ICBI = 33
-    OP_ICBT = 34
-    OP_ISEL = 35
-    OP_ISYNC = 36
-    OP_LOAD = 37
-    OP_STORE = 38
-    OP_MADDHD = 39
-    OP_MADDHDU = 40
-    OP_MADDLD = 41
-    OP_MCRF = 42
-    OP_MCRXR = 43
-    OP_MCRXRX = 44
-    OP_MFCR = 45
-    OP_MFSPR = 46
-    OP_MOD = 47
-    OP_MTCRF = 48
-    OP_MTSPR = 49
-    OP_MUL_L64 = 50
-    OP_MUL_H64 = 51
-    OP_MUL_H32 = 52
-    OP_OR = 53
-    OP_POPCNT = 54
-    OP_PRTY = 55
-    OP_RLC = 56
-    OP_RLCL = 57
-    OP_RLCR = 58
-    OP_SETB = 59
-    OP_SHL = 60
-    OP_SHR = 61
-    OP_SYNC = 62
-    OP_TD = 63
-    OP_TDI = 64
-    OP_TW = 65
-    OP_TWI = 66
-    OP_XOR = 67
-    OP_SIM_CONFIG = 68
-
-
-@unique
-class In1Sel(Enum):
-    RA = 0
-    RA_OR_ZERO = 1
-    NONE = 2
-    SPR = 3
-
-
-@unique
-class In2Sel(Enum):
-    NONE = 0
-    RB = 1
-    CONST_UI = 2
-    CONST_SI = 3
-    CONST_UI_HI = 4
-    CONST_SI_HI = 5
-    CONST_LI = 6
-    CONST_BD = 7
-    CONST_DS = 8
-    CONST_M1 = 9
-    CONST_SH = 10
-    CONST_SH32 = 11
-    SPR = 12
-
-
-@unique
-class In3Sel(Enum):
-    NONE = 0
-    RS = 1
-
-
-@unique
-class OutSel(Enum):
-    NONE = 0
-    RT = 1
-    RA = 2
-    SPR = 3
-
-
-@unique
-class LdstLen(Enum):
-    NONE = 0
-    is1B = 1
-    is2B = 2
-    is4B = 3
-    is8B = 4
-
-
-@unique
-class RC(Enum):
-    NONE = 0
-    ONE = 1
-    RC = 2
-
-
-@unique
-class CryIn(Enum):
-    ZERO = 0
-    ONE = 1
-    CA = 2
-
-@unique
-class SPR(Enum):
-    XER    = 1
-    LR     = 8
-    CTR    = 9
-    TB     = 268
-    SRR0   = 26
-    SRR1   = 27
-    HSRR0  = 314
-    HSRR1  = 315
-    SPRG0  = 272
-    SPRG1  = 273
-    SPRG2  = 274
-    SPRG3  = 275
-    SPRG3U = 259
-    HSPRG0 = 304
-    HSPRG1 = 305
-
diff --git a/src/decoder/power_fields.py b/src/decoder/power_fields.py
deleted file mode 100644
index 3457331e..00000000
--- a/src/decoder/power_fields.py
+++ /dev/null
@@ -1,242 +0,0 @@
-from collections import OrderedDict, namedtuple
-
-
-class BitRange(OrderedDict):
-    """BitRange: remaps from straight indices (0,1,2..) to bit numbers
-    """
-    def __getitem__(self, subscript):
-        if isinstance(subscript, slice):
-            return list(self)[subscript]
-        else:
-            return self[subscript]
-
-def decode_instructions(form):
-    res = {}
-    accum = []
-    for l in form:
-        if l.strip().startswith("Formats"):
-            l = l.strip().split(":")[-1]
-            l = l.replace(" ", "")
-            l = l.split(",")
-            for fmt in l:
-                if fmt not in res:
-                    res[fmt] = [accum[0]]
-                else:
-                    res[fmt].append(accum[0])
-            accum = []
-        else:
-            accum.append(l.strip())
-    return res
-
-def decode_form_header(hdr):
-    res = {}
-    count = 0
-    hdr = hdr.strip()
-    print (hdr.split('|'))
-    for f in hdr.split("|"):
-        if not f:
-            continue
-        if f[0].isdigit():
-            idx = int(f.strip().split(' ')[0])
-            res[count] = idx
-        count += len(f) + 1
-    return res
-
-def find_unique(d, key):
-    if key not in d:
-        return key
-    idx = 1
-    while "%s_%d" % (key, idx) in d:
-        idx += 1
-    return "%s_%d" % (key, idx)
-
-
-def decode_line(header, line):
-    line = line.strip()
-    res = {}
-    count = 0
-    print ("line", line)
-    prev_fieldname = None
-    for f in line.split("|"):
-        if not f:
-            continue
-        end = count + len(f) + 1
-        fieldname = f.strip()
-        if not fieldname or fieldname.startswith('/'):
-            if prev_fieldname is not None:
-                res[prev_fieldname] = (res[prev_fieldname], header[count])
-                prev_fieldname = None
-            count = end
-            continue
-        bitstart = header[count]
-        if prev_fieldname is not None:
-            res[prev_fieldname] = (res[prev_fieldname], bitstart)
-        res[fieldname] = bitstart
-        count = end
-        prev_fieldname = fieldname
-    res[prev_fieldname] = (bitstart, 32)
-    return res
-
-
-def decode_form(form):
-    header = decode_form_header(form[0])
-    res = []
-    print ("header", header)
-    for line in form[1:]:
-        dec = decode_line(header, line)
-        if dec:
-            res.append(dec)
-    fields = {}
-    falternate = {}
-    for l in res:
-        for k, (start,end) in l.items():
-            if k in fields:
-                if (start, end) == fields[k]:
-                    continue # already in and matching for this Form
-                if k in falternate:
-                    alternate = "%s_%d" % (k, falternate[k])
-                    if (start, end) == fields[alternate]:
-                        continue
-                falternate[k] = fidx = falternate.get(k, 0) + 1
-                fields["%s_%d" % (k, fidx)] = (start, end)
-            else:
-                fields[k] = (start, end)
-    return fields
-
-
-class DecodeFields:
-
-    def __init__(self, bitkls=BitRange, bitargs=(), fname="fields.txt"):
-        self.bitkls = bitkls
-        self.bitargs = bitargs
-        self.fname = fname
-
-    def create_specs(self):
-        self.forms, self.instrs = self.decode_fields()
-        self.form_names = forms = self.instrs.keys()
-        for form in forms:
-            fields = self.instrs[form]
-            fk = fields.keys()
-            Fields = namedtuple("Fields", fk)
-            instr = Fields(**fields)
-            setattr(self, "Form%s" % form, instr)
-        # now add in some commonly-used fields (should be done automatically)
-        # note that these should only be ones which are the same on all Forms
-        # note: these are from microwatt insn_helpers.vhdl
-        self.RS = self.FormX.RS
-        self.RT = self.FormX.RT
-        self.RA = self.FormX.RA
-        self.RB = self.FormX.RB
-        self.SI = self.FormD.SI
-        self.UI = self.FormD.UI
-        self.L = self.FormD.L
-        self.SH32 = self.FormM.SH
-        self.sh = self.FormMD.sh
-        self.MB32 = self.FormM.MB
-        self.ME32 = self.FormM.ME
-        self.LI = self.FormI.LI
-        self.LK = self.FormI.LK
-        self.AA = self.FormB.AA
-        self.Rc = self.FormX.Rc
-        self.OE = self.FormXO.Rc
-        self.BD = self.FormB.BD
-        self.BF = self.FormX.BF
-        self.CR = self.FormXL.XO # used by further mcrf decoding
-        self.BB = self.FormXL.BB
-        self.BA = self.FormXL.BA
-        self.BT = self.FormXL.BT
-        self.FXM = self.FormXFX.FXM
-        self.BO = self.FormXL.BO
-        self.BI = self.FormXL.BI
-        self.BH = self.FormXL.BH
-        self.D = self.FormD.D
-        self.DS = self.FormDS.DS
-        self.TO = self.FormX.TO
-        self.BC = self.FormA.BC
-        self.SH = self.FormX.SH
-        self.ME = self.FormM.ME
-        self.MB = self.FormM.MB
-        self.SPR = self.FormXFX.SPR
-
-    def decode_fields(self):
-        with open(self.fname) as f:
-            txt = f.readlines()
-        forms = {}
-        reading_data = False
-        for l in txt:
-            print ("line", l)
-            l = l.strip()
-            if len(l) == 0:
-                continue
-            if reading_data:
-                if l[0] == '#':
-                    reading_data = False
-                else:
-                    forms[heading].append(l)
-            if not reading_data:
-                assert l[0] == '#'
-                heading = l[1:].strip()
-                #if heading.startswith('1.6.28'): # skip instr fields for now
-                    #break
-                heading = heading.split(' ')[-1]
-                print ("heading", heading)
-                reading_data = True
-                forms[heading] = []
-
-        res = {}
-        inst = {}
-
-        for hdr, form in forms.items():
-            print ("heading", hdr)
-            if heading == 'Fields':
-                i = decode_instructions(form)
-                for form, field in i.items():
-                    inst[form] = self.decode_instruction_fields(field)
-            #else:
-            #    res[hdr] = decode_form(form)
-        return res, inst
-
-    def decode_instruction_fields(self, fields):
-        res = {}
-        for field in fields:
-            f, spec = field.strip().split(" ")
-            d = self.bitkls(*self.bitargs)
-            idx = 0
-            for s in spec[1:-1].split(","):
-                s = s.split(':')
-                if len(s) == 1:
-                    d[idx] = int(s[0])
-                    idx += 1
-                else:
-                    start = int(s[0])
-                    end = int(s[1])
-                    while start <= end:
-                        d[idx] = start
-                        idx += 1
-                        start += 1
-            f = f.replace(",", "_")
-            unique = find_unique(res, f)
-            res[unique] = d
-
-        return res
-
-if __name__ == '__main__':
-    dec = DecodeFields()
-    dec.create_specs()
-    forms, instrs = dec.forms, dec.instrs
-    for hdr, form in forms.items():
-        print ()
-        print (hdr)
-        for k, v in form.items():
-            #print ("line", l)
-            #for k, v in l.items():
-            print ("%s: %d-%d" % (k, v[0], v[1]))
-    for form, field in instrs.items():
-        print ()
-        print (form)
-        for f, vals in field.items():
-            print ("    ", f, vals)
-    print (dec.FormX)
-    print (dec.FormX.A)
-    print (dir(dec.FormX))
-    print (dec.FormX._fields)
diff --git a/src/decoder/power_fieldsn.py b/src/decoder/power_fieldsn.py
deleted file mode 100644
index e603bbd3..00000000
--- a/src/decoder/power_fieldsn.py
+++ /dev/null
@@ -1,74 +0,0 @@
-from collections import OrderedDict
-from power_fields import DecodeFields, BitRange
-from nmigen import Module, Elaboratable, Signal, Cat
-from nmigen.cli import rtlil
-
-
-class SignalBitRange(BitRange):
-    def __init__(self, signal):
-        BitRange.__init__(self)
-        self.signal = signal
-
-    def __getitem__(self, subs):
-        # *sigh* field numberings are bit-inverted.  PowerISA 3.0B section 1.3.2
-        width = self.signal.shape()[0]
-        print (dir(self))
-        print (self.items())
-        if isinstance(subs, slice):
-            res = []
-            print (subs)
-            start, stop, step = subs.start, subs.stop, subs.step
-            if step is None:
-                step = 1
-            if start is None:
-                start = 0
-            if stop is None:
-                stop = -1
-            if start < 0:
-                start = len(self) - start - 1
-            if stop < 0:
-                stop = len(self) - stop - 1
-            print ("range", start, stop, step)
-            for t in range(start, stop, step):
-                k = OrderedDict.__getitem__(self, t)
-                print ("t", t, k)
-                res.append(self.signal[width-k-1])
-            return Cat(*res)
-        else:
-            k = OrderedDict.__getitem__(self, subs)
-            return self.signal[width-k-1]
-
-        print ("translated", subs, translated)
-
-
-class SigDecode(Elaboratable):
-
-    def __init__(self, width):
-        self.opcode_in = Signal(width, reset_less=False)
-        self.df = DecodeFields(SignalBitRange, [self.opcode_in])
-        self.df.create_specs()
-        self.x_s = Signal(len(self.df.FormX.S), reset_less=True)
-        self.x_sh = Signal(len(self.df.FormX.SH), reset_less=True)
-        self.dq_xs_s = Signal(len(self.df.FormDQ.SX_S), reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        comb += self.x_s.eq(self.df.FormX.S[0])
-        comb += self.x_sh.eq(self.df.FormX.SH[0:-1])
-        comb += self.dq_xs_s.eq(self.df.FormDQ.SX_S[0:-1])
-        return m
-
-    def ports(self):
-        return [self.opcode_in, self.x_s, self.x_sh]
-
-def create_sigdecode():
-    s = SigDecode(32)
-    return s
-
-if __name__ == '__main__':
-    sigdecode = create_sigdecode()
-    vl = rtlil.convert(sigdecode, ports=sigdecode.ports())
-    with open("decoder.il", "w") as f:
-        f.write(vl)
-
diff --git a/src/decoder/test/test_power_decoder.py b/src/decoder/test/test_power_decoder.py
deleted file mode 100644
index f64f4b96..00000000
--- a/src/decoder/test/test_power_decoder.py
+++ /dev/null
@@ -1,130 +0,0 @@
-from nmigen import Module, Signal
-from nmigen.back.pysim import Simulator, Delay
-from nmigen.test.utils import FHDLTestCase
-from nmigen.cli import rtlil
-import sys
-import os
-import unittest
-sys.path.append("../")
-from power_decoder import (PowerDecoder, pdecode)
-from power_enums import (Function, InternalOp, In1Sel, In2Sel, In3Sel,
-                         OutSel, RC, LdstLen, CryIn, single_bit_flags,
-                         get_signal_name, get_csv)
-
-
-class DecoderTestCase(FHDLTestCase):
-
-    def run_tst(self, bitsel, csvname, minor=None, suffix=None, opint=True):
-        m = Module()
-        comb = m.d.comb
-        opcode = Signal(32)
-        function_unit = Signal(Function)
-        internal_op = Signal(InternalOp)
-        in1_sel = Signal(In1Sel)
-        in2_sel = Signal(In2Sel)
-        in3_sel = Signal(In3Sel)
-        out_sel = Signal(OutSel)
-        rc_sel = Signal(RC)
-        ldst_len = Signal(LdstLen)
-        cry_in = Signal(CryIn)
-
-        # opcodes = get_csv(csvname)
-        # m.submodules.dut = dut = PowerDecoder(32, opcodes, bitsel=bitsel,
-        #                                       opint=opint, suffix=suffix)
-        m.submodules.dut = dut = pdecode
-        comb += [dut.opcode_in.eq(opcode),
-                 function_unit.eq(dut.op.function_unit),
-                 in1_sel.eq(dut.op.in1_sel),
-                 in2_sel.eq(dut.op.in2_sel),
-                 in3_sel.eq(dut.op.in3_sel),
-                 out_sel.eq(dut.op.out_sel),
-                 rc_sel.eq(dut.op.rc_sel),
-                 ldst_len.eq(dut.op.ldst_len),
-                 cry_in.eq(dut.op.cry_in),
-                 internal_op.eq(dut.op.internal_op)]
-
-        sim = Simulator(m)
-        opcodes = get_csv(csvname)
-
-        def process():
-            for row in opcodes:
-                if not row['unit']:
-                    continue
-                op = row['opcode']
-                if not opint: # HACK: convert 001---10 to 0b00100010
-                    op = "0b" + op.replace('-', '0')
-                print ("opint", opint, row['opcode'], op)
-                print(row)
-                yield opcode.eq(0)
-                yield opcode[bitsel[0]:bitsel[1]].eq(int(op, 0))
-                if minor:
-                    print(minor)
-                    minorbits = minor[1]
-                    yield opcode[minorbits[0]:minorbits[1]].eq(minor[0])
-                yield Delay(1e-6)
-                signals = [(function_unit, Function, 'unit'),
-                           (internal_op, InternalOp, 'internal op'),
-                           (in1_sel, In1Sel, 'in1'),
-                           (in2_sel, In2Sel, 'in2'),
-                           (in3_sel, In3Sel, 'in3'),
-                           (out_sel, OutSel, 'out'),
-                           (rc_sel, RC, 'rc'),
-                           (cry_in, CryIn, 'cry in'),
-                           (ldst_len, LdstLen, 'ldst len')]
-                for sig, enm, name in signals:
-                    result = yield sig
-                    expected = enm[row[name]]
-                    msg = f"{sig.name} == {enm(result)}, expected: {expected}"
-                    self.assertEqual(enm(result), expected, msg)
-                for bit in single_bit_flags:
-                    sig = getattr(dut.op, get_signal_name(bit))
-                    result = yield sig
-                    expected = int(row[bit])
-                    msg = f"{sig.name} == {result}, expected: {expected}"
-                    self.assertEqual(expected, result, msg)
-        sim.add_process(process)
-        prefix = os.path.splitext(csvname)[0]
-        with sim.write_vcd("%s.vcd" % prefix, "%s.gtkw" % prefix, traces=[
-                opcode, function_unit, internal_op,
-                in1_sel, in2_sel]):
-            sim.run()
-
-    def generate_ilang(self):
-        vl = rtlil.convert(pdecode, ports=pdecode.ports())
-        with open("decoder.il", "w") as f:
-            f.write(vl)
-
-    def test_major(self):
-        self.run_tst((26, 32), "major.csv")
-        self.generate_ilang()
-
-    def test_minor_19(self):
-        self.run_tst((1, 11), "minor_19.csv", minor=(19, (26, 32)),
-                     suffix=(0, 5))
-
-    # def test_minor_19_00000(self):
-    #     self.run_tst((1, 11), "minor_19_00000.csv")
-
-    def test_minor_30(self):
-        self.run_tst((1, 5), "minor_30.csv", minor=(30, (26, 32)))
-
-    def test_minor_31(self):
-        self.run_tst((1, 11), "minor_31.csv", minor=(31, (26, 32)))
-
-    def test_minor_58(self):
-        self.run_tst((0, 2), "minor_58.csv", minor=(58, (26, 32)))
-
-    def test_minor_62(self):
-        self.run_tst((0, 2), "minor_62.csv", minor=(62, (26, 32)))
-
-
-    # #def test_minor_31_prefix(self):
-    # #    self.run_tst(10, "minor_31.csv", suffix=(5, 10))
-
-    # def test_extra(self):
-    #     self.run_tst(32, "extra.csv", opint=False)
-    #     self.generate_ilang(32, "extra.csv", opint=False)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/src/experiment/alu_hier.py b/src/experiment/alu_hier.py
deleted file mode 100644
index 9659059c..00000000
--- a/src/experiment/alu_hier.py
+++ /dev/null
@@ -1,239 +0,0 @@
-from nmigen import Elaboratable, Signal, Module, Const, Mux
-from nmigen.cli import main
-from nmigen.cli import verilog, rtlil
-
-import operator
-
-
-class Adder(Elaboratable):
-    def __init__(self, width):
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.d.comb += self.o.eq(self.a + self.b)
-        return m
-
-
-class Subtractor(Elaboratable):
-    def __init__(self, width):
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.d.comb += self.o.eq(self.a - self.b)
-        return m
-
-
-class Multiplier(Elaboratable):
-    def __init__(self, width):
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.d.comb += self.o.eq(self.a * self.b)
-        return m
-
-
-class Shifter(Elaboratable):
-    def __init__(self, width):
-        self.width = width
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
-
-    def elaborate(self, platform):
-        m = Module()
-        btrunc = Signal(self.width)
-        m.d.comb += btrunc.eq(self.b & Const((1<<self.width)-1))
-        m.d.comb += self.o.eq(self.a >> btrunc)
-        return m
-
-
-class ALU(Elaboratable):
-    def __init__(self, width):
-        self.p_valid_i = Signal()
-        self.p_ready_o = Signal()
-        self.n_ready_i = Signal()
-        self.n_valid_o = Signal()
-        self.counter   = Signal(4)
-        self.op  = Signal(2)
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
-        self.width = width
-
-    def elaborate(self, platform):
-        m = Module()
-        add = Adder(self.width)
-        sub = Subtractor(self.width)
-        mul = Multiplier(self.width)
-        shf = Shifter(self.width)
-
-        m.submodules.add = add
-        m.submodules.sub = sub
-        m.submodules.mul = mul
-        m.submodules.shf = shf
-        for mod in [add, sub, mul, shf]:
-            m.d.comb += [
-                mod.a.eq(self.a),
-                mod.b.eq(self.b),
-            ]
-        go_now = Signal(reset_less=True) # testing no-delay ALU
-
-        with m.If(self.p_valid_i):
-            # input is valid. next check, if we already said "ready" or not
-            with m.If(~self.p_ready_o):
-                # we didn't say "ready" yet, so say so and initialise
-                m.d.sync += self.p_ready_o.eq(1)
-
-                # as this is a "fake" pipeline, just grab the output right now
-                with m.Switch(self.op):
-                    for i, mod in enumerate([add, sub, mul, shf]):
-                        with m.Case(i):
-                            m.d.sync += self.o.eq(mod.o)
-                with m.If(self.op == 2): # MUL, to take 5 instructions
-                    m.d.sync += self.counter.eq(5)
-                with m.Elif(self.op == 3): # SHIFT to take 7
-                    m.d.sync += self.counter.eq(7)
-                with m.Elif(self.op == 1): # SUB to take 1, straight away
-                    m.d.sync += self.counter.eq(1)
-                    m.d.comb += go_now.eq(1)
-                with m.Else(): # ADD to take 2
-                    m.d.sync += self.counter.eq(2)
-        with m.Else():
-            # input says no longer valid, so drop ready as well.
-            # a "proper" ALU would have had to sync in the opcode and a/b ops
-            m.d.sync += self.p_ready_o.eq(0)
-
-        # ok so the counter's running: when it gets to 1, fire the output
-        with m.If((self.counter == 1) | go_now):
-            # set the output as valid if the recipient is ready for it
-            m.d.sync += self.n_valid_o.eq(1)
-        with m.If(self.n_ready_i & self.n_valid_o):
-            m.d.sync += self.n_valid_o.eq(0)
-            # recipient said it was ready: reset back to known-good.
-            m.d.sync += self.counter.eq(0) # reset the counter
-            m.d.sync += self.o.eq(0) # clear the output for tidiness sake
-
-        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
-        with m.If(self.counter > 1):
-            m.d.sync += self.counter.eq(self.counter - 1)
-
-        return m
-
-    def __iter__(self):
-        yield self.op
-        yield self.a
-        yield self.b
-        yield self.o
-
-    def ports(self):
-        return list(self)
-
-
-class BranchOp(Elaboratable):
-    def __init__(self, width, op):
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
-        self.op = op
-
-    def elaborate(self, platform):
-        m = Module()
-        m.d.comb += self.o.eq(Mux(self.op(self.a, self.b), 1, 0))
-        return m
-
-
-class BranchALU(Elaboratable):
-    def __init__(self, width):
-        self.p_valid_i = Signal()
-        self.p_ready_o = Signal()
-        self.n_ready_i = Signal()
-        self.n_valid_o = Signal()
-        self.counter   = Signal(4)
-        self.op  = Signal(2)
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
-        self.width = width
-
-    def elaborate(self, platform):
-        m = Module()
-        bgt = BranchOp(self.width, operator.gt)
-        blt = BranchOp(self.width, operator.lt)
-        beq = BranchOp(self.width, operator.eq)
-        bne = BranchOp(self.width, operator.ne)
-
-        m.submodules.bgt = bgt
-        m.submodules.blt = blt
-        m.submodules.beq = beq
-        m.submodules.bne = bne
-        for mod in [bgt, blt, beq, bne]:
-            m.d.comb += [
-                mod.a.eq(self.a),
-                mod.b.eq(self.b),
-            ]
-
-        go_now = Signal(reset_less=True) # testing no-delay ALU
-        with m.If(self.p_valid_i):
-            # input is valid. next check, if we already said "ready" or not
-            with m.If(~self.p_ready_o):
-                # we didn't say "ready" yet, so say so and initialise
-                m.d.sync += self.p_ready_o.eq(1)
-
-                # as this is a "fake" pipeline, just grab the output right now
-                with m.Switch(self.op):
-                    for i, mod in enumerate([bgt, blt, beq, bne]):
-                        with m.Case(i):
-                            m.d.sync += self.o.eq(mod.o)
-                m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake)
-                #m.d.comb += go_now.eq(1)
-        with m.Else():
-            # input says no longer valid, so drop ready as well.
-            # a "proper" ALU would have had to sync in the opcode and a/b ops
-            m.d.sync += self.p_ready_o.eq(0)
-
-        # ok so the counter's running: when it gets to 1, fire the output
-        with m.If((self.counter == 1) | go_now):
-            # set the output as valid if the recipient is ready for it
-            m.d.sync += self.n_valid_o.eq(1)
-        with m.If(self.n_ready_i & self.n_valid_o):
-            m.d.sync += self.n_valid_o.eq(0)
-            # recipient said it was ready: reset back to known-good.
-            m.d.sync += self.counter.eq(0) # reset the counter
-            m.d.sync += self.o.eq(0) # clear the output for tidiness sake
-
-        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
-        with m.If(self.counter > 1):
-            m.d.sync += self.counter.eq(self.counter - 1)
-
-        return m
-
-    def __iter__(self):
-        yield self.op
-        yield self.a
-        yield self.b
-        yield self.o
-
-    def ports(self):
-        return list(self)
-
-
-if __name__ == "__main__":
-    alu = ALU(width=16)
-    vl = rtlil.convert(alu, ports=alu.ports())
-    with open("test_alu.il", "w") as f:
-        f.write(vl)
-
-    alu = BranchALU(width=16)
-    vl = rtlil.convert(alu, ports=alu.ports())
-    with open("test_branch_alu.il", "w") as f:
-        f.write(vl)
-
diff --git a/src/experiment/compalu.py b/src/experiment/compalu.py
deleted file mode 100644
index 7da6b5cf..00000000
--- a/src/experiment/compalu.py
+++ /dev/null
@@ -1,207 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Mux, Elaboratable
-
-from nmutil.latch import SRLatch, latchregister
-
-""" Computation Unit (aka "ALU Manager").
-
-    This module runs a "revolving door" set of three latches, based on
-    * Issue
-    * Go_Read
-    * Go_Write
-    where one of them cannot be set on any given cycle.
-    (Note however that opc_l has been inverted (and qn used), due to SRLatch
-     default reset state being "0" rather than "1")
-
-    * When issue is first raised, a busy signal is sent out.
-      The src1 and src2 registers and the operand can be latched in
-      at this point
-
-    * Read request is set, which is acknowledged through the Scoreboard
-      to the priority picker, which generates (one and only one) Go_Read
-      at a time.  One of those will (eventually) be this Computation Unit.
-
-    * Once Go_Read is set, the src1/src2/operand latch door shuts (locking
-      src1/src2/operand in place), and the ALU is told to proceed.
-
-    * As this is currently a "demo" unit, a countdown timer is activated
-      to simulate an ALU "pipeline", which activates "write request release",
-      and the ALU's output is captured into a temporary register.
-
-    * Write request release will go through a similar process as Read request,
-      resulting (eventually) in Go_Write being asserted.
-
-    * When Go_Write is asserted, two things happen: (1) the data in the temp
-      register is placed combinatorially onto the output, and (2) the
-      req_l latch is cleared, busy is dropped, and the Comp Unit is back
-      through its revolving door to do another task.
-
-    Notes on oper_i:
-
-    * bits[0:2] are for the ALU, add=0, sub=1, shift=2, mul=3
-    * bit[2] are the immediate (bit[2]=1 == immediate mode)
-"""
-
-class ComputationUnitNoDelay(Elaboratable):
-    def __init__(self, rwid, opwid, alu):
-        self.opwid = opwid
-        self.rwid = rwid
-        self.alu = alu
-
-        self.counter = Signal(4)
-        self.go_rd_i = Signal(reset_less=True) # go read in
-        self.go_wr_i = Signal(reset_less=True) # go write in
-        self.issue_i = Signal(reset_less=True) # fn issue in
-        self.shadown_i = Signal(reset=1) # shadow function, defaults to ON
-        self.go_die_i = Signal() # go die (reset)
-
-        self.oper_i = Signal(opwid, reset_less=True) # opcode in
-        self.imm_i = Signal(rwid, reset_less=True) # immediate in
-        self.src1_i = Signal(rwid, reset_less=True) # oper1 in
-        self.src2_i = Signal(rwid, reset_less=True) # oper2 in
-
-        self.busy_o = Signal(reset_less=True) # fn busy out
-        self.data_o = Signal(rwid, reset_less=True) # Dest out
-        self.rd_rel_o = Signal(reset_less=True) # release src1/src2 request
-        self.req_rel_o = Signal(reset_less=True) # release request out (valid_o)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.alu = self.alu
-        m.submodules.src_l = src_l = SRLatch(sync=False)
-        m.submodules.opc_l = opc_l = SRLatch(sync=False)
-        m.submodules.req_l = req_l = SRLatch(sync=False)
-
-        # shadow/go_die
-        reset_w = Signal(reset_less=True)
-        reset_r = Signal(reset_less=True)
-        m.d.comb += reset_w.eq(self.go_wr_i | self.go_die_i)
-        m.d.comb += reset_r.eq(self.go_rd_i | self.go_die_i)
-
-        # This is fascinating and very important to observe that this
-        # is in effect a "3-way revolving door".  At no time may all 3
-        # latches be set at the same time.
-
-        # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
-        m.d.sync += opc_l.s.eq(self.issue_i) # XXX NOTE: INVERTED FROM book!
-        m.d.sync += opc_l.r.eq(reset_w)      # XXX NOTE: INVERTED FROM book!
-
-        # src operand latch (not using go_wr_i)
-        m.d.sync += src_l.s.eq(self.issue_i)
-        m.d.sync += src_l.r.eq(reset_r)
-
-        # dest operand latch (not using issue_i)
-        m.d.sync += req_l.s.eq(self.go_rd_i)
-        m.d.sync += req_l.r.eq(reset_w)
-
-
-        # create a latch/register for the operand
-        oper_r = Signal(self.opwid+1, reset_less=True) # opcode reg
-        latchregister(m, self.oper_i, oper_r, self.issue_i)
-
-        # and one for the output from the ALU
-        data_r = Signal(self.rwid, reset_less=True) # Dest register
-        latchregister(m, self.alu.o, data_r, req_l.q)
-
-        # get the top 2 bits for the ALU
-        m.d.comb += self.alu.op.eq(oper_r[0:2])
-
-        # 3rd bit is whether this is an immediate or not
-        op_is_imm = Signal(reset_less=True)
-        m.d.comb += op_is_imm.eq(oper_r[2])
-
-        # select immediate if opcode says so.  however also change the latch
-        # to trigger *from* the opcode latch instead.
-        src2_or_imm = Signal(self.rwid, reset_less=True)
-        src_sel = Signal(reset_less=True)
-        m.d.comb += src_sel.eq(Mux(op_is_imm, opc_l.qn, src_l.q))
-        m.d.comb += src2_or_imm.eq(Mux(op_is_imm, self.imm_i, self.src2_i))
-
-        # create a latch/register for src1/src2
-        latchregister(m, self.src1_i, self.alu.a, src_l.q)
-        latchregister(m, src2_or_imm, self.alu.b, src_sel)
-
-        # -----
-        # outputs
-        # -----
-
-        # all request signals gated by busy_o.  prevents picker problems
-        busy_o = self.busy_o
-        m.d.comb += busy_o.eq(opc_l.q) # busy out
-        m.d.comb += self.rd_rel_o.eq(src_l.q & busy_o) # src1/src2 req rel
-
-        # on a go_read, tell the ALU we're accepting data.
-        # NOTE: this spells TROUBLE if the ALU isn't ready!
-        # go_read is only valid for one clock!
-        with m.If(self.go_rd_i):                     # src operands ready, GO!
-            with m.If(~self.alu.p_ready_o):          # no ACK yet
-                m.d.comb += self.alu.p_valid_i.eq(1) # so indicate valid
-
-        # only proceed if ALU says its output is valid
-        with m.If(self.alu.n_valid_o):
-            # when ALU ready, write req release out. waits for shadow
-            m.d.comb += self.req_rel_o.eq(req_l.q & busy_o & self.shadown_i)
-            # when output latch is ready, and ALU says ready, accept ALU output
-            with m.If(self.req_rel_o):
-                m.d.comb += self.alu.n_ready_i.eq(1) # tells ALU "thanks got it"
-
-        # output the data from the latch on go_write
-        with m.If(self.go_wr_i):
-            m.d.comb += self.data_o.eq(data_r)
-
-        return m
-
-    def __iter__(self):
-        yield self.go_rd_i
-        yield self.go_wr_i
-        yield self.issue_i
-        yield self.shadown_i
-        yield self.go_die_i
-        yield self.oper_i
-        yield self.imm_i
-        yield self.src1_i
-        yield self.src2_i
-        yield self.busy_o
-        yield self.rd_rel_o
-        yield self.req_rel_o
-        yield self.data_o
-
-    def ports(self):
-        return list(self)
-
-
-def scoreboard_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_read_i.eq(1)
-    yield
-    yield dut.go_read_i.eq(0)
-    yield
-    yield dut.go_write_i.eq(1)
-    yield
-    yield dut.go_write_i.eq(0)
-    yield
-
-def test_scoreboard():
-    from alu_hier import ALU
-    alu = ALU(16)
-    dut = ComputationUnitNoDelay(16, 8, alu)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_compalu.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, scoreboard_sim(dut), vcd_name='test_compalu.vcd')
-
-if __name__ == '__main__':
-    test_scoreboard()
diff --git a/src/experiment/compldst.py b/src/experiment/compldst.py
deleted file mode 100644
index 77ad39dd..00000000
--- a/src/experiment/compldst.py
+++ /dev/null
@@ -1,288 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Mux, Cat, Elaboratable
-
-from nmutil.latch import SRLatch, latchregister
-
-""" LOAD / STORE Computation Unit.  Also capable of doing ADD and ADD immediate
-
-    This module runs a "revolving door" set of four latches, based on
-    * Issue
-    * Go_Read
-    * Go_Addr
-    * Go_Write *OR* Go_Store
-
-    (Note that opc_l has been inverted (and qn used), due to SRLatch
-     default reset state being "0" rather than "1")
-"""
-
-# internal opcodes.  hypothetically this could do more combinations.
-# meanings:
-# * bit 0: 0 = ADD , 1 = SUB
-# * bit 1: 0 = src1, 1 = IMM
-# * bit 2: 1 = LD
-# * bit 3: 1 = ST
-LDST_OP_ADDI = 0b0000 # plain ADD (src1 + src2)
-LDST_OP_SUBI = 0b0001 # plain SUB (src1 - src2)
-LDST_OP_ADD  = 0b0010 # immed ADD (imm + src1)
-LDST_OP_SUB  = 0b0011 # immed SUB (imm - src1)
-LDST_OP_ST   = 0b0110 # immed ADD plus LD op.  ADD result is address
-LDST_OP_LD   = 0b1010 # immed ADD plus ST op.  ADD result is address
-
-
-class LDSTCompUnit(Elaboratable):
-    """ LOAD / STORE / ADD / SUB Computation Unit
-
-        Inputs
-        ------
-
-        * :rwid:   register width
-        * :alu:    an ALU module
-        * :mem:    a Memory Module (read-write capable)
-
-        Control Signals (In)
-        --------------------
-
-        * :issue_i:    LD/ST is being "issued".
-        * :isalu_i:    ADD/SUB is being "issued" (aka issue_alu_i)
-        * :shadown_i:  Inverted-shadow is being held (stops STORE *and* WRITE)
-        * :go_rd_i:    read is being actioned (latches in src regs)
-        * :go_ad_i:    address is being actioned (triggers actual mem LD)
-        * :go_st_i:    store is being actioned (triggers actual mem STORE)
-        * :go_die_i:   resets the unit back to "wait for issue"
-    """
-    def __init__(self, rwid, opwid, alu, mem):
-        self.opwid = opwid
-        self.rwid = rwid
-        self.alu = alu
-        self.mem = mem
-
-        self.counter = Signal(4)
-        self.go_rd_i = Signal(reset_less=True) # go read in
-        self.go_ad_i = Signal(reset_less=True) # go address in
-        self.go_wr_i = Signal(reset_less=True) # go write in
-        self.go_st_i = Signal(reset_less=True) # go store in
-        self.issue_i = Signal(reset_less=True) # fn issue in
-        self.isalu_i = Signal(reset_less=True) # fn issue as ALU in
-        self.shadown_i = Signal(reset=1) # shadow function, defaults to ON
-        self.go_die_i = Signal() # go die (reset)
-
-        self.oper_i = Signal(opwid, reset_less=True) # opcode in
-        self.imm_i = Signal(rwid, reset_less=True) # immediate in
-        self.src1_i = Signal(rwid, reset_less=True) # oper1 in
-        self.src2_i = Signal(rwid, reset_less=True) # oper2 in
-
-        self.busy_o = Signal(reset_less=True)       # fn busy out
-        self.rd_rel_o = Signal(reset_less=True) # request src1/src2
-        self.adr_rel_o = Signal(reset_less=True) # request address (from mem)
-        self.sto_rel_o = Signal(reset_less=True) # request store (to mem)
-        self.req_rel_o = Signal(reset_less=True) # request write (result)
-        self.data_o = Signal(rwid, reset_less=True) # Dest out (LD or ALU)
-        self.addr_o = Signal(rwid, reset_less=True) # Address out (LD or ST)
-
-        # hmm... TODO... move these to outside of LDSTCompUnit
-        self.load_mem_o = Signal(reset_less=True) # activate memory LOAD
-        self.stwd_mem_o = Signal(reset_less=True) # activate memory STORE
-        self.ld_o = Signal(reset_less=True) # operation is a LD
-        self.st_o = Signal(reset_less=True) # operation is a ST
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        sync = m.d.sync
-
-        m.submodules.alu = self.alu
-        m.submodules.src_l = src_l = SRLatch(sync=False)
-        m.submodules.opc_l = opc_l = SRLatch(sync=False)
-        m.submodules.adr_l = adr_l = SRLatch(sync=False)
-        m.submodules.req_l = req_l = SRLatch(sync=False)
-        m.submodules.sto_l = sto_l = SRLatch(sync=False)
-
-        # shadow/go_die
-        reset_b = Signal(reset_less=True)
-        reset_w = Signal(reset_less=True)
-        reset_a = Signal(reset_less=True)
-        reset_s = Signal(reset_less=True)
-        reset_r = Signal(reset_less=True)
-        comb += reset_b.eq(self.go_st_i | self.go_wr_i | self.go_die_i)
-        comb += reset_w.eq(self.go_wr_i | self.go_die_i)
-        comb += reset_s.eq(self.go_st_i | self.go_die_i)
-        comb += reset_r.eq(self.go_rd_i | self.go_die_i)
-        # this one is slightly different, issue_alu_i selects go_wr_i)
-        a_sel = Mux(self.isalu_i, self.go_wr_i, self.go_ad_i)
-        comb += reset_a.eq(a_sel| self.go_die_i)
-
-        # opcode decode
-        op_alu = Signal(reset_less=True)
-        op_is_ld = Signal(reset_less=True)
-        op_is_st = Signal(reset_less=True)
-        op_ldst = Signal(reset_less=True)
-        op_is_imm = Signal(reset_less=True)
-
-        # select immediate or src2 reg to add
-        src2_or_imm = Signal(self.rwid, reset_less=True)
-        src_sel = Signal(reset_less=True)
-
-        # issue can be either issue_i or issue_alu_i (isalu_i)
-        issue_i = Signal(reset_less=True)
-        comb += issue_i.eq(self.issue_i | self.isalu_i)
-
-        # Ripple-down the latches, each one set cancels the previous.
-        # NOTE: use sync to stop combinatorial loops.
-
-        # opcode latch - inverted so that busy resets to 0
-        sync += opc_l.s.eq(issue_i) # XXX NOTE: INVERTED FROM book!
-        sync += opc_l.r.eq(reset_b) # XXX NOTE: INVERTED FROM book!
-
-        # src operand latch
-        sync += src_l.s.eq(issue_i)
-        sync += src_l.r.eq(reset_r)
-
-        # addr latch
-        sync += adr_l.s.eq(self.go_rd_i)
-        sync += adr_l.r.eq(reset_a)
-
-        # dest operand latch
-        sync += req_l.s.eq(self.go_ad_i)
-        sync += req_l.r.eq(reset_w)
-
-        # store latch
-        sync += sto_l.s.eq(self.go_ad_i)
-        sync += sto_l.r.eq(reset_s)
-
-        # outputs: busy and release signals
-        busy_o = self.busy_o
-        comb += self.busy_o.eq(opc_l.q) # busy out
-        comb += self.rd_rel_o.eq(src_l.q & busy_o) # src1/src2 req rel
-        comb += self.sto_rel_o.eq(sto_l.q & busy_o & self.shadown_i & op_is_st)
-
-        # request release enabled based on if op is a LD/ST or a plain ALU
-        # if op is an ADD/SUB or a LD, req_rel activates.
-        wr_q = Signal(reset_less=True)
-        comb += wr_q.eq(req_l.q & (~op_ldst | op_is_ld))
-
-        alulatch = Signal(reset_less=True)
-        comb += alulatch.eq((op_ldst & self.adr_rel_o) | \
-                            (~op_ldst & self.req_rel_o))
-
-        # only proceed if ALU says its output is valid
-        with m.If(self.alu.n_valid_o):
-
-            # write req release out.  waits until shadow is dropped.
-            comb += self.req_rel_o.eq(wr_q & busy_o & self.shadown_i)
-            # address release only happens on LD/ST, and is shadowed.
-            comb += self.adr_rel_o.eq(adr_l.q & op_ldst & busy_o & \
-                                      self.shadown_i)
-            # when output latch is ready, and ALU says ready, accept ALU output
-            with m.If(self.req_rel_o):
-                m.d.comb += self.alu.n_ready_i.eq(1) # tells ALU "thanks got it"
-
-        # select immediate if opcode says so.  however also change the latch
-        # to trigger *from* the opcode latch instead.
-        comb += src_sel.eq(Mux(op_is_imm, opc_l.qn, src_l.q))
-        comb += src2_or_imm.eq(Mux(op_is_imm, self.imm_i, self.src2_i))
-
-        # create a latch/register for src1/src2 (include immediate select)
-        latchregister(m, self.src1_i, self.alu.a, src_l.q)
-        latchregister(m, src2_or_imm, self.alu.b, src_sel)
-
-        # create a latch/register for the operand
-        oper_r = Signal(self.opwid, reset_less=True) # Dest register
-        latchregister(m, self.oper_i, oper_r, self.issue_i)
-        alu_op = Cat(op_alu, 0, op_is_imm) # using alu_hier, here.
-        comb += self.alu.op.eq(alu_op)
-
-        # and one for the output from the ALU
-        data_r = Signal(self.rwid, reset_less=True) # Dest register
-        latchregister(m, self.alu.o, data_r, alulatch)
-
-        # decode bits of operand (latched)
-        comb += op_alu.eq(oper_r[0])
-        comb += op_is_imm.eq(oper_r[1])
-        comb += op_is_ld.eq(oper_r[2])
-        comb += op_is_st.eq(oper_r[3])
-        comb += op_ldst.eq(op_is_ld | op_is_st)
-        comb += self.load_mem_o.eq(op_is_ld & self.go_ad_i)
-        comb += self.stwd_mem_o.eq(op_is_st & self.go_st_i)
-        comb += self.ld_o.eq(op_is_ld)
-        comb += self.st_o.eq(op_is_st)
-
-        # on a go_read, tell the ALU we're accepting data.
-        # NOTE: this spells TROUBLE if the ALU isn't ready!
-        # go_read is only valid for one clock!
-        with m.If(self.go_rd_i):                     # src operands ready, GO!
-            with m.If(~self.alu.p_ready_o):          # no ACK yet
-                m.d.comb += self.alu.p_valid_i.eq(1) # so indicate valid
-
-        # put the register directly onto the output bus on a go_write
-        with m.If(self.go_wr_i):
-            comb += self.data_o.eq(data_r)
-
-        # put the register directly onto the address bus
-        with m.If(self.go_ad_i):
-            comb += self.addr_o.eq(data_r)
-
-        return m
-
-    def __iter__(self):
-        yield self.go_rd_i
-        yield self.go_ad_i
-        yield self.go_wr_i
-        yield self.go_st_i
-        yield self.issue_i
-        yield self.isalu_i
-        yield self.shadown_i
-        yield self.go_die_i
-        yield self.oper_i
-        yield self.imm_i
-        yield self.src1_i
-        yield self.src2_i
-        yield self.busy_o
-        yield self.rd_rel_o
-        yield self.adr_rel_o
-        yield self.sto_rel_o
-        yield self.req_rel_o
-        yield self.data_o
-        yield self.load_mem_o
-        yield self.stwd_mem_o
-
-    def ports(self):
-        return list(self)
-
-
-def scoreboard_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_read_i.eq(1)
-    yield
-    yield dut.go_read_i.eq(0)
-    yield
-    yield dut.go_write_i.eq(1)
-    yield
-    yield dut.go_write_i.eq(0)
-    yield
-
-
-def test_scoreboard():
-    from alu_hier import ALU
-    alu = ALU(16)
-    mem = alu # fake
-    dut = LDSTCompUnit(16, 4, alu, mem)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_ldst_comp.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, scoreboard_sim(dut), vcd_name='test_ldst_comp.vcd')
-
-if __name__ == '__main__':
-    test_scoreboard()
diff --git a/src/experiment/cscore.py b/src/experiment/cscore.py
deleted file mode 100644
index 18b71c80..00000000
--- a/src/experiment/cscore.py
+++ /dev/null
@@ -1,435 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
-
-from regfile.regfile import RegFileArray, treereduce
-from scoreboard.fn_unit import IntFnUnit, FPFnUnit, LDFnUnit, STFnUnit
-from scoreboard.fu_fu_matrix import FUFUDepMatrix
-from scoreboard.fu_reg_matrix import FURegDepMatrix
-from scoreboard.global_pending import GlobalPending
-from scoreboard.group_picker import GroupPicker
-from scoreboard.issue_unit import IntFPIssueUnit, RegDecode
-
-from compalu import ComputationUnitNoDelay
-
-from alu_hier import ALU
-from nmutil.latch import SRLatch
-
-from random import randint
-
-
-class Scoreboard(Elaboratable):
-    def __init__(self, rwid, n_regs):
-        """ Inputs:
-
-            * :rwid:   bit width of register file(s) - both FP and INT
-            * :n_regs: depth of register file(s) - number of FP and INT regs
-        """
-        self.rwid = rwid
-        self.n_regs = n_regs
-
-        # Register Files
-        self.intregs = RegFileArray(rwid, n_regs)
-        self.fpregs = RegFileArray(rwid, n_regs)
-
-        # inputs
-        self.int_store_i = Signal(reset_less=True) # instruction is a store
-        self.int_dest_i = Signal(max=n_regs, reset_less=True) # Dest R# in
-        self.int_src1_i = Signal(max=n_regs, reset_less=True) # oper1 R# in
-        self.int_src2_i = Signal(max=n_regs, reset_less=True) # oper2 R# in
-
-        self.issue_o = Signal(reset_less=True) # instruction was accepted
-
-    def elaborate(self, platform):
-        m = Module()
-
-        m.submodules.intregs = self.intregs
-        m.submodules.fpregs = self.fpregs
-
-        # register ports
-        int_dest = self.intregs.write_port("dest")
-        int_src1 = self.intregs.read_port("src1")
-        int_src2 = self.intregs.read_port("src2")
-
-        fp_dest = self.fpregs.write_port("dest")
-        fp_src1 = self.fpregs.read_port("src1")
-        fp_src2 = self.fpregs.read_port("src2")
-
-        # Int ALUs
-        add = ALU(self.rwid)
-        sub = ALU(self.rwid)
-        m.submodules.comp1 = comp1 = ComputationUnitNoDelay(self.rwid, 1, add)
-        m.submodules.comp2 = comp2 = ComputationUnitNoDelay(self.rwid, 1, sub)
-        int_alus = [comp1, comp2]
-
-        m.d.comb += comp1.oper_i.eq(Const(0)) # temporary/experiment: op=add
-        m.d.comb += comp2.oper_i.eq(Const(1)) # temporary/experiment: op=sub
-
-        # Int FUs
-        if_l = []
-        int_src1_pend_v = []
-        int_src2_pend_v = []
-        int_rd_pend_v = []
-        int_wr_pend_v = []
-        for i, a in enumerate(int_alus):
-            # set up Integer Function Unit, add to module (and python list)
-            fu = IntFnUnit(self.n_regs, shadow_wid=0)
-            setattr(m.submodules, "intfu%d" % i, fu)
-            if_l.append(fu)
-            # collate the read/write pending vectors (to go into global pending)
-            int_src1_pend_v.append(fu.src1_pend_o)
-            int_src2_pend_v.append(fu.src2_pend_o)
-            int_rd_pend_v.append(fu.int_rd_pend_o)
-            int_wr_pend_v.append(fu.int_wr_pend_o)
-        int_fus = Array(if_l)
-
-        # Count of number of FUs
-        n_int_fus = len(if_l)
-        n_fp_fus = 0 # for now
-
-        n_fus = n_int_fus + n_fp_fus # plus FP FUs
-
-        # XXX replaced by array of FUs? *FnUnit
-        # # Integer FU-FU Dep Matrix
-        # m.submodules.intfudeps = FUFUDepMatrix(n_int_fus, n_int_fus)
-        # Integer FU-Reg Dep Matrix
-        # intregdeps = FURegDepMatrix(self.n_regs, n_int_fus)
-        # m.submodules.intregdeps = intregdeps
-
-        # Integer Priority Picker 1: Adder + Subtractor
-        intpick1 = GroupPicker(2) # picks between add and sub
-        m.submodules.intpick1 = intpick1
-
-        # Global Pending Vectors (INT and FP)
-        # NOTE: number of vectors is NOT same as number of FUs.
-        g_int_src1_pend_v = GlobalPending(self.n_regs, int_src1_pend_v)
-        g_int_src2_pend_v = GlobalPending(self.n_regs, int_src2_pend_v)
-        g_int_rd_pend_v = GlobalPending(self.n_regs, int_rd_pend_v, True)
-        g_int_wr_pend_v = GlobalPending(self.n_regs, int_wr_pend_v, True)
-        m.submodules.g_int_src1_pend_v = g_int_src1_pend_v
-        m.submodules.g_int_src2_pend_v = g_int_src2_pend_v
-        m.submodules.g_int_rd_pend_v = g_int_rd_pend_v
-        m.submodules.g_int_wr_pend_v = g_int_wr_pend_v
-
-        # INT/FP Issue Unit
-        regdecode = RegDecode(self.n_regs)
-        m.submodules.regdecode = regdecode
-        issueunit = IntFPIssueUnit(self.n_regs, n_int_fus, n_fp_fus)
-        m.submodules.issueunit = issueunit
-
-        # FU-FU Dependency Matrices
-        intfudeps = FUFUDepMatrix(n_int_fus, n_int_fus)
-        m.submodules.intfudeps = intfudeps
-
-        #---------
-        # ok start wiring things together...
-        # "now hear de word of de looord... dem bones dem bones dem dryy bones"
-        # https://www.youtube.com/watch?v=pYb8Wm6-QfA
-        #---------
-
-        #---------
-        # Issue Unit is where it starts.  set up some in/outs for this module
-        #---------
-        m.d.comb += [issueunit.i.store_i.eq(self.int_store_i),
-                     regdecode.dest_i.eq(self.int_dest_i),
-                     regdecode.src1_i.eq(self.int_src1_i),
-                     regdecode.src2_i.eq(self.int_src2_i),
-                     regdecode.enable_i.eq(1),
-                     self.issue_o.eq(issueunit.issue_o),
-                    issueunit.i.dest_i.eq(regdecode.dest_o),
-                    ]
-        self.int_insn_i = issueunit.i.insn_i # enabled by instruction decode
-
-        # connect global rd/wr pending vectors
-        m.d.comb += issueunit.i.g_wr_pend_i.eq(g_int_wr_pend_v.g_pend_o)
-        # TODO: issueunit.f (FP)
-
-        # and int function issue / busy arrays, and dest/src1/src2
-        fn_issue_l = []
-        fn_busy_l = []
-        for i, fu in enumerate(if_l):
-            fn_issue_l.append(fu.issue_i)
-            fn_busy_l.append(fu.busy_o)
-            m.d.sync += fu.issue_i.eq(issueunit.i.fn_issue_o[i])
-            m.d.sync += fu.dest_i.eq(self.int_dest_i)
-            m.d.sync += fu.src1_i.eq(self.int_src1_i)
-            m.d.sync += fu.src2_i.eq(self.int_src2_i)
-            # XXX sync, so as to stop a simulation infinite loop
-            m.d.comb += issueunit.i.busy_i[i].eq(fu.busy_o)
-
-        #---------
-        # connect Function Units
-        #---------
-
-        # Group Picker... done manually for now.  TODO: cat array of pick sigs
-        m.d.comb += if_l[0].go_rd_i.eq(intpick1.go_rd_o[0]) # add rd
-        m.d.comb += if_l[0].go_wr_i.eq(intpick1.go_wr_o[0]) # add wr
-
-        m.d.comb += if_l[1].go_rd_i.eq(intpick1.go_rd_o[1]) # subtract rd
-        m.d.comb += if_l[1].go_wr_i.eq(intpick1.go_wr_o[1]) # subtract wr
-
-        # create read-pending FU-FU vectors
-        intfu_rd_pend_v = Signal(n_int_fus, reset_less = True)
-        intfu_wr_pend_v = Signal(n_int_fus, reset_less = True)
-        for i in range(n_int_fus):
-            #m.d.comb += intfu_rd_pend_v[i].eq(if_l[i].int_rd_pend_o.bool())
-            #m.d.comb += intfu_wr_pend_v[i].eq(if_l[i].int_wr_pend_o.bool())
-            m.d.comb += intfu_rd_pend_v[i].eq(if_l[i].int_readable_o)
-            m.d.comb += intfu_wr_pend_v[i].eq(if_l[i].int_writable_o)
-
-        # Connect INT Fn Unit global wr/rd pending
-        for fu in if_l:
-            m.d.comb += fu.g_int_wr_pend_i.eq(g_int_wr_pend_v.g_pend_o)
-            m.d.comb += fu.g_int_rd_pend_i.eq(g_int_rd_pend_v.g_pend_o)
-
-        # Connect FU-FU Matrix, NOTE: FN Units readable/writable considered
-        # to be unit "read-pending / write-pending"
-        m.d.comb += intfudeps.rd_pend_i.eq(intfu_rd_pend_v)
-        m.d.comb += intfudeps.wr_pend_i.eq(intfu_wr_pend_v)
-        m.d.comb += intfudeps.issue_i.eq(issueunit.i.fn_issue_o)
-        for i in range(n_int_fus):
-            m.d.comb += intfudeps.go_rd_i[i].eq(intpick1.go_rd_o[i])
-            m.d.comb += intfudeps.go_wr_i[i].eq(intpick1.go_wr_o[i])
-
-        # Connect Picker (note connection to FU-FU)
-        #---------
-        readable_o = intfudeps.readable_o
-        writable_o = intfudeps.writable_o
-        m.d.comb += intpick1.rd_rel_i[0].eq(int_alus[0].rd_rel_o)
-        m.d.comb += intpick1.rd_rel_i[1].eq(int_alus[1].rd_rel_o)
-        m.d.comb += intpick1.req_rel_i[0].eq(int_alus[0].req_rel_o)
-        m.d.comb += intpick1.req_rel_i[1].eq(int_alus[1].req_rel_o)
-        m.d.comb += intpick1.readable_i[0].eq(readable_o[0]) # add rd
-        m.d.comb += intpick1.writable_i[0].eq(writable_o[0]) # add wr
-        m.d.comb += intpick1.readable_i[1].eq(readable_o[1]) # sub rd
-        m.d.comb += intpick1.writable_i[1].eq(writable_o[1]) # sub wr
-
-        #---------
-        # Connect Register File(s)
-        #---------
-        #with m.If(if_l[0].go_wr_i | if_l[1].go_wr_i):
-        m.d.sync += int_dest.wen.eq(g_int_wr_pend_v.g_pend_o)
-        #with m.If(intpick1.go_rd_o):
-        #with m.If(if_l[0].go_rd_i | if_l[1].go_rd_i):
-        m.d.sync += int_src1.ren.eq(g_int_src1_pend_v.g_pend_o)
-        m.d.sync += int_src2.ren.eq(g_int_src2_pend_v.g_pend_o)
-
-        # merge (OR) all integer FU / ALU outputs to a single value
-        # bit of a hack: treereduce needs a list with an item named "dest_o"
-        dest_o = treereduce(int_alus)
-        m.d.sync += int_dest.data_i.eq(dest_o)
-
-        # connect ALUs
-        for i, alu in enumerate(int_alus):
-            m.d.comb += alu.go_rd_i.eq(intpick1.go_rd_o[i])
-            m.d.comb += alu.go_wr_i.eq(intpick1.go_wr_o[i])
-            m.d.comb += alu.issue_i.eq(fn_issue_l[i])
-            #m.d.comb += fn_busy_l[i].eq(alu.busy_o)  # XXX ignore, use fnissue
-            m.d.comb += alu.src1_i.eq(int_src1.data_o)
-            m.d.comb += alu.src2_i.eq(int_src2.data_o)
-            m.d.comb += if_l[i].req_rel_i.eq(alu.req_rel_o) # pipe out ready
-
-        return m
-
-
-    def __iter__(self):
-        yield from self.intregs
-        yield from self.fpregs
-        yield self.int_store_i
-        yield self.int_dest_i
-        yield self.int_src1_i
-        yield self.int_src2_i
-        yield self.issue_o
-        #yield from self.int_src1
-        #yield from self.int_dest
-        #yield from self.int_src1
-        #yield from self.int_src2
-        #yield from self.fp_dest
-        #yield from self.fp_src1
-        #yield from self.fp_src2
-
-    def ports(self):
-        return list(self)
-
-IADD = 0
-ISUB = 1
-
-class RegSim:
-    def __init__(self, rwidth, nregs):
-        self.rwidth = rwidth
-        self.regs = [0] * nregs
-
-    def op(self, op, src1, src2, dest):
-        src1 = self.regs[src1]
-        src2 = self.regs[src2]
-        if op == IADD:
-            val = (src1 + src2) & ((1<<(self.rwidth))-1)
-        elif op == ISUB:
-            val = (src1 - src2) & ((1<<(self.rwidth))-1)
-        self.regs[dest] = val
-
-    def setval(self, dest, val):
-        self.regs[dest] = val
-
-    def dump(self, dut):
-        for i, val in enumerate(self.regs):
-            reg = yield dut.intregs.regs[i].reg
-            okstr = "OK" if reg == val else "!ok"
-            print("reg %d expected %x received %x %s" % (i, val, reg, okstr))
-
-    def check(self, dut):
-        for i, val in enumerate(self.regs):
-            reg = yield dut.intregs.regs[i].reg
-            if reg != val:
-                print("reg %d expected %x received %x\n" % (i, val, reg))
-                yield from self.dump(dut)
-                assert False
-
-def int_instr(dut, alusim, op, src1, src2, dest):
-    for i in range(len(dut.int_insn_i)):
-        yield dut.int_insn_i[i].eq(0)
-    yield dut.int_dest_i.eq(dest)
-    yield dut.int_src1_i.eq(src1)
-    yield dut.int_src2_i.eq(src2)
-    yield dut.int_insn_i[op].eq(1)
-    alusim.op(op, src1, src2, dest)
-
-
-def print_reg(dut, rnums):
-    rs = []
-    for rnum in rnums:
-        reg = yield dut.intregs.regs[rnum].reg
-        rs.append("%x" % reg)
-    rnums = map(str, rnums)
-    print ("reg %s: %s" % (','.join(rnums), ','.join(rs)))
-
-
-def scoreboard_sim(dut, alusim):
-    yield dut.int_store_i.eq(0)
-
-    for i in range(1, dut.n_regs):
-        yield dut.intregs.regs[i].reg.eq(i)
-        alusim.setval(i, i)
-
-    if False:
-        yield from int_instr(dut, alusim, IADD, 4, 3, 5)
-        yield from print_reg(dut, [3,4,5])
-        yield
-        yield from int_instr(dut, alusim, IADD, 5, 2, 5)
-        yield from print_reg(dut, [3,4,5])
-        yield
-        yield from int_instr(dut, alusim, ISUB, 5, 1, 3)
-        yield from print_reg(dut, [3,4,5])
-        yield
-        for i in range(len(dut.int_insn_i)):
-            yield dut.int_insn_i[i].eq(0)
-        yield from print_reg(dut, [3,4,5])
-        yield
-        yield from print_reg(dut, [3,4,5])
-        yield
-        yield from print_reg(dut, [3,4,5])
-        yield
-
-        yield from alusim.check(dut)
-
-    for i in range(2):
-        src1 = randint(1, dut.n_regs-1)
-        src2 = randint(1, dut.n_regs-1)
-        while True:
-            dest = randint(1, dut.n_regs-1)
-            break
-            if dest not in [src1, src2]:
-                break
-        op = randint(0, 1)
-        if False:
-            if i % 2 == 0:
-                src1 = 6
-                src2 = 6
-                dest = 1
-            else:
-                src1 = 1
-                src2 = 7
-                dest = 2
-            #src1 = 2
-            #src2 = 3
-            #dest = 2
-
-            op = i
-
-        if True:
-            if i == 0:
-                src1 = 2
-                src2 = 3
-                dest = 3
-            else:
-                src1 = 5
-                src2 = 3
-                dest = 4
-
-            #op = (i+1) % 2
-            op = i
-
-        print ("random %d: %d %d %d %d\n" % (i, op, src1, src2, dest))
-        yield from int_instr(dut, alusim, op, src1, src2, dest)
-        yield from print_reg(dut, [3,4,5])
-        while True:
-            yield
-            issue_o = yield dut.issue_o
-            if issue_o:
-                yield from print_reg(dut, [3,4,5])
-                for i in range(len(dut.int_insn_i)):
-                    yield dut.int_insn_i[i].eq(0)
-                break
-            print ("busy",)
-            yield from print_reg(dut, [3,4,5])
-        yield
-        yield
-        yield
-
-
-    yield
-    yield from print_reg(dut, [3,4,5])
-    yield
-    yield from print_reg(dut, [3,4,5])
-    yield
-    yield from print_reg(dut, [3,4,5])
-    yield
-    yield from print_reg(dut, [3,4,5])
-    yield
-    yield
-    yield
-    yield
-    yield
-    yield
-    yield
-    yield
-    yield
-    yield from alusim.check(dut)
-    yield from alusim.dump(dut)
-
-
-def explore_groups(dut):
-    from nmigen.hdl.ir import Fragment
-    from nmigen.hdl.xfrm import LHSGroupAnalyzer
-
-    fragment = dut.elaborate(platform=None)
-    fr = Fragment.get(fragment, platform=None)
-
-    groups = LHSGroupAnalyzer()(fragment._statements)
-
-    print (groups)
-
-
-def test_scoreboard():
-    dut = Scoreboard(16, 8)
-    alusim = RegSim(16, 8)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_scoreboard.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, scoreboard_sim(dut, alusim),
-                        vcd_name='test_scoreboard.vcd')
-
-
-if __name__ == '__main__':
-    test_scoreboard()
diff --git a/src/experiment/score6600.py b/src/experiment/score6600.py
deleted file mode 100644
index 209bc99c..00000000
--- a/src/experiment/score6600.py
+++ /dev/null
@@ -1,1296 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen.hdl.ast import unsigned
-from nmigen import Module, Const, Signal, Array, Cat, Elaboratable, Memory
-
-from regfile.regfile import RegFileArray, treereduce
-from scoreboard.fu_fu_matrix import FUFUDepMatrix
-from scoreboard.fu_reg_matrix import FURegDepMatrix
-from scoreboard.global_pending import GlobalPending
-from scoreboard.group_picker import GroupPicker
-from scoreboard.issue_unit import IssueUnitGroup, IssueUnitArray, RegDecode
-from scoreboard.shadow import ShadowMatrix, BranchSpeculationRecord
-from scoreboard.instruction_q import Instruction, InstructionQ
-from scoreboard.memfu import MemFunctionUnits
-
-from compalu import ComputationUnitNoDelay
-from compldst import LDSTCompUnit
-
-from alu_hier import ALU, BranchALU
-from nmutil.latch import SRLatch
-from nmutil.nmoperator import eq
-
-from random import randint, seed
-from copy import deepcopy
-from math import log
-
-
-class TestMemory(Elaboratable):
-    def __init__(self, regwid, addrw):
-        self.ddepth = 1 # regwid //8
-        depth = (1<<addrw) // self.ddepth
-        self.mem   = Memory(width=regwid, depth=depth, init=range(0, depth))
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.rdport = self.rdport = self.mem.read_port()
-        m.submodules.wrport = self.wrport = self.mem.write_port()
-        return m
-
-
-class MemSim:
-    def __init__(self, regwid, addrw):
-        self.regwid = regwid
-        self.ddepth = 1 # regwid//8
-        depth = (1<<addrw) // self.ddepth
-        self.mem = list(range(0, depth))
-
-    def ld(self, addr):
-        return self.mem[addr>>self.ddepth]
-
-    def st(self, addr, data):
-        self.mem[addr>>self.ddepth] = data & ((1<<self.regwid)-1)
-
-
-class CompUnitsBase(Elaboratable):
-    """ Computation Unit Base class.
-
-        Amazingly, this class works recursively.  It's supposed to just
-        look after some ALUs (that can handle the same operations),
-        grouping them together, however it turns out that the same code
-        can also group *groups* of Computation Units together as well.
-
-        Basically it was intended just to concatenate the ALU's issue,
-        go_rd etc. signals together, which start out as bits and become
-        sequences.  Turns out that the same trick works just as well
-        on Computation Units!
-
-        So this class may be used recursively to present a top-level
-        sequential concatenation of all the signals in and out of
-        ALUs, whilst at the same time making it convenient to group
-        ALUs together.
-
-        At the lower level, the intent is that groups of (identical)
-        ALUs may be passed the same operation.  Even beyond that,
-        the intent is that that group of (identical) ALUs actually
-        share the *same pipeline* and as such become a "Concurrent
-        Computation Unit" as defined by Mitch Alsup (see section
-        11.4.9.3)
-    """
-    def __init__(self, rwid, units, ldstmode=False):
-        """ Inputs:
-
-            * :rwid:   bit width of register file(s) - both FP and INT
-            * :units: sequence of ALUs (or CompUnitsBase derivatives)
-        """
-        self.units = units
-        self.ldstmode = ldstmode
-        self.rwid = rwid
-        self.rwid = rwid
-        if units and isinstance(units[0], CompUnitsBase):
-            self.n_units = 0
-            for u in self.units:
-                self.n_units += u.n_units
-        else:
-            self.n_units = len(units)
-
-        n_units = self.n_units
-
-        # inputs
-        self.issue_i = Signal(n_units, reset_less=True)
-        self.go_rd_i = Signal(n_units, reset_less=True)
-        self.go_wr_i = Signal(n_units, reset_less=True)
-        self.shadown_i = Signal(n_units, reset_less=True)
-        self.go_die_i = Signal(n_units, reset_less=True)
-        if ldstmode:
-            self.go_ad_i = Signal(n_units, reset_less=True)
-            self.go_st_i = Signal(n_units, reset_less=True)
-
-        # outputs
-        self.busy_o = Signal(n_units, reset_less=True)
-        self.rd_rel_o = Signal(n_units, reset_less=True)
-        self.req_rel_o = Signal(n_units, reset_less=True)
-        if ldstmode:
-            self.ld_o = Signal(n_units, reset_less=True) # op is LD
-            self.st_o = Signal(n_units, reset_less=True) # op is ST
-            self.adr_rel_o = Signal(n_units, reset_less=True)
-            self.sto_rel_o = Signal(n_units, reset_less=True)
-            self.req_rel_o = Signal(n_units, reset_less=True)
-            self.load_mem_o = Signal(n_units, reset_less=True)
-            self.stwd_mem_o = Signal(n_units, reset_less=True)
-            self.addr_o = Signal(rwid, reset_less=True)
-
-        # in/out register data (note: not register#, actual data)
-        self.data_o = Signal(rwid, reset_less=True)
-        self.src1_i = Signal(rwid, reset_less=True)
-        self.src2_i = Signal(rwid, reset_less=True)
-        # input operand
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-
-        for i, alu in enumerate(self.units):
-            setattr(m.submodules, "comp%d" % i, alu)
-
-        go_rd_l = []
-        go_wr_l = []
-        issue_l = []
-        busy_l = []
-        req_rel_l = []
-        rd_rel_l = []
-        shadow_l = []
-        godie_l = []
-        for alu in self.units:
-            req_rel_l.append(alu.req_rel_o)
-            rd_rel_l.append(alu.rd_rel_o)
-            shadow_l.append(alu.shadown_i)
-            godie_l.append(alu.go_die_i)
-            go_wr_l.append(alu.go_wr_i)
-            go_rd_l.append(alu.go_rd_i)
-            issue_l.append(alu.issue_i)
-            busy_l.append(alu.busy_o)
-        comb += self.rd_rel_o.eq(Cat(*rd_rel_l))
-        comb += self.req_rel_o.eq(Cat(*req_rel_l))
-        comb += self.busy_o.eq(Cat(*busy_l))
-        comb += Cat(*godie_l).eq(self.go_die_i)
-        comb += Cat(*shadow_l).eq(self.shadown_i)
-        comb += Cat(*go_wr_l).eq(self.go_wr_i)
-        comb += Cat(*go_rd_l).eq(self.go_rd_i)
-        comb += Cat(*issue_l).eq(self.issue_i)
-
-        # connect data register input/output
-
-        # merge (OR) all integer FU / ALU outputs to a single value
-        if self.units:
-            data_o = treereduce(self.units, "data_o")
-            comb += self.data_o.eq(data_o)
-            if self.ldstmode:
-                addr_o = treereduce(self.units, "addr_o")
-                comb += self.addr_o.eq(addr_o)
-
-        for i, alu in enumerate(self.units):
-            comb += alu.src1_i.eq(self.src1_i)
-            comb += alu.src2_i.eq(self.src2_i)
-
-        if not self.ldstmode:
-            return m
-
-        ldmem_l = []
-        stmem_l = []
-        go_ad_l = []
-        go_st_l = []
-        ld_l = []
-        st_l = []
-        adr_rel_l = []
-        sto_rel_l = []
-        for alu in self.units:
-            ld_l.append(alu.ld_o)
-            st_l.append(alu.st_o)
-            adr_rel_l.append(alu.adr_rel_o)
-            sto_rel_l.append(alu.sto_rel_o)
-            ldmem_l.append(alu.load_mem_o)
-            stmem_l.append(alu.stwd_mem_o)
-            go_ad_l.append(alu.go_ad_i)
-            go_st_l.append(alu.go_st_i)
-        comb += self.ld_o.eq(Cat(*ld_l))
-        comb += self.st_o.eq(Cat(*st_l))
-        comb += self.adr_rel_o.eq(Cat(*adr_rel_l))
-        comb += self.sto_rel_o.eq(Cat(*sto_rel_l))
-        comb += self.load_mem_o.eq(Cat(*ldmem_l))
-        comb += self.stwd_mem_o.eq(Cat(*stmem_l))
-        comb += Cat(*go_ad_l).eq(self.go_ad_i)
-        comb += Cat(*go_st_l).eq(self.go_st_i)
-
-        return m
-
-
-class CompUnitLDSTs(CompUnitsBase):
-
-    def __init__(self, rwid, opwid, n_ldsts, mem):
-        """ Inputs:
-
-            * :rwid:   bit width of register file(s) - both FP and INT
-            * :opwid:  operand bit width
-        """
-        self.opwid = opwid
-
-        # inputs
-        self.oper_i = Signal(opwid, reset_less=True)
-        self.imm_i = Signal(rwid, reset_less=True)
-
-        # Int ALUs
-        self.alus = []
-        for i in range(n_ldsts):
-            self.alus.append(ALU(rwid))
-
-        units = []
-        for alu in self.alus:
-            aluopwid = 4 # see compldst.py for "internal" opcode
-            units.append(LDSTCompUnit(rwid, aluopwid, alu, mem))
-
-        CompUnitsBase.__init__(self, rwid, units, ldstmode=True)
-
-    def elaborate(self, platform):
-        m = CompUnitsBase.elaborate(self, platform)
-        comb = m.d.comb
-
-        # hand the same operation to all units, 4 lower bits though
-        for alu in self.units:
-            comb += alu.oper_i[0:4].eq(self.oper_i)
-            comb += alu.imm_i.eq(self.imm_i)
-            comb += alu.isalu_i.eq(0)
-
-        return m
-
-
-class CompUnitALUs(CompUnitsBase):
-
-    def __init__(self, rwid, opwid, n_alus):
-        """ Inputs:
-
-            * :rwid:   bit width of register file(s) - both FP and INT
-            * :opwid:  operand bit width
-        """
-        self.opwid = opwid
-
-        # inputs
-        self.oper_i = Signal(opwid, reset_less=True)
-        self.imm_i = Signal(rwid, reset_less=True)
-
-        # Int ALUs
-        alus = []
-        for i in range(n_alus):
-            alus.append(ALU(rwid))
-
-        units = []
-        for alu in alus:
-            aluopwid = 3 # extra bit for immediate mode
-            units.append(ComputationUnitNoDelay(rwid, aluopwid, alu))
-
-        CompUnitsBase.__init__(self, rwid, units)
-
-    def elaborate(self, platform):
-        m = CompUnitsBase.elaborate(self, platform)
-        comb = m.d.comb
-
-        # hand the same operation to all units, only lower 3 bits though
-        for alu in self.units:
-            comb += alu.oper_i[0:3].eq(self.oper_i)
-            comb += alu.imm_i.eq(self.imm_i)
-
-        return m
-
-
-class CompUnitBR(CompUnitsBase):
-
-    def __init__(self, rwid, opwid):
-        """ Inputs:
-
-            * :rwid:   bit width of register file(s) - both FP and INT
-            * :opwid:  operand bit width
-
-            Note: bgt unit is returned so that a shadow unit can be created
-            for it
-        """
-        self.opwid = opwid
-
-        # inputs
-        self.oper_i = Signal(opwid, reset_less=True)
-        self.imm_i = Signal(rwid, reset_less=True)
-
-        # Branch ALU and CU
-        self.bgt = BranchALU(rwid)
-        aluopwid = 3 # extra bit for immediate mode
-        self.br1 = ComputationUnitNoDelay(rwid, aluopwid, self.bgt)
-        CompUnitsBase.__init__(self, rwid, [self.br1])
-
-    def elaborate(self, platform):
-        m = CompUnitsBase.elaborate(self, platform)
-        comb = m.d.comb
-
-        # hand the same operation to all units
-        for alu in self.units:
-            comb += alu.oper_i.eq(self.oper_i)
-            comb += alu.imm_i.eq(self.imm_i)
-
-        return m
-
-
-class FunctionUnits(Elaboratable):
-
-    def __init__(self, n_regs, n_int_alus):
-        self.n_regs = n_regs
-        self.n_int_alus = n_int_alus
-
-        self.dest_i = Signal(n_regs, reset_less=True) # Dest R# in
-        self.src1_i = Signal(n_regs, reset_less=True) # oper1 R# in
-        self.src2_i = Signal(n_regs, reset_less=True) # oper2 R# in
-
-        self.g_int_rd_pend_o = Signal(n_regs, reset_less=True)
-        self.g_int_wr_pend_o = Signal(n_regs, reset_less=True)
-
-        self.dest_rsel_o = Signal(n_regs, reset_less=True) # dest reg (bot)
-        self.src1_rsel_o = Signal(n_regs, reset_less=True) # src1 reg (bot)
-        self.src2_rsel_o = Signal(n_regs, reset_less=True) # src2 reg (bot)
-
-        self.readable_o = Signal(n_int_alus, reset_less=True)
-        self.writable_o = Signal(n_int_alus, reset_less=True)
-
-        self.go_rd_i = Signal(n_int_alus, reset_less=True)
-        self.go_wr_i = Signal(n_int_alus, reset_less=True)
-        self.go_die_i = Signal(n_int_alus, reset_less=True)
-        self.fn_issue_i = Signal(n_int_alus, reset_less=True)
-
-        # Note: FURegs wr_pend_o is also outputted from here, for use in WaWGrid
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        sync = m.d.sync
-
-        n_intfus = self.n_int_alus
-
-        # Integer FU-FU Dep Matrix
-        intfudeps = FUFUDepMatrix(n_intfus, n_intfus)
-        m.submodules.intfudeps = intfudeps
-        # Integer FU-Reg Dep Matrix
-        intregdeps = FURegDepMatrix(n_intfus, self.n_regs, 2)
-        m.submodules.intregdeps = intregdeps
-
-        comb += self.g_int_rd_pend_o.eq(intregdeps.v_rd_rsel_o)
-        comb += self.g_int_wr_pend_o.eq(intregdeps.v_wr_rsel_o)
-
-        comb += intregdeps.rd_pend_i.eq(intregdeps.v_rd_rsel_o)
-        comb += intregdeps.wr_pend_i.eq(intregdeps.v_wr_rsel_o)
-
-        comb += intfudeps.rd_pend_i.eq(intregdeps.rd_pend_o)
-        comb += intfudeps.wr_pend_i.eq(intregdeps.wr_pend_o)
-        self.wr_pend_o = intregdeps.wr_pend_o # also output for use in WaWGrid
-
-        comb += intfudeps.issue_i.eq(self.fn_issue_i)
-        comb += intfudeps.go_rd_i.eq(self.go_rd_i)
-        comb += intfudeps.go_wr_i.eq(self.go_wr_i)
-        comb += intfudeps.go_die_i.eq(self.go_die_i)
-        comb += self.readable_o.eq(intfudeps.readable_o)
-        comb += self.writable_o.eq(intfudeps.writable_o)
-
-        # Connect function issue / arrays, and dest/src1/src2
-        comb += intregdeps.dest_i.eq(self.dest_i)
-        comb += intregdeps.src_i[0].eq(self.src1_i)
-        comb += intregdeps.src_i[1].eq(self.src2_i)
-
-        comb += intregdeps.go_rd_i.eq(self.go_rd_i)
-        comb += intregdeps.go_wr_i.eq(self.go_wr_i)
-        comb += intregdeps.go_die_i.eq(self.go_die_i)
-        comb += intregdeps.issue_i.eq(self.fn_issue_i)
-
-        comb += self.dest_rsel_o.eq(intregdeps.dest_rsel_o)
-        comb += self.src1_rsel_o.eq(intregdeps.src_rsel_o[0])
-        comb += self.src2_rsel_o.eq(intregdeps.src_rsel_o[1])
-
-        return m
-
-
-class Scoreboard(Elaboratable):
-    def __init__(self, rwid, n_regs):
-        """ Inputs:
-
-            * :rwid:   bit width of register file(s) - both FP and INT
-            * :n_regs: depth of register file(s) - number of FP and INT regs
-        """
-        self.rwid = rwid
-        self.n_regs = n_regs
-
-        # Register Files
-        self.intregs = RegFileArray(rwid, n_regs)
-        self.fpregs = RegFileArray(rwid, n_regs)
-
-        # Memory (test for now)
-        self.mem = TestMemory(self.rwid, 8) # not too big, takes too long
-
-        # issue q needs to get at these
-        self.aluissue = IssueUnitGroup(2)
-        self.lsissue = IssueUnitGroup(2)
-        self.brissue = IssueUnitGroup(1)
-        # and these
-        self.alu_oper_i = Signal(4, reset_less=True)
-        self.alu_imm_i = Signal(rwid, reset_less=True)
-        self.br_oper_i = Signal(4, reset_less=True)
-        self.br_imm_i = Signal(rwid, reset_less=True)
-        self.ls_oper_i = Signal(4, reset_less=True)
-        self.ls_imm_i = Signal(rwid, reset_less=True)
-
-        # inputs
-        self.int_dest_i = Signal(range(n_regs), reset_less=True) # Dest R# in
-        self.int_src1_i = Signal(range(n_regs), reset_less=True) # oper1 R# in
-        self.int_src2_i = Signal(range(n_regs), reset_less=True) # oper2 R# in
-        self.reg_enable_i = Signal(reset_less=True) # enable reg decode
-
-        # outputs
-        self.issue_o = Signal(reset_less=True) # instruction was accepted
-        self.busy_o = Signal(reset_less=True) # at least one CU is busy
-
-        # for branch speculation experiment.  branch_direction = 0 if
-        # the branch hasn't been met yet.  1 indicates "success", 2 is "fail"
-        # branch_succ and branch_fail are requests to have the current
-        # instruction be dependent on the branch unit "shadow" capability.
-        self.branch_succ_i = Signal(reset_less=True)
-        self.branch_fail_i = Signal(reset_less=True)
-        self.branch_direction_o = Signal(2, reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        sync = m.d.sync
-
-        m.submodules.intregs = self.intregs
-        m.submodules.fpregs = self.fpregs
-        m.submodules.mem = mem = self.mem
-
-        # register ports
-        int_dest = self.intregs.write_port("dest")
-        int_src1 = self.intregs.read_port("src1")
-        int_src2 = self.intregs.read_port("src2")
-
-        fp_dest = self.fpregs.write_port("dest")
-        fp_src1 = self.fpregs.read_port("src1")
-        fp_src2 = self.fpregs.read_port("src2")
-
-        # Int ALUs and BR ALUs
-        n_int_alus = 5
-        cua = CompUnitALUs(self.rwid, 3, n_alus=self.aluissue.n_insns)
-        cub = CompUnitBR(self.rwid, 3) # 1 BR ALUs
-
-        # LDST Comp Units
-        n_ldsts = 2
-        cul = CompUnitLDSTs(self.rwid, 4, self.lsissue.n_insns, None)
-
-        # Comp Units
-        m.submodules.cu = cu = CompUnitsBase(self.rwid, [cua, cul, cub])
-        bgt = cub.bgt # get at the branch computation unit
-        br1 = cub.br1
-
-        # Int FUs
-        m.submodules.intfus = intfus = FunctionUnits(self.n_regs, n_int_alus)
-
-        # Memory FUs
-        m.submodules.memfus = memfus = MemFunctionUnits(n_ldsts, 5)
-
-        # Memory Priority Picker 1: one gateway per memory port
-        mempick1 = GroupPicker(n_ldsts) # picks 1 reader and 1 writer to intreg
-        m.submodules.mempick1 = mempick1
-
-        # Count of number of FUs
-        n_intfus = n_int_alus
-        n_fp_fus = 0 # for now
-
-        # Integer Priority Picker 1: Adder + Subtractor (and LD/ST)
-        intpick1 = GroupPicker(n_intfus) # picks 1 reader and 1 writer to intreg
-        m.submodules.intpick1 = intpick1
-
-        # INT/FP Issue Unit
-        regdecode = RegDecode(self.n_regs)
-        m.submodules.regdecode = regdecode
-        issueunit = IssueUnitArray([self.aluissue, self.lsissue, self.brissue])
-        m.submodules.issueunit = issueunit
-
-        # Shadow Matrix.  currently n_intfus shadows, to be used for
-        # write-after-write hazards.  NOTE: there is one extra for branches,
-        # so the shadow width is increased by 1
-        m.submodules.shadows = shadows = ShadowMatrix(n_intfus, n_intfus, True)
-        m.submodules.bshadow = bshadow = ShadowMatrix(n_intfus, 1, False)
-
-        # record previous instruction to cast shadow on current instruction
-        prev_shadow = Signal(n_intfus)
-
-        # Branch Speculation recorder.  tracks the success/fail state as
-        # each instruction is issued, so that when the branch occurs the
-        # allow/cancel can be issued as appropriate.
-        m.submodules.specrec = bspec = BranchSpeculationRecord(n_intfus)
-
-        #---------
-        # ok start wiring things together...
-        # "now hear de word of de looord... dem bones dem bones dem dryy bones"
-        # https://www.youtube.com/watch?v=pYb8Wm6-QfA
-        #---------
-
-        #---------
-        # Issue Unit is where it starts.  set up some in/outs for this module
-        #---------
-        comb += [    regdecode.dest_i.eq(self.int_dest_i),
-                     regdecode.src1_i.eq(self.int_src1_i),
-                     regdecode.src2_i.eq(self.int_src2_i),
-                     regdecode.enable_i.eq(self.reg_enable_i),
-                     self.issue_o.eq(issueunit.issue_o)
-                    ]
-
-        # take these to outside (issue needs them)
-        comb += cua.oper_i.eq(self.alu_oper_i)
-        comb += cua.imm_i.eq(self.alu_imm_i)
-        comb += cub.oper_i.eq(self.br_oper_i)
-        comb += cub.imm_i.eq(self.br_imm_i)
-        comb += cul.oper_i.eq(self.ls_oper_i)
-        comb += cul.imm_i.eq(self.ls_imm_i)
-
-        # TODO: issueunit.f (FP)
-
-        # and int function issue / busy arrays, and dest/src1/src2
-        comb += intfus.dest_i.eq(regdecode.dest_o)
-        comb += intfus.src1_i.eq(regdecode.src1_o)
-        comb += intfus.src2_i.eq(regdecode.src2_o)
-
-        fn_issue_o = issueunit.fn_issue_o
-
-        comb += intfus.fn_issue_i.eq(fn_issue_o)
-        comb += issueunit.busy_i.eq(cu.busy_o)
-        comb += self.busy_o.eq(cu.busy_o.bool())
-
-        #---------
-        # Memory Function Unit
-        #---------
-        reset_b = Signal(cul.n_units, reset_less=True)
-        sync += reset_b.eq(cul.go_st_i | cul.go_wr_i | cul.go_die_i)
-
-        comb += memfus.fn_issue_i.eq(cul.issue_i) # Comp Unit Issue -> Mem FUs
-        comb += memfus.addr_en_i.eq(cul.adr_rel_o) # Match enable on adr rel
-        comb += memfus.addr_rs_i.eq(reset_b) # reset same as LDSTCompUnit
-
-        # LD/STs have to accumulate prior LD/STs (TODO: multi-issue as well,
-        # in a transitive fashion).  This cycle activates based on LDSTCompUnit
-        # issue_i.  multi-issue gets a bit more complex but not a lot.
-        prior_ldsts = Signal(cul.n_units, reset_less=True)
-        sync += prior_ldsts.eq(memfus.g_int_ld_pend_o | memfus.g_int_st_pend_o)
-        with m.If(self.ls_oper_i[2]): # LD bit of operand
-            comb += memfus.ld_i.eq(cul.issue_i | prior_ldsts)
-        with m.If(self.ls_oper_i[3]): # ST bit of operand
-            comb += memfus.st_i.eq(cul.issue_i | prior_ldsts)
-
-        # TODO: adr_rel_o needs to go into L1 Cache.  for now,
-        # just immediately activate go_adr
-        comb += cul.go_ad_i.eq(cul.adr_rel_o)
-
-        # connect up address data
-        comb += memfus.addrs_i[0].eq(cul.units[0].addr_o)
-        comb += memfus.addrs_i[1].eq(cul.units[1].addr_o)
-
-        # connect loadable / storable to go_ld/go_st.
-        # XXX should only be done when the memory ld/st has actually happened!
-        go_st_i = Signal(cul.n_units, reset_less=True)
-        go_ld_i = Signal(cul.n_units, reset_less=True)
-        comb += go_ld_i.eq(memfus.loadable_o & memfus.addr_nomatch_o &\
-                                  cul.req_rel_o & cul.ld_o)
-        comb += go_st_i.eq(memfus.storable_o & memfus.addr_nomatch_o &\
-                                  cul.sto_rel_o & cul.st_o)
-        comb += memfus.go_ld_i.eq(go_ld_i)
-        comb += memfus.go_st_i.eq(go_st_i)
-        #comb += cul.go_wr_i.eq(go_ld_i)
-        comb += cul.go_st_i.eq(go_st_i)
-
-        #comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
-        #comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus])
-        #comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
-
-        #---------
-        # merge shadow matrices outputs
-        #---------
-
-        # these are explained in ShadowMatrix docstring, and are to be
-        # connected to the FUReg and FUFU Matrices, to get them to reset
-        anydie = Signal(n_intfus, reset_less=True)
-        allshadown = Signal(n_intfus, reset_less=True)
-        shreset = Signal(n_intfus, reset_less=True)
-        comb += allshadown.eq(shadows.shadown_o & bshadow.shadown_o)
-        comb += anydie.eq(shadows.go_die_o | bshadow.go_die_o)
-        comb += shreset.eq(bspec.match_g_o | bspec.match_f_o)
-
-        #---------
-        # connect fu-fu matrix
-        #---------
-
-        # Group Picker... done manually for now.
-        go_rd_o = intpick1.go_rd_o
-        go_wr_o = intpick1.go_wr_o
-        go_rd_i = intfus.go_rd_i
-        go_wr_i = intfus.go_wr_i
-        go_die_i = intfus.go_die_i
-        # NOTE: connect to the shadowed versions so that they can "die" (reset)
-        comb += go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus]) # rd
-        comb += go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus]) # wr
-        comb += go_die_i[0:n_intfus].eq(anydie[0:n_intfus]) # die
-
-        # Connect Picker
-        #---------
-        comb += intpick1.rd_rel_i[0:n_intfus].eq(cu.rd_rel_o[0:n_intfus])
-        comb += intpick1.req_rel_i[0:n_intfus].eq(cu.req_rel_o[0:n_intfus])
-        int_rd_o = intfus.readable_o
-        int_wr_o = intfus.writable_o
-        comb += intpick1.readable_i[0:n_intfus].eq(int_rd_o[0:n_intfus])
-        comb += intpick1.writable_i[0:n_intfus].eq(int_wr_o[0:n_intfus])
-
-        #---------
-        # Shadow Matrix
-        #---------
-
-        comb += shadows.issue_i.eq(fn_issue_o)
-        #comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
-        comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
-        #---------
-        # NOTE; this setup is for the instruction order preservation...
-
-        # connect shadows / go_dies to Computation Units
-        comb += cu.shadown_i[0:n_intfus].eq(allshadown)
-        comb += cu.go_die_i[0:n_intfus].eq(anydie)
-
-        # ok connect first n_int_fu shadows to busy lines, to create an
-        # instruction-order linked-list-like arrangement, using a bit-matrix
-        # (instead of e.g. a ring buffer).
-
-        # when written, the shadow can be cancelled (and was good)
-        for i in range(n_intfus):
-            comb += shadows.s_good_i[i][0:n_intfus].eq(go_wr_o[0:n_intfus])
-
-        # *previous* instruction shadows *current* instruction, and, obviously,
-        # if the previous is completed (!busy) don't cast the shadow!
-        comb += prev_shadow.eq(~fn_issue_o & cu.busy_o)
-        for i in range(n_intfus):
-            comb += shadows.shadow_i[i][0:n_intfus].eq(prev_shadow)
-
-        #---------
-        # ... and this is for branch speculation.  it uses the extra bit
-        # tacked onto the ShadowMatrix (hence shadow_wid=n_intfus+1)
-        # only needs to set shadow_i, s_fail_i and s_good_i
-
-        # issue captures shadow_i (if enabled)
-        comb += bshadow.reset_i[0:n_intfus].eq(shreset[0:n_intfus])
-
-        bactive = Signal(reset_less=True)
-        comb += bactive.eq((bspec.active_i | br1.issue_i) & ~br1.go_wr_i)
-
-        # instruction being issued (fn_issue_o) has a shadow cast by the branch
-        with m.If(bactive & (self.branch_succ_i | self.branch_fail_i)):
-            comb += bshadow.issue_i.eq(fn_issue_o)
-            for i in range(n_intfus):
-                with m.If(fn_issue_o & (Const(1<<i))):
-                    comb += bshadow.shadow_i[i][0].eq(1)
-
-        # finally, we need an indicator to the test infrastructure as to
-        # whether the branch succeeded or failed, plus, link up to the
-        # "recorder" of whether the instruction was under shadow or not
-
-        with m.If(br1.issue_i):
-            sync += bspec.active_i.eq(1)
-        with m.If(self.branch_succ_i):
-            comb += bspec.good_i.eq(fn_issue_o & 0x1f) # XXX MAGIC CONSTANT
-        with m.If(self.branch_fail_i):
-            comb += bspec.fail_i.eq(fn_issue_o & 0x1f) # XXX MAGIC CONSTANT
-
-        # branch is active (TODO: a better signal: this is over-using the
-        # go_write signal - actually the branch should not be "writing")
-        with m.If(br1.go_wr_i):
-            sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
-            sync += bspec.active_i.eq(0)
-            comb += bspec.br_i.eq(1)
-            # branch occurs if data == 1, failed if data == 0
-            comb += bspec.br_ok_i.eq(br1.data_o == 1)
-            for i in range(n_intfus):
-                # *expected* direction of the branch matched against *actual*
-                comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
-                # ... or it didn't
-                comb += bshadow.s_fail_i[i][0].eq(bspec.match_f_o[i])
-
-        #---------
-        # Connect Register File(s)
-        #---------
-        comb += int_dest.wen.eq(intfus.dest_rsel_o)
-        comb += int_src1.ren.eq(intfus.src1_rsel_o)
-        comb += int_src2.ren.eq(intfus.src2_rsel_o)
-
-        # connect ALUs to regfule
-        comb += int_dest.data_i.eq(cu.data_o)
-        comb += cu.src1_i.eq(int_src1.data_o)
-        comb += cu.src2_i.eq(int_src2.data_o)
-
-        # connect ALU Computation Units
-        comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
-        comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus])
-        comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
-
-        return m
-
-    def __iter__(self):
-        yield from self.intregs
-        yield from self.fpregs
-        yield self.int_dest_i
-        yield self.int_src1_i
-        yield self.int_src2_i
-        yield self.issue_o
-        yield self.branch_succ_i
-        yield self.branch_fail_i
-        yield self.branch_direction_o
-
-    def ports(self):
-        return list(self)
-
-
-class IssueToScoreboard(Elaboratable):
-
-    def __init__(self, qlen, n_in, n_out, rwid, opwid, n_regs):
-        self.qlen = qlen
-        self.n_in = n_in
-        self.n_out = n_out
-        self.rwid = rwid
-        self.opw = opwid
-        self.n_regs = n_regs
-
-        mqbits = unsigned(int(log(qlen) / log(2))+2)
-        self.p_add_i = Signal(mqbits) # instructions to add (from data_i)
-        self.p_ready_o = Signal() # instructions were added
-        self.data_i = Instruction.nq(n_in, "data_i", rwid, opwid)
-
-        self.busy_o = Signal(reset_less=True) # at least one CU is busy
-        self.qlen_o = Signal(mqbits, reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        sync = m.d.sync
-
-        iq = InstructionQ(self.rwid, self.opw, self.qlen, self.n_in, self.n_out)
-        sc = Scoreboard(self.rwid, self.n_regs)
-        m.submodules.iq = iq
-        m.submodules.sc = sc
-
-        # get at the regfile for testing
-        self.intregs = sc.intregs
-
-        # and the "busy" signal and instruction queue length
-        comb += self.busy_o.eq(sc.busy_o)
-        comb += self.qlen_o.eq(iq.qlen_o)
-
-        # link up instruction queue
-        comb += iq.p_add_i.eq(self.p_add_i)
-        comb += self.p_ready_o.eq(iq.p_ready_o)
-        for i in range(self.n_in):
-            comb += eq(iq.data_i[i], self.data_i[i])
-
-        # take instruction and process it.  note that it's possible to
-        # "inspect" the queue contents *without* actually removing the
-        # items.  items are only removed when the
-
-        # in "waiting" state
-        wait_issue_br = Signal()
-        wait_issue_alu = Signal()
-        wait_issue_ls = Signal()
-
-        with m.If(wait_issue_br | wait_issue_alu | wait_issue_ls):
-            # set instruction pop length to 1 if the unit accepted
-            with m.If(wait_issue_ls & (sc.lsissue.fn_issue_o != 0)):
-                with m.If(iq.qlen_o != 0):
-                    comb += iq.n_sub_i.eq(1)
-            with m.If(wait_issue_br & (sc.brissue.fn_issue_o != 0)):
-                with m.If(iq.qlen_o != 0):
-                    comb += iq.n_sub_i.eq(1)
-            with m.If(wait_issue_alu & (sc.aluissue.fn_issue_o != 0)):
-                with m.If(iq.qlen_o != 0):
-                    comb += iq.n_sub_i.eq(1)
-
-        # see if some instruction(s) are here.  note that this is
-        # "inspecting" the in-place queue.  note also that on the
-        # cycle following "waiting" for fn_issue_o to be set, the
-        # "resetting" done above (insn_i=0) could be re-ASSERTed.
-        with m.If(iq.qlen_o != 0):
-            # get the operands and operation
-            imm = iq.data_o[0].imm_i
-            dest = iq.data_o[0].dest_i
-            src1 = iq.data_o[0].src1_i
-            src2 = iq.data_o[0].src2_i
-            op = iq.data_o[0].oper_i
-            opi = iq.data_o[0].opim_i # immediate set
-
-            # set the src/dest regs
-            comb += sc.int_dest_i.eq(dest)
-            comb += sc.int_src1_i.eq(src1)
-            comb += sc.int_src2_i.eq(src2)
-            comb += sc.reg_enable_i.eq(1) # enable the regfile
-
-            # choose a Function-Unit-Group
-            with m.If((op & (0x3<<2)) != 0): # branch
-                comb += sc.br_oper_i.eq(Cat(op[0:2], opi))
-                comb += sc.br_imm_i.eq(imm)
-                comb += sc.brissue.insn_i.eq(1)
-                comb += wait_issue_br.eq(1)
-            with m.Elif((op & (0x3<<4)) != 0): # ld/st
-                # see compldst.py
-                # bit 0: ADD/SUB
-                # bit 1: immed
-                # bit 4: LD
-                # bit 5: ST
-                comb += sc.ls_oper_i.eq(Cat(op[0], opi[0], op[4:6]))
-                comb += sc.ls_imm_i.eq(imm)
-                comb += sc.lsissue.insn_i.eq(1)
-                comb += wait_issue_ls.eq(1)
-            with m.Else(): # alu
-                comb += sc.alu_oper_i.eq(Cat(op[0:2], opi))
-                comb += sc.alu_imm_i.eq(imm)
-                comb += sc.aluissue.insn_i.eq(1)
-                comb += wait_issue_alu.eq(1)
-
-            # XXX TODO
-            # these indicate that the instruction is to be made
-            # shadow-dependent on
-            # (either) branch success or branch fail
-            #yield sc.branch_fail_i.eq(branch_fail)
-            #yield sc.branch_succ_i.eq(branch_success)
-
-        return m
-
-    def __iter__(self):
-        yield self.p_ready_o
-        for o in self.data_i:
-            yield from list(o)
-        yield self.p_add_i
-
-    def ports(self):
-        return list(self)
-
-
-IADD = 0
-ISUB = 1
-IMUL = 2
-ISHF = 3
-IBGT = 4
-IBLT = 5
-IBEQ = 6
-IBNE = 7
-
-
-class RegSim:
-    def __init__(self, rwidth, nregs):
-        self.rwidth = rwidth
-        self.regs = [0] * nregs
-
-    def op(self, op, op_imm, imm, src1, src2, dest):
-        maxbits = (1 << self.rwidth) - 1
-        src1 = self.regs[src1] & maxbits
-        if op_imm:
-            src2 = imm
-        else:
-            src2 = self.regs[src2] & maxbits
-        if op == IADD:
-            val = src1 + src2
-        elif op == ISUB:
-            val = src1 - src2
-        elif op == IMUL:
-            val = src1 * src2
-        elif op == ISHF:
-            val = src1 >> (src2 & maxbits)
-        elif op == IBGT:
-            val = int(src1 > src2)
-        elif op == IBLT:
-            val = int(src1 < src2)
-        elif op == IBEQ:
-            val = int(src1 == src2)
-        elif op == IBNE:
-            val = int(src1 != src2)
-        else:
-            return 0 # LD/ST TODO
-        val &= maxbits
-        self.setval(dest, val)
-        return val
-
-    def setval(self, dest, val):
-        print ("sim setval", dest, hex(val))
-        self.regs[dest] = val
-
-    def dump(self, dut):
-        for i, val in enumerate(self.regs):
-            reg = yield dut.intregs.regs[i].reg
-            okstr = "OK" if reg == val else "!ok"
-            print("reg %d expected %x received %x %s" % (i, val, reg, okstr))
-
-    def check(self, dut):
-        for i, val in enumerate(self.regs):
-            reg = yield dut.intregs.regs[i].reg
-            if reg != val:
-                print("reg %d expected %x received %x\n" % (i, val, reg))
-                yield from self.dump(dut)
-                assert False
-
-def instr_q(dut, op, op_imm, imm, src1, src2, dest,
-            branch_success, branch_fail):
-    instrs = [{'oper_i': op, 'dest_i': dest, 'imm_i': imm, 'opim_i': op_imm,
-               'src1_i': src1, 'src2_i': src2}]
-
-    sendlen = 1
-    for idx in range(sendlen):
-        yield from eq(dut.data_i[idx], instrs[idx])
-        di = yield dut.data_i[idx]
-        print ("senddata %d %x" % (idx, di))
-    yield dut.p_add_i.eq(sendlen)
-    yield
-    o_p_ready = yield dut.p_ready_o
-    while not o_p_ready:
-        yield
-        o_p_ready = yield dut.p_ready_o
-
-    yield dut.p_add_i.eq(0)
-
-
-def int_instr(dut, op, imm, src1, src2, dest, branch_success, branch_fail):
-    yield from disable_issue(dut)
-    yield dut.int_dest_i.eq(dest)
-    yield dut.int_src1_i.eq(src1)
-    yield dut.int_src2_i.eq(src2)
-    if (op & (0x3<<2)) != 0: # branch
-        yield dut.brissue.insn_i.eq(1)
-        yield dut.br_oper_i.eq(Const(op & 0x3, 2))
-        yield dut.br_imm_i.eq(imm)
-        dut_issue = dut.brissue
-    else:
-        yield dut.aluissue.insn_i.eq(1)
-        yield dut.alu_oper_i.eq(Const(op & 0x3, 2))
-        yield dut.alu_imm_i.eq(imm)
-        dut_issue = dut.aluissue
-    yield dut.reg_enable_i.eq(1)
-
-    # these indicate that the instruction is to be made shadow-dependent on
-    # (either) branch success or branch fail
-    yield dut.branch_fail_i.eq(branch_fail)
-    yield dut.branch_succ_i.eq(branch_success)
-
-    yield
-    yield from wait_for_issue(dut, dut_issue)
-
-
-def print_reg(dut, rnums):
-    rs = []
-    for rnum in rnums:
-        reg = yield dut.intregs.regs[rnum].reg
-        rs.append("%x" % reg)
-    rnums = map(str, rnums)
-    print ("reg %s: %s" % (','.join(rnums), ','.join(rs)))
-
-
-def create_random_ops(dut, n_ops, shadowing=False, max_opnums=3):
-    insts = []
-    for i in range(n_ops):
-        src1 = randint(1, dut.n_regs-1)
-        src2 = randint(1, dut.n_regs-1)
-        imm = randint(1, (1<<dut.rwid)-1)
-        dest = randint(1, dut.n_regs-1)
-        op = randint(0, max_opnums)
-        opi = 0 if randint(0, 2) else 1 # set true if random is nonzero
-
-        if shadowing:
-            insts.append((src1, src2, dest, op, opi, imm, (0, 0)))
-        else:
-            insts.append((src1, src2, dest, op, opi, imm))
-    return insts
-
-
-def wait_for_busy_clear(dut):
-    while True:
-        busy_o = yield dut.busy_o
-        if not busy_o:
-            break
-        print ("busy",)
-        yield
-
-def disable_issue(dut):
-    yield dut.aluissue.insn_i.eq(0)
-    yield dut.brissue.insn_i.eq(0)
-    yield dut.lsissue.insn_i.eq(0)
-
-
-def wait_for_issue(dut, dut_issue):
-    while True:
-        issue_o = yield dut_issue.fn_issue_o
-        if issue_o:
-            yield from disable_issue(dut)
-            yield dut.reg_enable_i.eq(0)
-            break
-        print ("busy",)
-        #yield from print_reg(dut, [1,2,3])
-        yield
-    #yield from print_reg(dut, [1,2,3])
-
-def scoreboard_branch_sim(dut, alusim):
-
-    iseed = 3
-
-    for i in range(1):
-
-        print ("rseed", iseed)
-        seed(iseed)
-        iseed += 1
-
-        yield dut.branch_direction_o.eq(0)
-
-        # set random values in the registers
-        for i in range(1, dut.n_regs):
-            val = 31+i*3
-            val = randint(0, (1<<alusim.rwidth)-1)
-            yield dut.intregs.regs[i].reg.eq(val)
-            alusim.setval(i, val)
-
-        if False:
-            # create some instructions: branches create a tree
-            insts = create_random_ops(dut, 1, True, 1)
-            #insts.append((6, 6, 1, 2, (0, 0)))
-            #insts.append((4, 3, 3, 0, (0, 0)))
-
-            src1 = randint(1, dut.n_regs-1)
-            src2 = randint(1, dut.n_regs-1)
-            #op = randint(4, 7)
-            op = 4 # only BGT at the moment
-
-            branch_ok = create_random_ops(dut, 1, True, 1)
-            branch_fail = create_random_ops(dut, 1, True, 1)
-
-            insts.append((src1, src2, (branch_ok, branch_fail), op, (0, 0)))
-
-        if True:
-            insts = []
-            insts.append( (3, 5, 2, 0, (0, 0)) )
-            branch_ok = []
-            branch_fail = []
-            #branch_ok.append  ( (5, 7, 5, 1, (1, 0)) )
-            branch_ok.append( None )
-            branch_fail.append( (1, 1, 2, 0, (0, 1)) )
-            #branch_fail.append( None )
-            insts.append( (6, 4, (branch_ok, branch_fail), 4, (0, 0)) )
-
-        siminsts = deepcopy(insts)
-
-        # issue instruction(s)
-        i = -1
-        instrs = insts
-        branch_direction = 0
-        while instrs:
-            yield
-            yield
-            i += 1
-            branch_direction = yield dut.branch_direction_o # way branch went
-            (src1, src2, dest, op, (shadow_on, shadow_off)) = insts.pop(0)
-            if branch_direction == 1 and shadow_on:
-                print ("skip", i, src1, src2, dest, op, shadow_on, shadow_off)
-                continue # branch was "success" and this is a "failed"... skip
-            if branch_direction == 2 and shadow_off:
-                print ("skip", i, src1, src2, dest, op, shadow_on, shadow_off)
-                continue # branch was "fail" and this is a "success"... skip
-            if branch_direction != 0:
-                shadow_on = 0
-                shadow_off = 0
-            is_branch = op >= 4
-            if is_branch:
-                branch_ok, branch_fail = dest
-                dest = src2
-                # ok zip up the branch success / fail instructions and
-                # drop them into the queue, one marked "to have branch success"
-                # the other to be marked shadow branch "fail".
-                # one out of each of these will be cancelled
-                for ok, fl in zip(branch_ok, branch_fail):
-                    if ok:
-                        instrs.append((ok[0], ok[1], ok[2], ok[3], (1, 0)))
-                    if fl:
-                        instrs.append((fl[0], fl[1], fl[2], fl[3], (0, 1)))
-            print ("instr %d: (%d, %d, %d, %d, (%d, %d))" % \
-                            (i, src1, src2, dest, op, shadow_on, shadow_off))
-            yield from int_instr(dut, op, src1, src2, dest,
-                                 shadow_on, shadow_off)
-
-        # wait for all instructions to stop before checking
-        yield
-        yield from wait_for_busy_clear(dut)
-
-        i = -1
-        while siminsts:
-            instr = siminsts.pop(0)
-            if instr is None:
-                continue
-            (src1, src2, dest, op, (shadow_on, shadow_off)) = instr
-            i += 1
-            is_branch = op >= 4
-            if is_branch:
-                branch_ok, branch_fail = dest
-                dest = src2
-            print ("sim %d: (%d, %d, %d, %d, (%d, %d))" % \
-                            (i, src1, src2, dest, op, shadow_on, shadow_off))
-            branch_res = alusim.op(op, src1, src2, dest)
-            if is_branch:
-                if branch_res:
-                    siminsts += branch_ok
-                else:
-                    siminsts += branch_fail
-
-        # check status
-        yield from alusim.check(dut)
-        yield from alusim.dump(dut)
-
-
-def scoreboard_sim(dut, alusim):
-
-    seed(0)
-
-    for i in range(1):
-
-        # set random values in the registers
-        for i in range(1, dut.n_regs):
-            val = randint(0, (1<<alusim.rwidth)-1)
-            #val = 31+i*3
-            #val = i
-            yield dut.intregs.regs[i].reg.eq(val)
-            alusim.setval(i, val)
-
-        # create some instructions (some random, some regression tests)
-        instrs = []
-        if False:
-            instrs = create_random_ops(dut, 15, True, 4)
-
-        if False: # LD/ST test (with immediate)
-            instrs.append( (1, 2, 0, 0x10, 1, 1, (0, 0)) )
-            #instrs.append( (1, 2, 0, 0x10, 1, 1, (0, 0)) )
-
-        if True:
-            instrs.append( (1, 2, 2, 1, 1, 20, (0, 0)) )
-
-        if True:
-            instrs.append( (7, 3, 2, 4, 0, 0, (0, 0)) )
-            instrs.append( (7, 6, 6, 2, 0, 0, (0, 0)) )
-            instrs.append( (1, 7, 2, 2, 0, 0, (0, 0)) )
-
-        if True:
-            instrs.append((2, 3, 3, 0, 0, 0, (0, 0)))
-            instrs.append((5, 3, 3, 1, 0, 0, (0, 0)))
-            instrs.append((3, 5, 5, 2, 0, 0, (0, 0)))
-            instrs.append((5, 3, 3, 3, 0, 0, (0, 0)))
-            instrs.append((3, 5, 5, 0, 0, 0, (0, 0)))
-
-        if False:
-            instrs.append( (3, 3, 4, 0, 0, 13979, (0, 0)))
-            instrs.append( (6, 4, 1, 2, 0, 40976, (0, 0)))
-            instrs.append( (1, 4, 7, 4, 1, 23652, (0, 0)))
-
-        if False:
-            instrs.append((5, 6, 2, 1))
-            instrs.append((2, 2, 4, 0))
-            #instrs.append((2, 2, 3, 1))
-
-        if False:
-            instrs.append((2, 1, 2, 3))
-
-        if False:
-            instrs.append((2, 6, 2, 1))
-            instrs.append((2, 1, 2, 0))
-
-        if False:
-            instrs.append((1, 2, 7, 2))
-            instrs.append((7, 1, 5, 0))
-            instrs.append((4, 4, 1, 1))
-
-        if False:
-            instrs.append((5, 6, 2, 2))
-            instrs.append((1, 1, 4, 1))
-            instrs.append((6, 5, 3, 0))
-
-        if False:
-            # Write-after-Write Hazard
-            instrs.append( (3, 6, 7, 2) )
-            instrs.append( (4, 4, 7, 1) )
-
-        if False:
-            # self-read/write-after-write followed by Read-after-Write
-            instrs.append((1, 1, 1, 1))
-            instrs.append((1, 5, 3, 0))
-
-        if False:
-            # Read-after-Write followed by self-read-after-write
-            instrs.append((5, 6, 1, 2))
-            instrs.append((1, 1, 1, 1))
-
-        if False:
-            # self-read-write sandwich
-            instrs.append((5, 6, 1, 2))
-            instrs.append((1, 1, 1, 1))
-            instrs.append((1, 5, 3, 0))
-
-        if False:
-            # very weird failure
-            instrs.append( (5, 2, 5, 2) )
-            instrs.append( (2, 6, 3, 0) )
-            instrs.append( (4, 2, 2, 1) )
-
-        if False:
-            v1 = 4
-            yield dut.intregs.regs[5].reg.eq(v1)
-            alusim.setval(5, v1)
-            yield dut.intregs.regs[3].reg.eq(5)
-            alusim.setval(3, 5)
-            instrs.append((5, 3, 3, 4, (0, 0)))
-            instrs.append((4, 2, 1, 2, (0, 1)))
-
-        if False:
-            v1 = 6
-            yield dut.intregs.regs[5].reg.eq(v1)
-            alusim.setval(5, v1)
-            yield dut.intregs.regs[3].reg.eq(5)
-            alusim.setval(3, 5)
-            instrs.append((5, 3, 3, 4, (0, 0)))
-            instrs.append((4, 2, 1, 2, (1, 0)))
-
-        if False:
-            instrs.append( (4, 3, 5, 1, 0, (0, 0)) )
-            instrs.append( (5, 2, 3, 1, 0, (0, 0)) )
-            instrs.append( (7, 1, 5, 2, 0, (0, 0)) )
-            instrs.append( (5, 6, 6, 4, 0, (0, 0)) )
-            instrs.append( (7, 5, 2, 2, 0, (1, 0)) )
-            instrs.append( (1, 7, 5, 0, 0, (0, 1)) )
-            instrs.append( (1, 6, 1, 2, 0, (1, 0)) )
-            instrs.append( (1, 6, 7, 3, 0, (0, 0)) )
-            instrs.append( (6, 7, 7, 0, 0, (0, 0)) )
-
-        # issue instruction(s), wait for issue to be free before proceeding
-        for i, instr in enumerate(instrs):
-            src1, src2, dest, op, opi, imm, (br_ok, br_fail) = instr
-
-            print ("instr %d: (%d, %d, %d, %d, %d, %d)" % \
-                    (i, src1, src2, dest, op, opi, imm))
-            alusim.op(op, opi, imm, src1, src2, dest)
-            yield from instr_q(dut, op, opi, imm, src1, src2, dest,
-                               br_ok, br_fail)
-
-        # wait for all instructions to stop before checking
-        while True:
-            iqlen = yield dut.qlen_o
-            if iqlen == 0:
-                break
-            yield
-        yield
-        yield
-        yield
-        yield
-        yield from wait_for_busy_clear(dut)
-
-        # check status
-        yield from alusim.check(dut)
-        yield from alusim.dump(dut)
-
-
-def test_scoreboard():
-    dut = IssueToScoreboard(2, 1, 1, 16, 8, 8)
-    alusim = RegSim(16, 8)
-    memsim = MemSim(16, 16)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_scoreboard6600.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, scoreboard_sim(dut, alusim),
-                        vcd_name='test_scoreboard6600.vcd')
-
-    #run_simulation(dut, scoreboard_branch_sim(dut, alusim),
-    #                    vcd_name='test_scoreboard6600.vcd')
-
-
-if __name__ == '__main__':
-    test_scoreboard()
diff --git a/src/iommu/axi_rab/axi4_ar_buffer.py b/src/iommu/axi_rab/axi4_ar_buffer.py
deleted file mode 100644
index 1f3a5ff3..00000000
--- a/src/iommu/axi_rab/axi4_ar_buffer.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License.  You may obtain a copy of the License at
-# http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-# module axi4_ar_buffer
-#  #(
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_arid,
-#    input  logic               [31:0] s_axi4_araddr,
-#    input  logic                      s_axi4_arvalid,
-#    output logic                      s_axi4_arready,
-#    input  logic                [7:0] s_axi4_arlen,
-#    input  logic                [2:0] s_axi4_arsize,
-#    input  logic                [1:0] s_axi4_arburst,
-#    input  logic                      s_axi4_arlock,
-#    input  logic                [2:0] s_axi4_arprot,
-#    input  logic                [3:0] s_axi4_arcache,
-#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_arid,
-#    output logic               [31:0] m_axi4_araddr,
-#    output logic                      m_axi4_arvalid,
-#    input  logic                      m_axi4_arready,
-#    output logic                [7:0] m_axi4_arlen,
-#    output logic                [2:0] m_axi4_arsize,
-#    output logic                [1:0] m_axi4_arburst,
-#    output logic                      m_axi4_arlock,
-#    output logic                [2:0] m_axi4_arprot,
-#    output logic                [3:0] m_axi4_arcache,
-#    output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
-#  );
-
-
-class axi4_ar_buffer(Elaboratable):
-
-    def __init__(self):
-        # self.axi4_aclk = Signal() # input
-        # self.axi4_arstn = Signal() # input
-        self.s_axi4_arid = Signal(AXI_ID_WIDTH)  # input
-        self.s_axi4_araddr = Signal(32)  # input
-        self.s_axi4_arvalid = Signal()  # input
-        self.s_axi4_arready = Signal()  # output
-        self.s_axi4_arlen = Signal(8)  # input
-        self.s_axi4_arsize = Signal(3)  # input
-        self.s_axi4_arburst = Signal(2)  # input
-        self.s_axi4_arlock = Signal()  # input
-        self.s_axi4_arprot = Signal(3)  # input
-        self.s_axi4_arcache = Signal(4)  # input
-        self.s_axi4_aruser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_arid = Signal(AXI_ID_WIDTH)  # output
-        self.m_axi4_araddr = Signal(32)  # output
-        self.m_axi4_arvalid = Signal()  # output
-        self.m_axi4_arready = Signal()  # input
-        self.m_axi4_arlen = Signal(8)  # output
-        self.m_axi4_arsize = Signal(3)  # output
-        self.m_axi4_arburst = Signal(2)  # output
-        self.m_axi4_arlock = Signal()  # output
-        self.m_axi4_arprot = Signal(3)  # output
-        self.m_axi4_arcache = Signal(4)  # output
-        self.m_axi4_aruser = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        #  #TODO use record types here
-        #  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_in;
-        #  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_out;
-
-        # assign data_in                                           [3:0] = s_axi4_arcache;
-        # assign data_in                                           [6:4] = s_axi4_arprot;
-        # assign data_in                                             [7] = s_axi4_arlock;
-        # assign data_in                                           [9:8] = s_axi4_arburst;
-        # assign data_in                                         [12:10] = s_axi4_arsize;
-        # assign data_in                                         [20:13] = s_axi4_arlen;
-        # assign data_in                                         [52:21] = s_axi4_araddr;
-        # assign data_in                            [52+AXI_ID_WIDTH:53] = s_axi4_arid;
-        # assign data_in[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH] = s_axi4_aruser;
-        #
-        # assign m_axi4_arcache = data_out[3:0];
-        # assign m_axi4_arprot  = data_out[6:4];
-        # assign m_axi4_arlock  = data_out[7];
-        # assign m_axi4_arburst = data_out[9:8];
-        # assign m_axi4_arsize  = data_out[12:10];
-        # assign m_axi4_arlen   = data_out[20:13];
-        # assign m_axi4_araddr  = data_out[52:21];
-        # assign m_axi4_arid    = data_out[52+AXI_ID_WIDTH:53];
-        # assign m_axi4_aruser  = data_out[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH];
-
-        # m.d.comb += self.m_axi4_arcache.eq(..)
-        # m.d.comb += self.m_axi4_arprot.eq(..)
-        # m.d.comb += self.m_axi4_arlock.eq(..)
-        # m.d.comb += self.m_axi4_arburst.eq(..)
-        # m.d.comb += self.m_axi4_arsize.eq(..)
-        # m.d.comb += self.m_axi4_arlen.eq(..)
-        # m.d.comb += self.m_axi4_araddr.eq(..)
-        # m.d.comb += self.m_axi4_arid.eq(..)
-        # m.d.comb += self.m_axi4_aruser.eq(..)
-        return m
-
-# TODO convert axi_buffer_rab.sv
-#
-#  axi_buffer_rab
-#    #(
-#      .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+53  ),
-#      .BUFFER_DEPTH ( 4                               )
-#      )
-#    u_buffer
-#    (
-#      .clk       ( axi4_aclk      ),
-#      .rstn      ( axi4_arstn     ),
-#      .valid_out ( m_axi4_arvalid ),
-#      .data_out  ( data_out       ),
-#      .ready_in  ( m_axi4_arready ),
-#      .valid_in  ( s_axi4_arvalid ),
-#      .data_in   ( data_in        ),
-#      .ready_out ( s_axi4_arready )
-#    );
-#
-
-# endmodule
diff --git a/src/iommu/axi_rab/axi4_ar_sender.py b/src/iommu/axi_rab/axi4_ar_sender.py
deleted file mode 100644
index 4cbd97d5..00000000
--- a/src/iommu/axi_rab/axi4_ar_sender.py
+++ /dev/null
@@ -1,232 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_ar_sender(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.l1_done_o = Signal()  # output
-        self.l1_accept_i = Signal()  # input
-        self.l1_drop_i = Signal()  # input
-        self.l1_save_i = Signal()  # input
-        self.l2_done_o = Signal()  # output
-        self.l2_accept_i = Signal()  # input
-        self.l2_drop_i = Signal()  # input
-        self.l2_sending_o = Signal()  # output
-        self.l1_araddr_i = Signal(AXI_ADDR_WIDTH)  # input
-        self.l2_araddr_i = Signal(AXI_ADDR_WIDTH)  # input
-        self.s_axi4_arid = Signal(AXI_ID_WIDTH)  # input
-        self.s_axi4_arvalid = Signal()  # input
-        self.s_axi4_arready = Signal()  # output
-        self.s_axi4_arlen = Signal(8)  # input
-        self.s_axi4_arsize = Signal(3)  # input
-        self.s_axi4_arburst = Signal(2)  # input
-        self.s_axi4_arlock = Signal()  # input
-        self.s_axi4_arprot = Signal(3)  # input
-        self.s_axi4_arcache = Signal(4)  # input
-        self.s_axi4_aruser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_arid = Signal(AXI_ID_WIDTH)  # output
-        self.m_axi4_araddr = Signal(AXI_ADDR_WIDTH)  # output
-        self.m_axi4_arvalid = Signal()  # output
-        self.m_axi4_arready = Signal()  # input
-        self.m_axi4_arlen = Signal(8)  # output
-        self.m_axi4_arsize = Signal(3)  # output
-        self.m_axi4_arburst = Signal(2)  # output
-        self.m_axi4_arlock = Signal()  # output
-        self.m_axi4_arprot = Signal(3)  # output
-        self.m_axi4_arcache = Signal(4)  # output
-        self.m_axi4_aruser = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.l1_save.eq(self.None)
-        m.d.comb += self.l1_done_o.eq(self.None)
-        m.d.comb += self.m_axi4_arvalid.eq(self.None)
-        m.d.comb += self.s_axi4_arready.eq(self.None)
-        m.d.comb += self.m_axi4_aruser.eq(self.None)
-        m.d.comb += self.m_axi4_arcache.eq(self.None)
-        m.d.comb += self.m_axi4_arprot.eq(self.None)
-        m.d.comb += self.m_axi4_arlock.eq(self.None)
-        m.d.comb += self.m_axi4_arburst.eq(self.None)
-        m.d.comb += self.m_axi4_arsize.eq(self.None)
-        m.d.comb += self.m_axi4_arlen.eq(self.None)
-        m.d.comb += self.m_axi4_araddr.eq(self.None)
-        m.d.comb += self.m_axi4_arid.eq(self.None)
-        m.d.comb += self.l2_sending_o.eq(self.None)
-        m.d.comb += self.l2_sent.eq(self.None)
-        m.d.comb += self.l2_done_o.eq(self.None)
-        m.d.comb += self.m_axi4_aruser.eq(self.s_axi4_aruser)
-        m.d.comb += self.m_axi4_arcache.eq(self.s_axi4_arcache)
-        m.d.comb += self.m_axi4_arprot.eq(self.s_axi4_arprot)
-        m.d.comb += self.m_axi4_arlock.eq(self.s_axi4_arlock)
-        m.d.comb += self.m_axi4_arburst.eq(self.s_axi4_arburst)
-        m.d.comb += self.m_axi4_arsize.eq(self.s_axi4_arsize)
-        m.d.comb += self.m_axi4_arlen.eq(self.s_axi4_arlen)
-        m.d.comb += self.m_axi4_araddr.eq(self.l1_araddr_i)
-        m.d.comb += self.m_axi4_arid.eq(self.s_axi4_arid)
-        m.d.comb += self.l2_sending_o.eq(self.1: 'b0)
-        m.d.comb += self.l2_available_q.eq(self.1: 'b0)
-        m.d.comb += self.l2_done_o.eq(self.1: 'b0)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_ar_sender
-#  #(
-#    parameter AXI_ADDR_WIDTH = 40,
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4,
-#    parameter ENABLE_L2TLB   = 0
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    output logic                      l1_done_o,
-#    input  logic                      l1_accept_i,
-#    input  logic                      l1_drop_i,
-#    input  logic                      l1_save_i,
-#
-#    output logic                      l2_done_o,
-#    input  logic                      l2_accept_i,
-#    input  logic                      l2_drop_i,
-#    output logic                      l2_sending_o,
-#
-#    input  logic [AXI_ADDR_WIDTH-1:0] l1_araddr_i,
-#    input  logic [AXI_ADDR_WIDTH-1:0] l2_araddr_i,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_arid,
-#    input  logic                      s_axi4_arvalid,
-#    output logic                      s_axi4_arready,
-#    input  logic                [7:0] s_axi4_arlen,
-#    input  logic                [2:0] s_axi4_arsize,
-#    input  logic                [1:0] s_axi4_arburst,
-#    input  logic                      s_axi4_arlock,
-#    input  logic                [2:0] s_axi4_arprot,
-#    input  logic                [3:0] s_axi4_arcache,
-#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_arid,
-#    output logic [AXI_ADDR_WIDTH-1:0] m_axi4_araddr,
-#    output logic                      m_axi4_arvalid,
-#    input  logic                      m_axi4_arready,
-#    output logic                [7:0] m_axi4_arlen,
-#    output logic                [2:0] m_axi4_arsize,
-#    output logic                [1:0] m_axi4_arburst,
-#    output logic                      m_axi4_arlock,
-#    output logic                [2:0] m_axi4_arprot,
-#    output logic                [3:0] m_axi4_arcache,
-#    output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
-#  );
-#
-#  logic l1_save;
-#
-#  logic l2_sent;
-#  logic l2_available_q;
-#
-#  assign l1_save      = l1_save_i & l2_available_q;
-#
-#  assign l1_done_o    = s_axi4_arvalid & s_axi4_arready ;
-#
-#  // if 1: accept and forward a transaction translated by L1
-#  //    2: drop or save request (if L2 slot not occupied already)
-#  assign m_axi4_arvalid = (s_axi4_arvalid & l1_accept_i) |
-#                          l2_sending_o;
-#  assign s_axi4_arready = (m_axi4_arvalid & m_axi4_arready & ~l2_sending_o) |
-#                          (s_axi4_arvalid & (l1_drop_i | l1_save));
-#
-# generate
-#  if (ENABLE_L2TLB == 1) begin
-#    logic [AXI_USER_WIDTH-1:0] l2_axi4_aruser  ;
-#    logic                [3:0] l2_axi4_arcache ;
-#    logic                [3:0] l2_axi4_arregion;
-#    logic                [3:0] l2_axi4_arqos   ;
-#    logic                [2:0] l2_axi4_arprot  ;
-#    logic                      l2_axi4_arlock  ;
-#    logic                [1:0] l2_axi4_arburst ;
-#    logic                [2:0] l2_axi4_arsize  ;
-#    logic                [7:0] l2_axi4_arlen   ;
-#    logic   [AXI_ID_WIDTH-1:0] l2_axi4_arid    ;
-#
-#    assign m_axi4_aruser  = l2_sending_o ? l2_axi4_aruser   : s_axi4_aruser;
-#    assign m_axi4_arcache = l2_sending_o ? l2_axi4_arcache  : s_axi4_arcache;
-#    assign m_axi4_arprot  = l2_sending_o ? l2_axi4_arprot   : s_axi4_arprot;
-#    assign m_axi4_arlock  = l2_sending_o ? l2_axi4_arlock   : s_axi4_arlock;
-#    assign m_axi4_arburst = l2_sending_o ? l2_axi4_arburst  : s_axi4_arburst;
-#    assign m_axi4_arsize  = l2_sending_o ? l2_axi4_arsize   : s_axi4_arsize;
-#    assign m_axi4_arlen   = l2_sending_o ? l2_axi4_arlen    : s_axi4_arlen;
-#    assign m_axi4_araddr  = l2_sending_o ? l2_araddr_i      : l1_araddr_i;
-#    assign m_axi4_arid    = l2_sending_o ? l2_axi4_arid     : s_axi4_arid;
-#
-#    // Buffer AXI signals in case of L1 miss
-#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
-#      if (axi4_arstn == 1'b0) begin
-#        l2_axi4_aruser  <=  'b0;
-#        l2_axi4_arcache <=  'b0;
-#        l2_axi4_arprot  <=  'b0;
-#        l2_axi4_arlock  <= 1'b0;
-#        l2_axi4_arburst <=  'b0;
-#        l2_axi4_arsize  <=  'b0;
-#        l2_axi4_arlen   <=  'b0;
-#        l2_axi4_arid    <=  'b0;
-#      end else if (l1_save) begin
-#        l2_axi4_aruser  <= s_axi4_aruser;
-#        l2_axi4_arcache <= s_axi4_arcache;
-#        l2_axi4_arprot  <= s_axi4_arprot;
-#        l2_axi4_arlock  <= s_axi4_arlock;
-#        l2_axi4_arburst <= s_axi4_arburst;
-#        l2_axi4_arsize  <= s_axi4_arsize;
-#        l2_axi4_arlen   <= s_axi4_arlen;
-#        l2_axi4_arid    <= s_axi4_arid;
-#      end
-#    end
-#
-#    // signal that an l1_save_i can be accepted
-#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
-#      if (axi4_arstn == 1'b0) begin
-#        l2_available_q <= 1'b1;
-#      end else if (l2_sent | l2_drop_i) begin
-#        l2_available_q <= 1'b1;
-#      end else if (l1_save) begin
-#        l2_available_q <= 1'b0;
-#      end
-#    end
-#
-#    assign l2_sending_o = l2_accept_i & ~l2_available_q;
-#    assign l2_sent      = l2_sending_o & m_axi4_arvalid & m_axi4_arready;
-#
-#    // if 1: having sent out a transaction translated by L2
-#    //    2: drop request (L2 slot is available again)
-#    assign l2_done_o    = l2_sent | l2_drop_i;
-#
-#  end else begin // !`ifdef ENABLE_L2TLB
-#    assign m_axi4_aruser  =  s_axi4_aruser;
-#    assign m_axi4_arcache =  s_axi4_arcache;
-#    assign m_axi4_arprot  =  s_axi4_arprot;
-#    assign m_axi4_arlock  =  s_axi4_arlock;
-#    assign m_axi4_arburst =  s_axi4_arburst;
-#    assign m_axi4_arsize  =  s_axi4_arsize;
-#    assign m_axi4_arlen   =  s_axi4_arlen;
-#    assign m_axi4_araddr  =  l1_araddr_i;
-#    assign m_axi4_arid    =  s_axi4_arid;
-#
-#    assign l2_sending_o   = 1'b0;
-#    assign l2_available_q = 1'b0;
-#    assign l2_done_o      = 1'b0;
-#  end // else: !if(ENABLE_L2TLB == 1)
-# endgenerate
-#
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/axi4_aw_buffer.py b/src/iommu/axi_rab/axi4_aw_buffer.py
deleted file mode 100644
index f5ca37d1..00000000
--- a/src/iommu/axi_rab/axi4_aw_buffer.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_aw_buffer(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.s_axi4_awid = Signal(AXI_ID_WIDTH)  # input
-        self.s_axi4_awaddr = Signal(32)  # input
-        self.s_axi4_awvalid = Signal()  # input
-        self.s_axi4_awready = Signal()  # output
-        self.s_axi4_awlen = Signal(8)  # input
-        self.s_axi4_awsize = Signal(3)  # input
-        self.s_axi4_awburst = Signal(2)  # input
-        self.s_axi4_awlock = Signal()  # input
-        self.s_axi4_awprot = Signal(3)  # input
-        self.s_axi4_awcache = Signal(4)  # input
-        self.s_axi4_awregion = Signal(4)  # input
-        self.s_axi4_awqos = Signal(4)  # input
-        self.s_axi4_awuser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_awid = Signal(AXI_ID_WIDTH)  # output
-        self.m_axi4_awaddr = Signal(32)  # output
-        self.m_axi4_awvalid = Signal()  # output
-        self.m_axi4_awready = Signal()  # input
-        self.m_axi4_awlen = Signal(8)  # output
-        self.m_axi4_awsize = Signal(3)  # output
-        self.m_axi4_awburst = Signal(2)  # output
-        self.m_axi4_awlock = Signal()  # output
-        self.m_axi4_awprot = Signal(3)  # output
-        self.m_axi4_awcache = Signal(4)  # output
-        self.m_axi4_awregion = Signal(4)  # output
-        self.m_axi4_awqos = Signal(4)  # output
-        self.m_axi4_awuser = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.None.eq(self.s_axi4_awcache)
-        m.d.comb += self.None.eq(self.s_axi4_awprot)
-        m.d.comb += self.None.eq(self.s_axi4_awlock)
-        m.d.comb += self.None.eq(self.s_axi4_awburst)
-        m.d.comb += self.None.eq(self.s_axi4_awsize)
-        m.d.comb += self.None.eq(self.s_axi4_awlen)
-        m.d.comb += self.None.eq(self.s_axi4_awaddr)
-        m.d.comb += self.None.eq(self.s_axi4_awregion)
-        m.d.comb += self.None.eq(self.s_axi4_awqos)
-        m.d.comb += self.None.eq(self.s_axi4_awid)
-        m.d.comb += self.None.eq(self.s_axi4_awuser)
-        m.d.comb += self.m_axi4_awcache.eq(self.None)
-        m.d.comb += self.m_axi4_awprot.eq(self.None)
-        m.d.comb += self.m_axi4_awlock.eq(self.None)
-        m.d.comb += self.m_axi4_awburst.eq(self.None)
-        m.d.comb += self.m_axi4_awsize.eq(self.None)
-        m.d.comb += self.m_axi4_awlen.eq(self.None)
-        m.d.comb += self.m_axi4_awaddr.eq(self.None)
-        m.d.comb += self.m_axi4_awregion.eq(self.None)
-        m.d.comb += self.m_axi4_awqos.eq(self.None)
-        m.d.comb += self.m_axi4_awid.eq(self.None)
-        m.d.comb += self.m_axi4_awuser.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_aw_buffer
-#  #(
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_awid,
-#    input  logic               [31:0] s_axi4_awaddr,
-#    input  logic                      s_axi4_awvalid,
-#    output logic                      s_axi4_awready,
-#    input  logic                [7:0] s_axi4_awlen,
-#    input  logic                [2:0] s_axi4_awsize,
-#    input  logic                [1:0] s_axi4_awburst,
-#    input  logic                      s_axi4_awlock,
-#    input  logic                [2:0] s_axi4_awprot,
-#    input  logic                [3:0] s_axi4_awcache,
-#    input  logic                [3:0] s_axi4_awregion,
-#    input  logic                [3:0] s_axi4_awqos,
-#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_awid,
-#    output logic               [31:0] m_axi4_awaddr,
-#    output logic                      m_axi4_awvalid,
-#    input  logic                      m_axi4_awready,
-#    output logic                [7:0] m_axi4_awlen,
-#    output logic                [2:0] m_axi4_awsize,
-#    output logic                [1:0] m_axi4_awburst,
-#    output logic                      m_axi4_awlock,
-#    output logic                [2:0] m_axi4_awprot,
-#    output logic                [3:0] m_axi4_awcache,
-#    output logic                [3:0] m_axi4_awregion,
-#    output logic                [3:0] m_axi4_awqos,
-#    output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
-#  );
-#
-#  wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_in;
-#  wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_out;
-#
-#  assign data_in                                            [3:0] = s_axi4_awcache;
-#  assign data_in                                            [6:4] = s_axi4_awprot;
-#  assign data_in                                              [7] = s_axi4_awlock;
-#  assign data_in                                            [9:8] = s_axi4_awburst;
-#  assign data_in                                          [12:10] = s_axi4_awsize;
-#  assign data_in                                          [20:13] = s_axi4_awlen;
-#  assign data_in                                          [52:21] = s_axi4_awaddr;
-#  assign data_in                                          [56:53] = s_axi4_awregion;
-#  assign data_in                                          [60:57] = s_axi4_awqos;
-#  assign data_in                             [60+AXI_ID_WIDTH:61] = s_axi4_awid;
-#  assign data_in [60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH] = s_axi4_awuser;
-#
-#  assign m_axi4_awcache  = data_out[3:0];
-#  assign m_axi4_awprot   = data_out[6:4];
-#  assign m_axi4_awlock   = data_out[7];
-#  assign m_axi4_awburst  = data_out[9:8];
-#  assign m_axi4_awsize   = data_out[12:10];
-#  assign m_axi4_awlen    = data_out[20:13];
-#  assign m_axi4_awaddr   = data_out[52:21];
-#  assign m_axi4_awregion = data_out[56:53];
-#  assign m_axi4_awqos    = data_out[60:57];
-#  assign m_axi4_awid     = data_out[60+AXI_ID_WIDTH:61];
-#  assign m_axi4_awuser   = data_out[60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH];
-#
-#  axi_buffer_rab
-#    #(
-#      .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+61  ),
-#      .BUFFER_DEPTH ( 4                               )
-#    )
-#    u_buffer
-#    (
-#      .clk       ( axi4_aclk      ),
-#      .rstn      ( axi4_arstn     ),
-#      .valid_out ( m_axi4_awvalid ),
-#      .data_out  ( data_out       ),
-#      .ready_in  ( m_axi4_awready ),
-#      .valid_in  ( s_axi4_awvalid ),
-#      .data_in   ( data_in        ),
-#      .ready_out ( s_axi4_awready )
-#    );
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/axi4_aw_sender.py b/src/iommu/axi_rab/axi4_aw_sender.py
deleted file mode 100644
index fbc917df..00000000
--- a/src/iommu/axi_rab/axi4_aw_sender.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_aw_sender(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.l1_done_o = Signal()  # output
-        self.l1_accept_i = Signal()  # input
-        self.l1_drop_i = Signal()  # input
-        self.l1_save_i = Signal()  # input
-        self.l2_done_o = Signal()  # output
-        self.l2_accept_i = Signal()  # input
-        self.l2_drop_i = Signal()  # input
-        self.l2_sending_o = Signal()  # output
-        self.l1_awaddr_i = Signal(AXI_ADDR_WIDTH)  # input
-        self.l2_awaddr_i = Signal(AXI_ADDR_WIDTH)  # input
-        self.s_axi4_awid = Signal(AXI_ID_WIDTH)  # input
-        self.s_axi4_awvalid = Signal()  # input
-        self.s_axi4_awready = Signal()  # output
-        self.s_axi4_awlen = Signal(8)  # input
-        self.s_axi4_awsize = Signal(3)  # input
-        self.s_axi4_awburst = Signal(2)  # input
-        self.s_axi4_awlock = Signal()  # input
-        self.s_axi4_awprot = Signal(3)  # input
-        self.s_axi4_awcache = Signal(4)  # input
-        self.s_axi4_awregion = Signal(4)  # input
-        self.s_axi4_awqos = Signal(4)  # input
-        self.s_axi4_awuser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_awid = Signal(AXI_ID_WIDTH)  # output
-        self.m_axi4_awaddr = Signal(AXI_ADDR_WIDTH)  # output
-        self.m_axi4_awvalid = Signal()  # output
-        self.m_axi4_awready = Signal()  # input
-        self.m_axi4_awlen = Signal(8)  # output
-        self.m_axi4_awsize = Signal(3)  # output
-        self.m_axi4_awburst = Signal(2)  # output
-        self.m_axi4_awlock = Signal()  # output
-        self.m_axi4_awprot = Signal(3)  # output
-        self.m_axi4_awcache = Signal(4)  # output
-        self.m_axi4_awregion = Signal(4)  # output
-        self.m_axi4_awqos = Signal(4)  # output
-        self.m_axi4_awuser = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.l1_save.eq(self.None)
-        m.d.comb += self.l1_done_o.eq(self.None)
-        m.d.comb += self.m_axi4_awvalid.eq(self.None)
-        m.d.comb += self.s_axi4_awready.eq(self.None)
-        m.d.comb += self.m_axi4_awuser.eq(self.None)
-        m.d.comb += self.m_axi4_awcache.eq(self.None)
-        m.d.comb += self.m_axi4_awregion.eq(self.None)
-        m.d.comb += self.m_axi4_awqos.eq(self.None)
-        m.d.comb += self.m_axi4_awprot.eq(self.None)
-        m.d.comb += self.m_axi4_awlock.eq(self.None)
-        m.d.comb += self.m_axi4_awburst.eq(self.None)
-        m.d.comb += self.m_axi4_awsize.eq(self.None)
-        m.d.comb += self.m_axi4_awlen.eq(self.None)
-        m.d.comb += self.m_axi4_awaddr.eq(self.None)
-        m.d.comb += self.m_axi4_awid.eq(self.None)
-        m.d.comb += self.l2_sending_o.eq(self.None)
-        m.d.comb += self.l2_sent.eq(self.None)
-        m.d.comb += self.l2_done_o.eq(self.None)
-        m.d.comb += self.m_axi4_awuser.eq(self.s_axi4_awuser)
-        m.d.comb += self.m_axi4_awcache.eq(self.s_axi4_awcache)
-        m.d.comb += self.m_axi4_awregion.eq(self.s_axi4_awregion)
-        m.d.comb += self.m_axi4_awqos.eq(self.s_axi4_awqos)
-        m.d.comb += self.m_axi4_awprot.eq(self.s_axi4_awprot)
-        m.d.comb += self.m_axi4_awlock.eq(self.s_axi4_awlock)
-        m.d.comb += self.m_axi4_awburst.eq(self.s_axi4_awburst)
-        m.d.comb += self.m_axi4_awsize.eq(self.s_axi4_awsize)
-        m.d.comb += self.m_axi4_awlen.eq(self.s_axi4_awlen)
-        m.d.comb += self.m_axi4_awaddr.eq(self.l1_awaddr_i)
-        m.d.comb += self.m_axi4_awid.eq(self.s_axi4_awid)
-        m.d.comb += self.l2_sending_o.eq(self.1: 'b0)
-        m.d.comb += self.l2_available_q.eq(self.1: 'b0)
-        m.d.comb += self.l2_done_o.eq(self.1: 'b0)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_aw_sender
-#  #(
-#    parameter AXI_ADDR_WIDTH   = 40,
-#    parameter AXI_ID_WIDTH     = 4,
-#    parameter AXI_USER_WIDTH   = 4,
-#    parameter ENABLE_L2TLB     = 0
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    output logic                      l1_done_o,
-#    input  logic                      l1_accept_i,
-#    input  logic                      l1_drop_i,
-#    input  logic                      l1_save_i,
-#
-#    output logic                      l2_done_o,
-#    input  logic                      l2_accept_i,
-#    input  logic                      l2_drop_i,
-#    output logic                      l2_sending_o,
-#
-#    input  logic [AXI_ADDR_WIDTH-1:0] l1_awaddr_i,
-#    input  logic [AXI_ADDR_WIDTH-1:0] l2_awaddr_i,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_awid,
-#    input  logic                      s_axi4_awvalid,
-#    output logic                      s_axi4_awready,
-#    input  logic                [7:0] s_axi4_awlen,
-#    input  logic                [2:0] s_axi4_awsize,
-#    input  logic                [1:0] s_axi4_awburst,
-#    input  logic                      s_axi4_awlock,
-#    input  logic                [2:0] s_axi4_awprot,
-#    input  logic                [3:0] s_axi4_awcache,
-#    input  logic                [3:0] s_axi4_awregion,
-#    input  logic                [3:0] s_axi4_awqos,
-#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_awid,
-#    output logic [AXI_ADDR_WIDTH-1:0] m_axi4_awaddr,
-#    output logic                      m_axi4_awvalid,
-#    input  logic                      m_axi4_awready,
-#    output logic                [7:0] m_axi4_awlen,
-#    output logic                [2:0] m_axi4_awsize,
-#    output logic                [1:0] m_axi4_awburst,
-#    output logic                      m_axi4_awlock,
-#    output logic                [2:0] m_axi4_awprot,
-#    output logic                [3:0] m_axi4_awcache,
-#    output logic                [3:0] m_axi4_awregion,
-#    output logic                [3:0] m_axi4_awqos,
-#    output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
-#  );
-#
-#  logic l1_save;
-#
-#  logic l2_sent;
-#  logic l2_available_q;
-#
-#  assign l1_save      = l1_save_i & l2_available_q;
-#
-#  assign l1_done_o    = s_axi4_awvalid & s_axi4_awready ;
-#
-#  // if 1: accept and forward a transaction translated by L1
-#  //    2: drop or save request (if L2 slot not occupied already)
-#  assign m_axi4_awvalid = (s_axi4_awvalid & l1_accept_i) |
-#                          l2_sending_o;
-#  assign s_axi4_awready = (m_axi4_awvalid & m_axi4_awready & ~l2_sending_o) |
-#                          (s_axi4_awvalid & (l1_drop_i | l1_save));
-#
-# generate
-#  if (ENABLE_L2TLB    == 1) begin
-#    logic [AXI_USER_WIDTH-1:0] l2_axi4_awuser  ;
-#    logic                [3:0] l2_axi4_awcache ;
-#    logic                [3:0] l2_axi4_awregion;
-#    logic                [3:0] l2_axi4_awqos   ;
-#    logic                [2:0] l2_axi4_awprot  ;
-#    logic                      l2_axi4_awlock  ;
-#    logic                [1:0] l2_axi4_awburst ;
-#    logic                [2:0] l2_axi4_awsize  ;
-#    logic                [7:0] l2_axi4_awlen   ;
-#    logic   [AXI_ID_WIDTH-1:0] l2_axi4_awid    ;
-#
-#    assign m_axi4_awuser   = l2_sending_o ? l2_axi4_awuser   : s_axi4_awuser;
-#    assign m_axi4_awcache  = l2_sending_o ? l2_axi4_awcache  : s_axi4_awcache;
-#    assign m_axi4_awregion = l2_sending_o ? l2_axi4_awregion : s_axi4_awregion;
-#    assign m_axi4_awqos    = l2_sending_o ? l2_axi4_awqos    : s_axi4_awqos;
-#    assign m_axi4_awprot   = l2_sending_o ? l2_axi4_awprot   : s_axi4_awprot;
-#    assign m_axi4_awlock   = l2_sending_o ? l2_axi4_awlock   : s_axi4_awlock;
-#    assign m_axi4_awburst  = l2_sending_o ? l2_axi4_awburst  : s_axi4_awburst;
-#    assign m_axi4_awsize   = l2_sending_o ? l2_axi4_awsize   : s_axi4_awsize;
-#    assign m_axi4_awlen    = l2_sending_o ? l2_axi4_awlen    : s_axi4_awlen;
-#    assign m_axi4_awaddr   = l2_sending_o ? l2_awaddr_i      : l1_awaddr_i;
-#    assign m_axi4_awid     = l2_sending_o ? l2_axi4_awid     : s_axi4_awid;
-#
-#    // buffer AXI signals in case of L1 miss
-#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
-#      if (axi4_arstn == 1'b0) begin
-#        l2_axi4_awuser   <=  'b0;
-#        l2_axi4_awcache  <=  'b0;
-#        l2_axi4_awregion <=  'b0;
-#        l2_axi4_awqos    <=  'b0;
-#        l2_axi4_awprot   <=  'b0;
-#        l2_axi4_awlock   <= 1'b0;
-#        l2_axi4_awburst  <=  'b0;
-#        l2_axi4_awsize   <=  'b0;
-#        l2_axi4_awlen    <=  'b0;
-#        l2_axi4_awid     <=  'b0;
-#      end else if (l1_save) begin
-#        l2_axi4_awuser   <= s_axi4_awuser;
-#        l2_axi4_awcache  <= s_axi4_awcache;
-#        l2_axi4_awregion <= s_axi4_awregion;
-#        l2_axi4_awqos    <= s_axi4_awqos;
-#        l2_axi4_awprot   <= s_axi4_awprot;
-#        l2_axi4_awlock   <= s_axi4_awlock;
-#        l2_axi4_awburst  <= s_axi4_awburst;
-#        l2_axi4_awsize   <= s_axi4_awsize;
-#        l2_axi4_awlen    <= s_axi4_awlen;
-#        l2_axi4_awid     <= s_axi4_awid;
-#      end
-#    end
-#
-#    // signal that an l1_save_i can be accepted
-#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
-#      if (axi4_arstn == 1'b0) begin
-#        l2_available_q <= 1'b1;
-#      end else if (l2_sent | l2_drop_i) begin
-#        l2_available_q <= 1'b1;
-#      end else if (l1_save) begin
-#        l2_available_q <= 1'b0;
-#      end
-#    end
-#
-#    assign l2_sending_o = l2_accept_i & ~l2_available_q;
-#    assign l2_sent      = l2_sending_o & m_axi4_awvalid & m_axi4_awready;
-#
-#    // if 1: having sent out a transaction translated by L2
-#    //    2: drop request (L2 slot is available again)
-#    assign l2_done_o    = l2_sent | l2_drop_i;
-#
-#  end else begin // !`ifdef ENABLE_L2TLB
-#    assign m_axi4_awuser   =  s_axi4_awuser;
-#    assign m_axi4_awcache  =  s_axi4_awcache;
-#    assign m_axi4_awregion =  s_axi4_awregion;
-#    assign m_axi4_awqos    =  s_axi4_awqos;
-#    assign m_axi4_awprot   =  s_axi4_awprot;
-#    assign m_axi4_awlock   =  s_axi4_awlock;
-#    assign m_axi4_awburst  =  s_axi4_awburst;
-#    assign m_axi4_awsize   =  s_axi4_awsize;
-#    assign m_axi4_awlen    =  s_axi4_awlen;
-#    assign m_axi4_awaddr   =  l1_awaddr_i;
-#    assign m_axi4_awid     =  s_axi4_awid;
-#
-#    assign l2_sending_o    = 1'b0;
-#    assign l2_available_q  = 1'b0;
-#    assign l2_done_o       = 1'b0;
-#  end // !`ifdef ENABLE_L2TLB
-# endgenerate
-#
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/axi4_b_buffer.py b/src/iommu/axi_rab/axi4_b_buffer.py
deleted file mode 100644
index 42fce1ad..00000000
--- a/src/iommu/axi_rab/axi4_b_buffer.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_b_buffer(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.s_axi4_bid = Signal(AXI_ID_WIDTH)  # output
-        self.s_axi4_bresp = Signal(2)  # output
-        self.s_axi4_bvalid = Signal()  # output
-        self.s_axi4_buser = Signal(AXI_USER_WIDTH)  # output
-        self.s_axi4_bready = Signal()  # input
-        self.m_axi4_bid = Signal(AXI_ID_WIDTH)  # input
-        self.m_axi4_bresp = Signal(2)  # input
-        self.m_axi4_bvalid = Signal()  # input
-        self.m_axi4_buser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_bready = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.None.eq(self.m_axi4_bresp)
-        m.d.comb += self.None.eq(self.m_axi4_bid)
-        m.d.comb += self.None.eq(self.m_axi4_buser)
-        m.d.comb += self.s_axi4_buser.eq(self.None)
-        m.d.comb += self.s_axi4_bid.eq(self.None)
-        m.d.comb += self.s_axi4_bresp.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_b_buffer
-#  #(
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_bid,
-#    output logic                [1:0] s_axi4_bresp,
-#    output logic                      s_axi4_bvalid,
-#    output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
-#    input  logic                      s_axi4_bready,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_bid,
-#    input  logic                [1:0] m_axi4_bresp,
-#    input  logic                      m_axi4_bvalid,
-#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
-#    output logic                      m_axi4_bready
-#  );
-#
-#  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_in;
-#  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_out;
-#
-#  assign data_in                                         [1:0] = m_axi4_bresp;
-#  assign data_in                            [AXI_ID_WIDTH+1:2] = m_axi4_bid;
-#  assign data_in[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2] = m_axi4_buser;
-#
-#  assign s_axi4_buser = data_out[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2];
-#  assign s_axi4_bid   = data_out[AXI_ID_WIDTH+1:2];
-#  assign s_axi4_bresp = data_out[1:0];
-#
-#  axi_buffer_rab
-#  #(
-#    .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+2 ),
-#    .BUFFER_DEPTH ( 4                             )
-#    )
-#  u_buffer
-#  (
-#    .clk      ( axi4_aclk     ),
-#    .rstn     ( axi4_arstn    ),
-#    .valid_out( s_axi4_bvalid ),
-#    .data_out ( data_out      ),
-#    .ready_in ( s_axi4_bready ),
-#    .valid_in ( m_axi4_bvalid ),
-#    .data_in  ( data_in       ),
-#    .ready_out( m_axi4_bready )
-#  );
-#
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/axi4_b_sender.py b/src/iommu/axi_rab/axi4_b_sender.py
deleted file mode 100644
index 1c61a2a5..00000000
--- a/src/iommu/axi_rab/axi4_b_sender.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_b_sender(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.drop_i = Signal()  # input
-        self.done_o = Signal()  # output
-        self.id_i = Signal(AXI_ID_WIDTH)  # input
-        self.prefetch_i = Signal()  # input
-        self.hit_i = Signal()  # input
-        self.s_axi4_bid = Signal(AXI_ID_WIDTH)  # output
-        self.s_axi4_bresp = Signal(2)  # output
-        self.s_axi4_bvalid = Signal()  # output
-        self.s_axi4_buser = Signal(AXI_USER_WIDTH)  # output
-        self.s_axi4_bready = Signal()  # input
-        self.m_axi4_bid = Signal(AXI_ID_WIDTH)  # input
-        self.m_axi4_bresp = Signal(2)  # input
-        self.m_axi4_bvalid = Signal()  # input
-        self.m_axi4_buser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_bready = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.fifo_push.eq(self.None)
-        m.d.comb += self.done_o.eq(self.fifo_push)
-        m.d.comb += self.fifo_pop.eq(self.None)
-        m.d.comb += self.s_axi4_buser.eq(self.None)
-        m.d.comb += self.s_axi4_bid.eq(self.None)
-        m.d.comb += self.s_axi4_bresp.eq(self.None)
-        m.d.comb += self.s_axi4_bvalid.eq(self.None)
-        m.d.comb += self.m_axi4_bready.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_b_sender
-#  #(
-#    parameter AXI_ID_WIDTH   = 10,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    input  logic                      drop_i,
-#    output logic                      done_o,
-#    input  logic   [AXI_ID_WIDTH-1:0] id_i,
-#    input  logic                      prefetch_i,
-#    input  logic                      hit_i,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_bid,
-#    output logic                [1:0] s_axi4_bresp,
-#    output logic                      s_axi4_bvalid,
-#    output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
-#    input  logic                      s_axi4_bready,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_bid,
-#    input  logic                [1:0] m_axi4_bresp,
-#    input  logic                      m_axi4_bvalid,
-#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
-#    output logic                      m_axi4_bready
-#  );
-#
-#  logic                    fifo_valid;
-#  logic                    fifo_pop;
-#  logic                    fifo_push;
-#  logic                    fifo_ready;
-#  logic [AXI_ID_WIDTH-1:0] id;
-#  logic                    prefetch;
-#  logic                    hit;
-#
-#  logic                    dropping;
-#
-#  axi_buffer_rab
-#    #(
-#      .DATA_WIDTH   ( 2+AXI_ID_WIDTH  ),
-#      .BUFFER_DEPTH ( 4               )
-#      )
-#    u_fifo
-#      (
-#        .clk       ( axi4_aclk                 ),
-#        .rstn      ( axi4_arstn                ),
-#        // Pop
-#        .data_out  ( {prefetch,   hit,   id}   ),
-#        .valid_out ( fifo_valid                ),
-#        .ready_in  ( fifo_pop                  ),
-#        // Push
-#        .valid_in  ( fifo_push                 ),
-#        .data_in   ( {prefetch_i, hit_i, id_i} ),
-#        .ready_out ( fifo_ready                )
-#      );
-#
-#  assign fifo_push = drop_i & fifo_ready;
-#  assign done_o    = fifo_push;
-#
-#  assign fifo_pop  = dropping & s_axi4_bready;
-#
-#  always @ (posedge axi4_aclk or negedge axi4_arstn) begin
-#    if (axi4_arstn == 1'b0) begin
-#      dropping <= 1'b0;
-#    end else begin
-#      if (fifo_valid && ~dropping)
-#        dropping <= 1'b1;
-#      else if (fifo_pop)
-#        dropping <= 1'b0;
-#    end
-#  end
-#
-#  assign s_axi4_buser  = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_buser;
-#  assign s_axi4_bid    = dropping ? id : m_axi4_bid;
-#
-#  assign s_axi4_bresp  = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, mutli, prot
-#                         (dropping & prefetch      ) ? 2'b10 : // prefetch miss
-#                         (dropping            & hit) ? 2'b10 : // non-prefetch multi, prot
-#                         (dropping                 ) ? 2'b10 : // non-prefetch miss
-#                         m_axi4_bresp;
-#
-#  assign s_axi4_bvalid =  dropping | m_axi4_bvalid;
-#  assign m_axi4_bready = ~dropping & s_axi4_bready;
-#
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/axi4_r_buffer.py b/src/iommu/axi_rab/axi4_r_buffer.py
deleted file mode 100644
index 91bdf0a5..00000000
--- a/src/iommu/axi_rab/axi4_r_buffer.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_r_buffer(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.s_axi4_rid = Signal(AXI_ID_WIDTH)  # output
-        self.s_axi4_rresp = Signal(2)  # output
-        self.s_axi4_rdata = Signal(AXI_DATA_WIDTH)  # output
-        self.s_axi4_rlast = Signal()  # output
-        self.s_axi4_rvalid = Signal()  # output
-        self.s_axi4_ruser = Signal(AXI_USER_WIDTH)  # output
-        self.s_axi4_rready = Signal()  # input
-        self.m_axi4_rid = Signal(AXI_ID_WIDTH)  # input
-        self.m_axi4_rresp = Signal(2)  # input
-        self.m_axi4_rdata = Signal(AXI_DATA_WIDTH)  # input
-        self.m_axi4_rlast = Signal()  # input
-        self.m_axi4_rvalid = Signal()  # input
-        self.m_axi4_ruser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_rready = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.None.eq(self.m_axi4_rresp)
-        m.d.comb += self.None.eq(self.m_axi4_rlast)
-        m.d.comb += self.None.eq(self.m_axi4_rid)
-        m.d.comb += self.None.eq(self.m_axi4_rdata)
-        m.d.comb += self.None.eq(self.m_axi4_ruser)
-        m.d.comb += self.s_axi4_rresp.eq(self.None)
-        m.d.comb += self.s_axi4_rlast.eq(self.None)
-        m.d.comb += self.s_axi4_rid.eq(self.None)
-        m.d.comb += self.s_axi4_rdata.eq(self.None)
-        m.d.comb += self.s_axi4_ruser.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_r_buffer
-#  #(
-#    parameter AXI_DATA_WIDTH = 32,
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_rid,
-#    output logic                [1:0] s_axi4_rresp,
-#    output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
-#    output logic                      s_axi4_rlast,
-#    output logic                      s_axi4_rvalid,
-#    output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
-#    input  logic                      s_axi4_rready,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_rid,
-#    input  logic                [1:0] m_axi4_rresp,
-#    input  logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
-#    input  logic                      m_axi4_rlast,
-#    input  logic                      m_axi4_rvalid,
-#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
-#    output logic                      m_axi4_rready
-#  );
-#
-#  wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_in;
-#  wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_out;
-#
-#  localparam ID_START   = 3;
-#  localparam ID_END     = AXI_ID_WIDTH-1 + ID_START;
-#  localparam DATA_START = ID_END + 1;
-#  localparam DATA_END   = AXI_DATA_WIDTH-1 + DATA_START;
-#  localparam USER_START = DATA_END + 1;
-#  localparam USER_END   = AXI_USER_WIDTH-1 + USER_START;
-#
-#  assign data_in                [1:0] = m_axi4_rresp;
-#  assign data_in                  [2] = m_axi4_rlast;
-#  assign data_in    [ID_END:ID_START] = m_axi4_rid;
-#  assign data_in[DATA_END:DATA_START] = m_axi4_rdata;
-#  assign data_in[USER_END:USER_START] = m_axi4_ruser;
-#
-#  assign s_axi4_rresp  = data_out                [1:0];
-#  assign s_axi4_rlast  = data_out                  [2];
-#  assign s_axi4_rid    = data_out    [ID_END:ID_START];
-#  assign s_axi4_rdata  = data_out[DATA_END:DATA_START];
-#  assign s_axi4_ruser  = data_out[USER_END:USER_START];
-#
-#  axi_buffer_rab
-#  #(
-#    .DATA_WIDTH   ( AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3  ),
-#    .BUFFER_DEPTH ( 4                                             )
-#    )
-#  u_buffer
-#  (
-#    .clk       ( axi4_aclk     ),
-#    .rstn      ( axi4_arstn    ),
-#    // Pop
-#    .valid_out ( s_axi4_rvalid ),
-#    .data_out  ( data_out      ),
-#    .ready_in  ( s_axi4_rready ),
-#    // Push
-#    .valid_in  ( m_axi4_rvalid ),
-#    .data_in   ( data_in       ),
-#    .ready_out ( m_axi4_rready )
-#  );
-#
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/axi4_r_sender.py b/src/iommu/axi_rab/axi4_r_sender.py
deleted file mode 100644
index d4e22bb2..00000000
--- a/src/iommu/axi_rab/axi4_r_sender.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_r_sender(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.drop_i = Signal()  # input
-        self.drop_len_i = Signal(8)  # input
-        self.done_o = Signal()  # output
-        self.id_i = Signal(AXI_ID_WIDTH)  # input
-        self.prefetch_i = Signal()  # input
-        self.hit_i = Signal()  # input
-        self.s_axi4_rid = Signal(AXI_ID_WIDTH)  # output
-        self.s_axi4_rresp = Signal(2)  # output
-        self.s_axi4_rdata = Signal(AXI_DATA_WIDTH)  # output
-        self.s_axi4_rlast = Signal()  # output
-        self.s_axi4_rvalid = Signal()  # output
-        self.s_axi4_ruser = Signal(AXI_USER_WIDTH)  # output
-        self.s_axi4_rready = Signal()  # input
-        self.m_axi4_rid = Signal(AXI_ID_WIDTH)  # input
-        self.m_axi4_rresp = Signal(2)  # input
-        self.m_axi4_rdata = Signal(AXI_DATA_WIDTH)  # input
-        self.m_axi4_rlast = Signal()  # input
-        self.m_axi4_rvalid = Signal()  # input
-        self.m_axi4_ruser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_rready = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.fifo_push.eq(self.None)
-        m.d.comb += self.done_o.eq(self.fifo_push)
-        m.d.comb += self.s_axi4_rdata.eq(self.m_axi4_rdata)
-        m.d.comb += self.s_axi4_ruser.eq(self.None)
-        m.d.comb += self.s_axi4_rid.eq(self.None)
-        m.d.comb += self.s_axi4_rresp.eq(self.None)
-        m.d.comb += self.s_axi4_rvalid.eq(self.None)
-        m.d.comb += self.m_axi4_rready.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //import CfMath::log2;
-#
-# module axi4_r_sender
-#  #(
-#    parameter AXI_DATA_WIDTH = 32,
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    input  logic                      drop_i,
-#    input  logic                [7:0] drop_len_i,
-#    output logic                      done_o,
-#    input  logic   [AXI_ID_WIDTH-1:0] id_i,
-#    input  logic                      prefetch_i,
-#    input  logic                      hit_i,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_rid,
-#    output logic                [1:0] s_axi4_rresp,
-#    output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
-#    output logic                      s_axi4_rlast,
-#    output logic                      s_axi4_rvalid,
-#    output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
-#    input  logic                      s_axi4_rready,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_rid,
-#    input  logic                [1:0] m_axi4_rresp,
-#    input  logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
-#    input  logic                      m_axi4_rlast,
-#    input  logic                      m_axi4_rvalid,
-#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
-#    output logic                      m_axi4_rready
-#  );
-#
-#  localparam BUFFER_DEPTH = 16;
-#
-#  logic                    fifo_valid;
-#  logic                    fifo_pop;
-#  logic                    fifo_push;
-#  logic                    fifo_ready;
-#  logic [AXI_ID_WIDTH-1:0] id;
-#  logic              [7:0] len;
-#  logic                    prefetch;
-#  logic                    hit;
-#
-#  logic                    dropping;
-#
-#  enum logic [1:0]  { FORWARDING, DROPPING }
-#                            state_d,                state_q;
-#  logic                     burst_ongoing_d,        burst_ongoing_q;
-#  logic [7:0]               drop_cnt_d,             drop_cnt_q;
-#
-#  axi_buffer_rab
-#    #(
-#      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8  ),
-#      .BUFFER_DEPTH     ( BUFFER_DEPTH      )
-#      )
-#    u_fifo
-#      (
-#        .clk       ( axi4_aclk                              ),
-#        .rstn      ( axi4_arstn                             ),
-#        // Pop
-#        .data_out  ( {prefetch,   hit,   id,   len}         ),
-#        .valid_out ( fifo_valid                             ),
-#        .ready_in  ( fifo_pop                               ),
-#        // Push
-#        .valid_in  ( fifo_push                              ),
-#        .data_in   ( {prefetch_i, hit_i, id_i, drop_len_i}  ),
-#        .ready_out ( fifo_ready                             )
-#      );
-#
-#  assign fifo_push = drop_i & fifo_ready;
-#  assign done_o    = fifo_push;
-#
-#  always_comb begin
-#    burst_ongoing_d = burst_ongoing_q;
-#    drop_cnt_d      = drop_cnt_q;
-#    dropping        = 1'b0;
-#    s_axi4_rlast    = 1'b0;
-#    fifo_pop        = 1'b0;
-#    state_d         = state_q;
-#
-#    case (state_q)
-#      FORWARDING: begin
-#        s_axi4_rlast = m_axi4_rlast;
-#        // Remember whether there is currently a burst ongoing.
-#        if (m_axi4_rvalid && m_axi4_rready) begin
-#          if (m_axi4_rlast) begin
-#            burst_ongoing_d = 1'b0;
-#          end else begin
-#            burst_ongoing_d = 1'b1;
-#          end
-#        end
-#        // If there is no burst ongoing and the FIFO has a drop request ready, process it.
-#        if (!burst_ongoing_d && fifo_valid) begin
-#          drop_cnt_d  = len;
-#          state_d     = DROPPING;
-#        end
-#      end
-#
-#      DROPPING: begin
-#        dropping      = 1'b1;
-#        s_axi4_rlast  = (drop_cnt_q == '0);
-#        // Handshake on slave interface
-#        if (s_axi4_rready) begin
-#          drop_cnt_d -= 1;
-#          if (drop_cnt_q == '0) begin
-#            drop_cnt_d  = '0;
-#            fifo_pop    = 1'b1;
-#            state_d     = FORWARDING;
-#          end
-#        end
-#      end
-#
-#      default: begin
-#        state_d = FORWARDING;
-#      end
-#    endcase
-#  end
-#
-#  assign s_axi4_rdata  = m_axi4_rdata;
-#
-#  assign s_axi4_ruser  = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_ruser;
-#  assign s_axi4_rid    = dropping ? id : m_axi4_rid;
-#
-#  assign s_axi4_rresp  = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, mutli, prot
-#                         (dropping & prefetch      ) ? 2'b10 : // prefetch miss
-#                         (dropping            & hit) ? 2'b10 : // non-prefetch multi, prot
-#                         (dropping                 ) ? 2'b10 : // non-prefetch miss
-#                         m_axi4_rresp;
-#
-#  assign s_axi4_rvalid =  dropping | m_axi4_rvalid;
-#  assign m_axi4_rready = ~dropping & s_axi4_rready;
-#
-#  always_ff @(posedge axi4_aclk, negedge axi4_arstn) begin
-#    if (axi4_arstn == 1'b0) begin
-#      burst_ongoing_q <= 1'b0;
-#      drop_cnt_q      <=  'b0;
-#      state_q         <= FORWARDING;
-#    end else begin
-#      burst_ongoing_q <= burst_ongoing_d;
-#      drop_cnt_q      <= drop_cnt_d;
-#      state_q         <= state_d;
-#    end
-#  end
-#
-# endmodule
-#
-#
-#
-#
diff --git a/src/iommu/axi_rab/axi4_w_buffer.py b/src/iommu/axi_rab/axi4_w_buffer.py
deleted file mode 100644
index aa06dc22..00000000
--- a/src/iommu/axi_rab/axi4_w_buffer.py
+++ /dev/null
@@ -1,777 +0,0 @@
-# this file has been generated by sv2nmigen
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_w_buffer(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.l1_done_o = Signal()  # output
-        self.l1_accept_i = Signal()  # input
-        self.l1_save_i = Signal()  # input
-        self.l1_drop_i = Signal()  # input
-        self.l1_master_i = Signal()  # input
-        self.l1_id_i = Signal(AXI_ID_WIDTH)  # input
-        self.l1_len_i = Signal(8)  # input
-        self.l1_prefetch_i = Signal()  # input
-        self.l1_hit_i = Signal()  # input
-        self.l2_done_o = Signal()  # output
-        self.l2_accept_i = Signal()  # input
-        self.l2_drop_i = Signal()  # input
-        self.l2_master_i = Signal()  # input
-        self.l2_id_i = Signal(AXI_ID_WIDTH)  # input
-        self.l2_len_i = Signal(8)  # input
-        self.l2_prefetch_i = Signal()  # input
-        self.l2_hit_i = Signal()  # input
-        self.master_select_o = Signal()  # output
-        self.input_stall_o = Signal()  # output
-        self.output_stall_o = Signal()  # output
-        self.b_drop_o = Signal()  # output
-        self.b_done_i = Signal()  # input
-        self.id_o = Signal(AXI_ID_WIDTH)  # output
-        self.prefetch_o = Signal()  # output
-        self.hit_o = Signal()  # output
-        self.s_axi4_wdata = Signal(AXI_DATA_WIDTH)  # input
-        self.s_axi4_wvalid = Signal()  # input
-        self.s_axi4_wready = Signal()  # output
-        self.s_axi4_wstrb = Signal(1+ERROR p_expression_25)  # input
-        self.s_axi4_wlast = Signal()  # input
-        self.s_axi4_wuser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_wdata = Signal(AXI_DATA_WIDTH)  # output
-        self.m_axi4_wvalid = Signal()  # output
-        self.m_axi4_wready = Signal()  # input
-        self.m_axi4_wstrb = Signal(1+ERROR p_expression_25)  # output
-        self.m_axi4_wlast = Signal()  # output
-        self.m_axi4_wuser = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-#
-# //import CfMath::log2;
-#
-# module axi4_w_buffer
-#  #(
-#    parameter AXI_DATA_WIDTH   = 32,
-#    parameter AXI_ID_WIDTH     = 4,
-#    parameter AXI_USER_WIDTH   = 4,
-#    parameter ENABLE_L2TLB     = 0,
-#    parameter HUM_BUFFER_DEPTH = 16
-#  )
-#  (
-#    input  logic                        axi4_aclk,
-#    input  logic                        axi4_arstn,
-#
-#    // L1 & L2 interfaces
-#    output logic                        l1_done_o,
-#    input  logic                        l1_accept_i,
-#    input  logic                        l1_save_i,
-#    input  logic                        l1_drop_i,
-#    input  logic                        l1_master_i,
-#    input  logic     [AXI_ID_WIDTH-1:0] l1_id_i,
-#    input  logic                  [7:0] l1_len_i,
-#    input  logic                        l1_prefetch_i,
-#    input  logic                        l1_hit_i,
-#
-#    output logic                        l2_done_o,
-#    input  logic                        l2_accept_i,
-#    input  logic                        l2_drop_i,
-#    input  logic                        l2_master_i,
-#    input  logic     [AXI_ID_WIDTH-1:0] l2_id_i,
-#    input  logic                  [7:0] l2_len_i,
-#    input  logic                        l2_prefetch_i,
-#    input  logic                        l2_hit_i,
-#
-#    output logic                        master_select_o,
-#    output logic                        input_stall_o,
-#    output logic                        output_stall_o,
-#
-#    // B sender interface
-#    output logic                        b_drop_o,
-#    input  logic                        b_done_i,
-#    output logic     [AXI_ID_WIDTH-1:0] id_o,
-#    output logic                        prefetch_o,
-#    output logic                        hit_o,
-#
-#    // AXI W channel interfaces
-#    input  logic   [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
-#    input  logic                        s_axi4_wvalid,
-#    output logic                        s_axi4_wready,
-#    input  logic [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
-#    input  logic                        s_axi4_wlast,
-#    input  logic   [AXI_USER_WIDTH-1:0] s_axi4_wuser,
-#
-#    output logic   [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
-#    output logic                        m_axi4_wvalid,
-#    input  logic                        m_axi4_wready,
-#    output logic [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
-#    output logic                        m_axi4_wlast,
-#    output logic   [AXI_USER_WIDTH-1:0] m_axi4_wuser
-#  );
-#
-"""
-
-  localparam BUFFER_WIDTH  = AXI_DATA_WIDTH+AXI_USER_WIDTH+AXI_DATA_WIDTH/8+1;
-
-  localparam INPUT_BUFFER_DEPTH = 4;
-  localparam L1_FIFO_DEPTH      = 8;
-  localparam L2_FIFO_DEPTH      = 4;
-
-  logic      [AXI_DATA_WIDTH-1:0] axi4_wdata;
-  logic                           axi4_wvalid;
-  logic                           axi4_wready;
-  logic    [AXI_DATA_WIDTH/8-1:0] axi4_wstrb;
-  logic                           axi4_wlast;
-  logic      [AXI_USER_WIDTH-1:0] axi4_wuser;
-
-  logic                           l1_fifo_valid_out;
-  logic                           l1_fifo_ready_in;
-  logic                           l1_fifo_valid_in;
-  logic                           l1_fifo_ready_out;
-
-  logic                           l1_req;
-  logic                           l1_accept_cur, l1_save_cur, l1_drop_cur;
-  logic                           l1_master_cur;
-  logic        [AXI_ID_WIDTH-1:0] l1_id_cur;
-  logic                     [7:0] l1_len_cur;
-  logic                           l1_hit_cur, l1_prefetch_cur;
-  logic                           l1_save_in, l1_save_out;
-  logic [log2(L1_FIFO_DEPTH)-1:0] n_l1_save_SP;
-
-  logic                           l2_fifo_valid_out;
-  logic                           l2_fifo_ready_in;
-  logic                           l2_fifo_valid_in;
-  logic                           l2_fifo_ready_out;
-
-  logic                           l2_req;
-  logic                           l2_accept_cur, l2_drop_cur;
-  logic                           l2_master_cur;
-  logic        [AXI_ID_WIDTH-1:0] l2_id_cur;
-  logic                     [7:0] l2_len_cur;
-  logic                           l2_hit_cur, l2_prefetch_cur;
-
-  logic                           fifo_select, fifo_select_SN, fifo_select_SP;
-  logic                           w_done;
-  logic                           b_drop_set;
-
-  // HUM buffer signals
-  logic                           hum_buf_ready_out;
-  logic                           hum_buf_valid_in;
-  logic                           hum_buf_ready_in;
-  logic                           hum_buf_valid_out;
-  logic                           hum_buf_underfull;
-
-  logic      [AXI_DATA_WIDTH-1:0] hum_buf_wdata;
-  logic    [AXI_DATA_WIDTH/8-1:0] hum_buf_wstrb;
-  logic                           hum_buf_wlast;
-  logic      [AXI_USER_WIDTH-1:0] hum_buf_wuser;
-
-  logic                           hum_buf_drop_req_SN, hum_buf_drop_req_SP;
-  logic                     [7:0] hum_buf_drop_len_SN, hum_buf_drop_len_SP;
-  logic                           hum_buf_almost_full;
-
-  logic                           stop_store;
-  logic                           wlast_in, wlast_out;
-  logic signed              [3:0] n_wlast_SN,          n_wlast_SP;
-  logic                           block_forwarding;
-
-  // Search FSM
-  typedef enum logic        [3:0] {STORE,                       BYPASS,
-                                   WAIT_L1_BYPASS_YES,          WAIT_L2_BYPASS_YES,
-                                   WAIT_L1_BYPASS_NO,           WAIT_L2_BYPASS_NO,
-                                   FLUSH,                       DISCARD,
-                                   DISCARD_FINISH}
-                                  hum_buf_state_t;
-  hum_buf_state_t                 hum_buf_SP; // Present state
-  hum_buf_state_tbg                 hum_buf_SN; // Next State
-
-  axi_buffer_rab
-    #(
-      .DATA_WIDTH       ( BUFFER_WIDTH        ),
-      .BUFFER_DEPTH     ( INPUT_BUFFER_DEPTH  )
-      )
-    u_input_buf
-    (
-      .clk       ( axi4_aclk                                                ),
-      .rstn      ( axi4_arstn                                               ),
-      // Push
-      .data_in   ( {s_axi4_wuser, s_axi4_wstrb, s_axi4_wdata, s_axi4_wlast} ),
-      .valid_in  ( s_axi4_wvalid                                            ),
-      .ready_out ( s_axi4_wready                                            ),
-      // Pop
-      .data_out  ( {axi4_wuser,   axi4_wstrb,   axi4_wdata,   axi4_wlast}   ),
-      .valid_out ( axi4_wvalid                                              ),
-      .ready_in  ( axi4_wready                                              )
-    );
-
-  axi_buffer_rab
-    #(
-      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8+4  ),
-      .BUFFER_DEPTH     ( L1_FIFO_DEPTH       )
-      )
-    u_l1_fifo
-    (
-      .clk       ( axi4_aclk                                                                                                    ),
-      .rstn      ( axi4_arstn                                                                                                   ),
-      // Push
-      .data_in   ( {l1_prefetch_i,   l1_hit_i,   l1_id_i,   l1_len_i,   l1_master_i,   l1_accept_i,   l1_save_i,   l1_drop_i}   ),
-      .valid_in  ( l1_fifo_valid_in                                                                                             ),
-      .ready_out ( l1_fifo_ready_out                                                                                            ),
-      // Pop
-      .data_out  ( {l1_prefetch_cur, l1_hit_cur, l1_id_cur, l1_len_cur, l1_master_cur, l1_accept_cur, l1_save_cur, l1_drop_cur} ),
-      .valid_out ( l1_fifo_valid_out                                                                                            ),
-      .ready_in  ( l1_fifo_ready_in                                                                                             )
-    );
-
-    // Push upon receiving new requests from the TLB.
-    assign l1_req           = l1_accept_i | l1_save_i | l1_drop_i;
-    assign l1_fifo_valid_in = l1_req & l1_fifo_ready_out;
-
-    // Signal handshake
-    assign l1_done_o  = l1_fifo_valid_in;
-    assign l2_done_o  = l2_fifo_valid_in;
-
-    // Stall AW input of L1 TLB
-    assign input_stall_o = ~(l1_fifo_ready_out & l2_fifo_ready_out);
-
-    // Interface b_drop signals + handshake
-    always_comb begin
-      if (fifo_select == 1'b0) begin
-        prefetch_o       = l1_prefetch_cur;
-        hit_o            = l1_hit_cur;
-        id_o             = l1_id_cur;
-
-        l1_fifo_ready_in = w_done | b_done_i;
-        l2_fifo_ready_in = 1'b0;
-      end else begin
-        prefetch_o       = l2_prefetch_cur;
-        hit_o            = l2_hit_cur;
-        id_o             = l2_id_cur;
-
-        l1_fifo_ready_in = 1'b0;
-        l2_fifo_ready_in = w_done | b_done_i;
-      end
-    end
-
-    // Detect when an L1 transaction save request enters or exits the L1 FIFO.
-    assign l1_save_in  = l1_fifo_valid_in & l1_save_i;
-    assign l1_save_out = l1_fifo_ready_in & l1_save_cur;
-
-    // Count the number of L1 transaction to save in the L1 FIFO.
-    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
-      if (axi4_arstn == 0) begin
-        n_l1_save_SP <= '0;
-      end else if (l1_save_in ^ l1_save_out) begin
-        if (l1_save_in) begin
-          n_l1_save_SP <= n_l1_save_SP + 1'b1;
-        end else if (l1_save_out) begin
-          n_l1_save_SP <= n_l1_save_SP - 1'b1;
-        end
-      end
-    end
-
-    // Stall forwarding of AW L1 hits if:
-    // 1. The HUM buffer does not allow to be bypassed.
-    // 2. There are multiple L1 save requests in the FIFO, i.e., multiple L2 outputs pending.
-    assign output_stall_o = (n_l1_save_SP > 1) || (block_forwarding == 1'b1);
-
-  generate
-  if (ENABLE_L2TLB == 1) begin : HUM_BUFFER
-
-    axi_buffer_rab_bram
-    #(
-      .DATA_WIDTH       ( BUFFER_WIDTH      ),
-      .BUFFER_DEPTH     ( HUM_BUFFER_DEPTH  )
-      )
-    u_hum_buf
-    (
-      .clk           ( axi4_aclk                                                    ),
-      .rstn          ( axi4_arstn                                                   ),
-      // Push
-      .data_in       ( {axi4_wuser,    axi4_wstrb,    axi4_wdata,    axi4_wlast}    ),
-      .valid_in      ( hum_buf_valid_in                                             ),
-      .ready_out     ( hum_buf_ready_out                                            ),
-      // Pop
-      .data_out      ( {hum_buf_wuser, hum_buf_wstrb, hum_buf_wdata, hum_buf_wlast} ),
-      .valid_out     ( hum_buf_valid_out                                            ),
-      .ready_in      ( hum_buf_ready_in                                             ),
-      // Clear
-      .almost_full   ( hum_buf_almost_full                                          ),
-      .underfull     ( hum_buf_underfull                                            ),
-      .drop_req      ( hum_buf_drop_req_SP                                          ),
-      .drop_len      ( hum_buf_drop_len_SP                                          )
-    );
-
-    axi_buffer_rab
-    #(
-      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8+3  ),
-      .BUFFER_DEPTH     ( L2_FIFO_DEPTH       )
-      )
-    u_l2_fifo
-    (
-      .clk       ( axi4_aclk                                                                                        ),
-      .rstn      ( axi4_arstn                                                                                       ),
-      // Push
-      .data_in   ( {l2_prefetch_i,   l2_hit_i,   l2_id_i,   l2_len_i,   l2_master_i,   l2_accept_i,   l2_drop_i}    ),
-      .valid_in  ( l2_fifo_valid_in                                                                                 ),
-      .ready_out ( l2_fifo_ready_out                                                                                ),
-      // Pop
-      .data_out  ( {l2_prefetch_cur, l2_hit_cur, l2_id_cur, l2_len_cur, l2_master_cur, l2_accept_cur, l2_drop_cur}  ),
-      .valid_out ( l2_fifo_valid_out                                                                                ),
-      .ready_in  ( l2_fifo_ready_in                                                                                 )
-    );
-
-    // Push upon receiving new result from TLB.
-    assign l2_req           = l2_accept_i | l2_drop_i;
-    assign l2_fifo_valid_in = l2_req & l2_fifo_ready_out;
-
-    assign wlast_in  =    axi4_wlast & hum_buf_valid_in  & hum_buf_ready_out;
-    assign wlast_out = hum_buf_wlast & hum_buf_valid_out & hum_buf_ready_in;
-
-    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
-      if (axi4_arstn == 0) begin
-        fifo_select_SP      <= 1'b0;
-        hum_buf_drop_len_SP <=  'b0;
-        hum_buf_drop_req_SP <= 1'b0;
-        hum_buf_SP          <= STORE;
-        n_wlast_SP          <=  'b0;
-      end else begin
-        fifo_select_SP      <= fifo_select_SN;
-        hum_buf_drop_len_SP <= hum_buf_drop_len_SN;
-        hum_buf_drop_req_SP <= hum_buf_drop_req_SN;
-        hum_buf_SP          <= hum_buf_SN;
-        n_wlast_SP          <= n_wlast_SN;
-      end
-    end
-
-    always_comb begin
-      n_wlast_SN = n_wlast_SP;
-      if (hum_buf_drop_req_SP) begin  // Happens exactly once per burst to be dropped.
-        n_wlast_SN -= 1;
-      end
-      if (wlast_in) begin
-        n_wlast_SN += 1;
-      end
-      if (wlast_out) begin
-        n_wlast_SN -= 1;
-      end
-    end
-
-    always_comb begin : HUM_BUFFER_FSM
-      hum_buf_SN       = hum_buf_SP;
-
-      m_axi4_wlast     = 1'b0;
-      m_axi4_wdata     =  'b0;
-      m_axi4_wstrb     =  'b0;
-      m_axi4_wuser     =  'b0;
-
-      m_axi4_wvalid    = 1'b0;
-      axi4_wready      = 1'b0;
-
-      hum_buf_valid_in = 1'b0;
-      hum_buf_ready_in = 1'b0;
-
-      hum_buf_drop_req_SN = hum_buf_drop_req_SP;
-      hum_buf_drop_len_SN = hum_buf_drop_len_SP;
-      master_select_o  = 1'b0;
-
-      w_done           = 1'b0; // read from FIFO without handshake with B sender
-      b_drop_o         = 1'b0; // send data from FIFO to B sender (with handshake)
-      fifo_select      = 1'b0;
-
-      fifo_select_SN   = fifo_select_SP;
-      stop_store       = 1'b0;
-
-      block_forwarding = 1'b0;
-
-      unique case (hum_buf_SP)
-
-        STORE : begin
-          // Simply store the data in the buffer.
-          hum_buf_valid_in = axi4_wvalid & hum_buf_ready_out;
-          axi4_wready      = hum_buf_ready_out;
-
-          // We have got a full burst in the HUM buffer, thus stop storing.
-          if (wlast_in & !hum_buf_underfull | (n_wlast_SP > $signed(0))) begin
-            hum_buf_SN = WAIT_L1_BYPASS_YES;
-
-          // The buffer is full, thus wait for decision.
-          end else if (~hum_buf_ready_out) begin
-            hum_buf_SN = WAIT_L1_BYPASS_NO;
-          end
-
-          // Avoid the forwarding of L1 hits until we know whether we can bypass.
-          if (l1_fifo_valid_out & l1_save_cur) begin
-            block_forwarding = 1'b1;
-          end
-        end
-
-        WAIT_L1_BYPASS_YES : begin
-          // Wait for orders from L1 TLB.
-          if (l1_fifo_valid_out) begin
-
-            // L1 hit - forward data from buffer
-            if (l1_accept_cur) begin
-              m_axi4_wlast       = hum_buf_wlast;
-              m_axi4_wdata       = hum_buf_wdata;
-              m_axi4_wstrb       = hum_buf_wstrb;
-              m_axi4_wuser       = hum_buf_wuser;
-
-              m_axi4_wvalid      = hum_buf_valid_out;
-              hum_buf_ready_in   = m_axi4_wready;
-
-              master_select_o    = l1_master_cur;
-
-              // Detect last data beat.
-              if (wlast_out) begin
-                fifo_select      = 1'b0;
-                w_done           = 1'b1;
-                hum_buf_SN       = STORE;
-              end
-
-            // L1 miss - wait for L2
-            end else if (l1_save_cur) begin
-              fifo_select        = 1'b0;
-              w_done             = 1'b1;
-              hum_buf_SN         = WAIT_L2_BYPASS_YES;
-
-            // L1 prefetch, prot, multi - drop data
-            end else if (l1_drop_cur) begin
-              fifo_select_SN      = 1'b0; // L1
-              hum_buf_drop_req_SN = 1'b1;
-              hum_buf_drop_len_SN = l1_len_cur;
-              hum_buf_SN          = FLUSH;
-            end
-          end
-        end
-
-        WAIT_L2_BYPASS_YES : begin
-          // Wait for orders from L2 TLB.
-          if (l2_fifo_valid_out) begin
-
-            // L2 hit - forward data from buffer
-            if (l2_accept_cur) begin
-              m_axi4_wlast       = hum_buf_wlast;
-              m_axi4_wdata       = hum_buf_wdata;
-              m_axi4_wstrb       = hum_buf_wstrb;
-              m_axi4_wuser       = hum_buf_wuser;
-
-              m_axi4_wvalid      = hum_buf_valid_out;
-              hum_buf_ready_in   = m_axi4_wready;
-
-              master_select_o    = l2_master_cur;
-
-              // Detect last data beat.
-              if (wlast_out) begin
-                fifo_select      = 1'b1;
-                w_done           = 1'b1;
-                hum_buf_SN       = STORE;
-              end
-
-            // L2 miss/prefetch hit
-            end else if (l2_drop_cur) begin
-              fifo_select_SN      = 1'b1; // L2
-              hum_buf_drop_req_SN = 1'b1;
-              hum_buf_drop_len_SN = l2_len_cur;
-              hum_buf_SN          = FLUSH;
-            end
-
-          // While we wait for orders from L2 TLB, we can still drop and accept L1 transactions.
-          end else if (l1_fifo_valid_out) begin
-
-            // L1 hit
-            if (l1_accept_cur) begin
-              hum_buf_SN         = BYPASS;
-
-            // L1 prefetch/prot/multi
-            end else if (l1_drop_cur) begin
-              hum_buf_SN         = DISCARD;
-            end
-          end
-        end
-
-        FLUSH : begin
-          // Clear HUM buffer flush request.
-          hum_buf_drop_req_SN = 1'b0;
-
-          // perform handshake with B sender
-          fifo_select      = fifo_select_SP;
-          b_drop_o         = 1'b1;
-          if (b_done_i) begin
-            hum_buf_SN     = STORE;
-          end
-        end
-
-        BYPASS : begin
-          // Forward one full transaction from input buffer.
-          m_axi4_wlast       = axi4_wlast;
-          m_axi4_wdata       = axi4_wdata;
-          m_axi4_wstrb       = axi4_wstrb;
-          m_axi4_wuser       = axi4_wuser;
-
-          m_axi4_wvalid      = axi4_wvalid;
-          axi4_wready        = m_axi4_wready;
-
-          master_select_o    = l1_master_cur;
-
-          // We have got a full transaction.
-          if (axi4_wlast & axi4_wready & axi4_wvalid) begin
-            fifo_select      = 1'b0;
-            w_done           = 1'b1;
-            hum_buf_SN       = WAIT_L2_BYPASS_YES;
-          end
-        end
-
-        DISCARD : begin
-          // Discard one full transaction from input buffer.
-          axi4_wready        = 1'b1;
-
-          // We have got a full transaction.
-          if (axi4_wlast & axi4_wready & axi4_wvalid) begin
-            // Try to perform handshake with B sender.
-            fifo_select      = 1'b0;
-            b_drop_o         = 1'b1;
-            // We cannot wait here due to axi4_wready.
-            if (b_done_i) begin
-              hum_buf_SN     = WAIT_L2_BYPASS_YES;
-            end else begin
-              hum_buf_SN     = DISCARD_FINISH;
-            end
-          end
-        end
-
-        DISCARD_FINISH : begin
-          // Perform handshake with B sender.
-          fifo_select      = 1'b0;
-          b_drop_o         = 1'b1;
-          if (b_done_i) begin
-            hum_buf_SN     = WAIT_L2_BYPASS_YES;
-          end
-        end
-
-        WAIT_L1_BYPASS_NO : begin
-          // Do not allow the forwarding of L1 hits.
-          block_forwarding       = 1'b1;
-
-          // Wait for orders from L1 TLB.
-          if (l1_fifo_valid_out) begin
-
-            // L1 hit - forward data from/through HUM buffer and refill the buffer
-            if (l1_accept_cur) begin
-              // Forward data from HUM buffer.
-              m_axi4_wlast       = hum_buf_wlast;
-              m_axi4_wdata       = hum_buf_wdata;
-              m_axi4_wstrb       = hum_buf_wstrb;
-              m_axi4_wuser       = hum_buf_wuser;
-
-              m_axi4_wvalid      = hum_buf_valid_out;
-              hum_buf_ready_in   = m_axi4_wready;
-
-              master_select_o    = l1_master_cur;
-
-              // Refill the HUM buffer. Stop when buffer full.
-              stop_store         = ~hum_buf_ready_out;
-              hum_buf_valid_in   = stop_store ? 1'b0 : axi4_wvalid      ;
-              axi4_wready        = stop_store ? 1'b0 : hum_buf_ready_out;
-
-              // Detect last data beat.
-              if (wlast_out) begin
-                fifo_select      = 1'b0;
-                w_done           = 1'b1;
-                if (~hum_buf_ready_out | hum_buf_almost_full) begin
-                  hum_buf_SN     = WAIT_L1_BYPASS_NO;
-                end else begin
-                  hum_buf_SN     = STORE;
-                end
-              end
-
-              // Allow the forwarding of L1 hits.
-              block_forwarding   = 1'b0;
-
-            // L1 miss - wait for L2
-            end else if (l1_save_cur) begin
-              fifo_select        = 1'b0;
-              w_done             = 1'b1;
-              hum_buf_SN         = WAIT_L2_BYPASS_NO;
-
-            // L1 prefetch, prot, multi - drop data
-            end else if (l1_drop_cur) begin
-              fifo_select_SN      = 1'b0; // L1
-              hum_buf_drop_req_SN = 1'b1;
-              hum_buf_drop_len_SN = l1_len_cur;
-              hum_buf_SN          = FLUSH;
-
-              // Allow the forwarding of L1 hits.
-              block_forwarding   = 1'b0;
-            end
-          end
-        end
-
-        WAIT_L2_BYPASS_NO : begin
-          // Do not allow the forwarding of L1 hits.
-          block_forwarding       = 1'b1;
-
-          // Wait for orders from L2 TLB.
-          if (l2_fifo_valid_out) begin
-
-            // L2 hit - forward first part from HUM buffer, rest from input buffer
-            if (l2_accept_cur) begin
-              // Forward data from HUM buffer.
-              m_axi4_wlast       = hum_buf_wlast;
-              m_axi4_wdata       = hum_buf_wdata;
-              m_axi4_wstrb       = hum_buf_wstrb;
-              m_axi4_wuser       = hum_buf_wuser;
-
-              m_axi4_wvalid      = hum_buf_valid_out;
-              hum_buf_ready_in   = m_axi4_wready;
-
-              master_select_o    = l2_master_cur;
-
-              // Refill the HUM buffer. Stop when buffer full.
-              stop_store         = ~hum_buf_ready_out;
-              hum_buf_valid_in   = stop_store ? 1'b0 : axi4_wvalid      ;
-              axi4_wready        = stop_store ? 1'b0 : hum_buf_ready_out;
-
-              // Detect last data beat.
-              if (wlast_out) begin
-                fifo_select      = 1'b1;
-                w_done           = 1'b1;
-                if (~hum_buf_ready_out | hum_buf_almost_full) begin
-                  hum_buf_SN     = WAIT_L1_BYPASS_NO;
-                end else begin
-                  hum_buf_SN     = STORE;
-                end
-              end
-
-              // Allow the forwarding of L1 hits.
-              block_forwarding   = 1'b0;
-
-            // L2 miss/prefetch hit - drop data
-            end else if (l2_drop_cur) begin
-              fifo_select_SN      = 1'b1; // L2
-              hum_buf_drop_req_SN = 1'b1;
-              hum_buf_drop_len_SN = l2_len_cur;
-              hum_buf_SN          = FLUSH;
-
-              // Allow the forwarding of L1 hits.
-              block_forwarding   = 1'b0;
-            end
-          end
-        end
-
-
-        default: begin
-          hum_buf_SN = STORE;
-        end
-
-      endcase // hum_buf_SP
-    end // HUM_BUFFER_FSM
-
-    assign b_drop_set = 1'b0;
-
-  end else begin // HUM_BUFFER
-
-    // register to perform the handshake with B sender
-    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
-      if (axi4_arstn == 0) begin
-        b_drop_o <= 1'b0;
-      end else if (b_done_i) begin
-        b_drop_o <= 1'b0;
-      end else if (b_drop_set) begin
-        b_drop_o <= 1'b1;;
-      end
-    end
-
-    always_comb begin : OUTPUT_CTRL
-
-      fifo_select   = 1'b0;
-      w_done        = 1'b0;
-      b_drop_set    = 1'b0;
-
-      m_axi4_wlast  = 1'b0;
-      m_axi4_wdata  =  'b0;
-      m_axi4_wstrb  =  'b0;
-      m_axi4_wuser  =  'b0;
-
-      m_axi4_wvalid = 1'b0;
-      axi4_wready   = 1'b0;
-
-      if (l1_fifo_valid_out) begin
-        // forward data
-        if (l1_accept_cur) begin
-          m_axi4_wlast  = axi4_wlast;
-          m_axi4_wdata  = axi4_wdata;
-          m_axi4_wstrb  = axi4_wstrb;
-          m_axi4_wuser  = axi4_wuser;
-
-          m_axi4_wvalid = axi4_wvalid;
-          axi4_wready   = m_axi4_wready;
-
-          // Simply pop from FIFO upon last data beat.
-          w_done        = axi4_wlast & axi4_wvalid & axi4_wready;
-
-        // discard entire burst
-        end else if (b_drop_o == 1'b0) begin
-          axi4_wready   = 1'b1;
-
-          // Simply pop from FIFO upon last data beat. Perform handshake with B sender.
-          if (axi4_wlast & axi4_wvalid & axi4_wready)
-            b_drop_set  = 1'b1;
-        end
-      end
-
-    end // OUTPUT_CTRL
-
-    assign master_select_o     = l1_master_cur;
-    assign l2_fifo_ready_out   = 1'b1;
-    assign block_forwarding    = 1'b0;
-
-    // unused signals
-    assign hum_buf_ready_out   = 1'b0;
-    assign hum_buf_valid_in    = 1'b0;
-    assign hum_buf_ready_in    = 1'b0;
-    assign hum_buf_valid_out   = 1'b0;
-    assign hum_buf_wdata       =  'b0;
-    assign hum_buf_wstrb       =  'b0;
-    assign hum_buf_wlast       = 1'b0;
-    assign hum_buf_wuser       =  'b0;
-    assign hum_buf_drop_len_SN =  'b0;
-    assign hum_buf_drop_req_SN = 1'b0;
-    assign hum_buf_almost_full = 1'b0;
-
-    assign l2_fifo_valid_in    = 1'b0;
-    assign l2_fifo_valid_out   = 1'b0;
-    assign l2_prefetch_cur     = 1'b0;
-    assign l2_hit_cur          = 1'b0;
-    assign l2_id_cur           =  'b0;
-    assign l2_len_cur          =  'b0;
-    assign l2_master_cur       = 1'b0;
-    assign l2_accept_cur       = 1'b0;
-    assign l2_drop_cur         = 1'b0;
-
-    assign l2_req              = 1'b0;
-
-    assign fifo_select_SN      = 1'b0;
-    assign fifo_select_SP      = 1'b0;
-
-    assign stop_store          = 1'b0;
-    assign n_wlast_SP          =  'b0;
-    assign wlast_in            = 1'b0;
-    assign wlast_out           = 1'b0;
-
-  end // HUM_BUFFER
-
-  endgenerate
-"""
diff --git a/src/iommu/axi_rab/axi4_w_sender.py b/src/iommu/axi_rab/axi4_w_sender.py
deleted file mode 100644
index 9916334f..00000000
--- a/src/iommu/axi_rab/axi4_w_sender.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_w_sender(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.s_axi4_wdata = Signal()  # input
-        self.s_axi4_wvalid = Signal()  # input
-        self.s_axi4_wready = Signal()  # output
-        self.s_axi4_wstrb = Signal()  # input
-        self.s_axi4_wlast = Signal()  # input
-        self.s_axi4_wuser = Signal()  # input
-        self.m_axi4_wdata = Signal()  # output
-        self.m_axi4_wvalid = Signal()  # output
-        self.m_axi4_wready = Signal()  # input
-        self.m_axi4_wstrb = Signal()  # output
-        self.m_axi4_wlast = Signal()  # output
-        self.m_axi4_wuser = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.m_axi4_wdata.eq(self.s_axi4_wdata)
-        m.d.comb += self.m_axi4_wstrb.eq(self.s_axi4_wstrb)
-        m.d.comb += self.m_axi4_wlast.eq(self.s_axi4_wlast)
-        m.d.comb += self.m_axi4_wuser.eq(self.s_axi4_wuser)
-        m.d.comb += self.m_axi4_wvalid.eq(self.s_axi4_wvalid)
-        m.d.comb += self.s_axi4_wready.eq(self.m_axi4_wready)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_w_sender
-#  #(
-#    parameter AXI_DATA_WIDTH = 32,
-#    parameter AXI_USER_WIDTH = 2
-#  )
-#  (
-#    input                         axi4_aclk,
-#    input                         axi4_arstn,
-#
-#    input    [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
-#    input                         s_axi4_wvalid,
-#    output                        s_axi4_wready,
-#    input  [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
-#    input                         s_axi4_wlast,
-#    input    [AXI_USER_WIDTH-1:0] s_axi4_wuser,
-#
-#    output   [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
-#    output                        m_axi4_wvalid,
-#    input                         m_axi4_wready,
-#    output [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
-#    output                        m_axi4_wlast,
-#    output   [AXI_USER_WIDTH-1:0] m_axi4_wuser
-#  );
-#
-#  assign m_axi4_wdata  = s_axi4_wdata;
-#  assign m_axi4_wstrb  = s_axi4_wstrb;
-#  assign m_axi4_wlast  = s_axi4_wlast;
-#  assign m_axi4_wuser  = s_axi4_wuser;
-#
-#  assign m_axi4_wvalid = s_axi4_wvalid;
-#  assign s_axi4_wready = m_axi4_wready;
-#
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/axi_buffer_rab.py b/src/iommu/axi_rab/axi_buffer_rab.py
deleted file mode 100644
index b4d99299..00000000
--- a/src/iommu/axi_rab/axi_buffer_rab.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_buffer_rab(Elaboratable):
-
-    def __init__(self):
-        self.clk = Signal()  # input
-        self.rstn = Signal()  # input
-        self.data_out = Signal(DATA_WIDTH)  # output
-        self.valid_out = Signal()  # output
-        self.ready_in = Signal()  # input
-        self.valid_in = Signal()  # input
-        self.data_in = Signal(DATA_WIDTH)  # input
-        self.ready_out = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.full.eq(self.None)
-        m.d.comb += self.data_out.eq(self.None)
-        m.d.comb += self.valid_out.eq(self.None)
-        m.d.comb += self.ready_out.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //import CfMath::log2;
-#
-# module axi_buffer_rab
-#  //#(
-#  //  parameter DATA_WIDTH,
-#  //  parameter BUFFER_DEPTH
-#  //)
-#  (
-#    input logic                   clk,
-#    input logic                   rstn,
-#
-#    // Downstream port
-#    output logic [DATA_WIDTH-1:0] data_out,
-#    output logic                  valid_out,
-#    input  logic                  ready_in,
-#
-#    // Upstream port
-#    input  logic                  valid_in,
-#    input  logic [DATA_WIDTH-1:0] data_in,
-#    output logic                  ready_out
-#  );
-#
-#  localparam integer LOG_BUFFER_DEPTH = log2(BUFFER_DEPTH);
-#
-#    // Internal data structures
-#    reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_in;   // location to which we last wrote
-#    reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_out;  // location from which we last sent
-#    reg     [LOG_BUFFER_DEPTH : 0] elements;     // number of elements in the buffer
-#    reg       [DATA_WIDTH - 1 : 0] buffer [BUFFER_DEPTH - 1 : 0];
-#
-#    wire full;
-#
-#    integer loop1;
-#
-#    assign full = (elements == BUFFER_DEPTH);
-#
-#    always @(posedge clk or negedge rstn)
-#      begin: elements_sequential
-#        if (rstn == 1'b0)
-#          elements <= 0;
-#        else
-#        begin
-#          // ------------------
-#          // Are we filling up?
-#          // ------------------
-#          // One out, none in
-#          if (ready_in && valid_out && (!valid_in || full))
-#            elements <= elements - 1;
-#          // None out, one in
-#          else if ((!valid_out || !ready_in) && valid_in && !full)
-#            elements <= elements + 1;
-#          // Else, either one out and one in, or none out and none in - stays unchanged
-#        end
-#      end
-#
-#    always @(posedge clk or negedge rstn)
-#      begin: buffers_sequential
-#        if (rstn == 1'b0)
-#        begin
-#          for (loop1 = 0 ; loop1 < BUFFER_DEPTH ; loop1 = loop1 + 1)
-#            buffer[loop1] <= 0;
-#        end
-#        else
-#        begin
-#          // Update the memory
-#          if (valid_in && !full)
-#            buffer[pointer_in] <= data_in;
-#        end
-#      end
-#
-#    always @(posedge clk or negedge rstn)
-#      begin: sequential
-#        if (rstn == 1'b0)
-#        begin
-#          pointer_out <= 0;
-#          pointer_in <= 0;
-#        end
-#        else
-#        begin
-#          // ------------------------------------
-#          // Check what to do with the input side
-#          // ------------------------------------
-#          // We have some input, increase by 1 the input pointer
-#          if (valid_in && !full)
-#          begin
-#            if (pointer_in == $unsigned(BUFFER_DEPTH - 1))
-#              pointer_in <= 0;
-#            else
-#              pointer_in <= pointer_in + 1;
-#          end
-#          // Else we don't have any input, the input pointer stays the same
-#
-#          // -------------------------------------
-#          // Check what to do with the output side
-#          // -------------------------------------
-#          // We had pushed one flit out, we can try to go for the next one
-#          if (ready_in && valid_out)
-#          begin
-#            if (pointer_out == $unsigned(BUFFER_DEPTH - 1))
-#              pointer_out <= 0;
-#            else
-#              pointer_out <= pointer_out + 1;
-#          end
-#          // Else stay on the same output location
-#        end
-#      end
-#
-#    // Update output ports
-#    assign data_out = buffer[pointer_out];
-#    assign valid_out = (elements != 0);
-#
-#    assign ready_out = ~full;
-#
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/axi_buffer_rab_bram.py b/src/iommu/axi_rab/axi_buffer_rab_bram.py
deleted file mode 100644
index 349b314e..00000000
--- a/src/iommu/axi_rab/axi_buffer_rab_bram.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_buffer_rab_bram(Elaboratable):
-    # Port-only stub generated by sv2nmigen: declares the FIFO's signals; elaborate() adds no logic.
-    def __init__(self):
-        self.clk = Signal()  # input: clock
-        self.rstn = Signal()  # input: reset, active low (the SV body resets on `~rstn`)
-        self.data_out = Signal(DATA_WIDTH)  # output: downstream data; NOTE(review): DATA_WIDTH is not defined in this file
-        self.valid_out = Signal()  # output: downstream valid
-        self.ready_in = Signal()  # input: downstream ready
-        self.valid_in = Signal()  # input: upstream valid
-        self.data_in = Signal(DATA_WIDTH)  # input: upstream data
-        self.ready_out = Signal()  # output: upstream ready
-        self.almost_full = Signal()  # output: status flag
-        self.underfull = Signal()  # output: status flag
-        self.drop_req = Signal()  # input: drop request
-        self.drop_len = Signal(8)  # input: number of items to drop, counted from zero
-    # NOTE(review): the SystemVerilog behaviour kept below this class has not been translated.
-    def elaborate(self, platform=None):
-        m = Module()  # empty module: no statements or submodules are added
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# ////import CfMath::log2;
-#
-# module axi_buffer_rab_bram
-#  //#(
-#  //  parameter DATA_WIDTH,
-#  //  parameter BUFFER_DEPTH
-#  //  )
-#   (
-#    input logic                   clk,
-#    input logic                   rstn,
-#
-#    // Downstream port
-#    output logic [DATA_WIDTH-1:0] data_out,
-#    output logic                  valid_out,
-#    input  logic                  ready_in,
-#
-#    // Upstream port
-#    input  logic                  valid_in,
-#    input  logic [DATA_WIDTH-1:0] data_in,
-#    output logic                  ready_out,
-#
-#    // Status and drop control
-#    output logic                  almost_full,
-#    output logic                  underfull,
-#    input  logic                  drop_req,
-#    // Number of items to drop.  As for AXI lengths, counting starts at zero, i.e., `drop_len == 0`
-#    // and `drop_req` means drop one item.
-#    input  logic [7:0]            drop_len
-#    );
-#
-"""  #docstring_begin
-  // The BRAM needs to be in "write-first" mode for first-word fall-through FIFO behavior.
-  // To still push and pop simultaneously if the buffer is full, we internally increase the
-  // buffer depth by 1.
-  localparam ACT_BUFFER_DEPTH     = BUFFER_DEPTH+1;
-  localparam ACT_LOG_BUFFER_DEPTH = log2(ACT_BUFFER_DEPTH+1);
-
-  /**
-    * Internal data structures
-    */
-  // Location to which we last wrote
-  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_in_d,                   ptr_in_q;
-  // Location from which we last sent
-  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_d,                  ptr_out_q;
-  // Required for fall-through behavior on the first word
-  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_bram;
-  // Number of elements in the buffer.  Can be negative if elements that have been dropped have not
-  // yet been written.
-  logic signed   [ACT_LOG_BUFFER_DEPTH:0] n_elems_d,                  n_elems_q;
-
-  logic           [DATA_WIDTH-1:0]        data_out_bram, data_out_q;
-  logic                                   valid_out_q;
-
-  logic full;
-
-  assign almost_full = (n_elems_q == BUFFER_DEPTH-1);
-  assign full        = (n_elems_q == BUFFER_DEPTH);
-
-  always_ff @(posedge clk, negedge rstn) begin
-    if (~rstn) begin
-      n_elems_q <= '0;
-      ptr_in_q  <= '0;
-      ptr_out_q <= '0;
-    end else begin
-      n_elems_q <= n_elems_d;
-      ptr_in_q  <= ptr_in_d;
-      ptr_out_q <= ptr_out_d;
-    end
-  end
-
-  // Update the number of elements.
-  always_comb begin
-    n_elems_d = n_elems_q;
-    if (drop_req) begin
-      n_elems_d -= (drop_len + 1);
-    end
-    if (valid_in && ready_out) begin
-      n_elems_d += 1;
-    end
-    if (valid_out && ready_in) begin
-      n_elems_d -= 1;
-    end
-  end
-
-  // Update the output pointer.
-  always_comb begin
-    ptr_out_d = ptr_out_q;
-    if (drop_req) begin
-      if ((ptr_out_q + drop_len + 1) > (ACT_BUFFER_DEPTH - 1)) begin
-        ptr_out_d = drop_len + 1 - (ACT_BUFFER_DEPTH - ptr_out_q);
-      end else begin
-        ptr_out_d += (drop_len + 1);
-      end
-    end
-    if (valid_out && ready_in) begin
-      if (ptr_out_d == (ACT_BUFFER_DEPTH - 1)) begin
-        ptr_out_d = '0;
-      end else begin
-        ptr_out_d += 1;
-      end
-    end
-  end
-
-  // The BRAM has a read latency of one cycle, so apply the new address one cycle earlier for
-  // first-word fall-through FIFO behavior.
-  //assign ptr_out_bram = (ptr_out_q == (ACT_BUFFER_DEPTH-1)) ? '0 : (ptr_out_q + 1);
-  assign ptr_out_bram = ptr_out_d;
-
-  // Update the input pointer.
-  always_comb begin
-    ptr_in_d = ptr_in_q;
-    if (valid_in && ready_out) begin
-      if (ptr_in_d == (ACT_BUFFER_DEPTH - 1)) begin
-        ptr_in_d = '0;
-      end else begin
-        ptr_in_d += 1;
-      end
-    end
-  end
-
-  // Update output ports.
-  assign valid_out = (n_elems_q > $signed(0));
-  assign underfull = (n_elems_q < $signed(0));
-  assign ready_out = ~full;
-
-  ram_tp_write_first #(
-    .ADDR_WIDTH ( ACT_LOG_BUFFER_DEPTH ),
-    .DATA_WIDTH ( DATA_WIDTH           )
-  )
-  ram_tp_write_first_0
-  (
-    .clk   ( clk              ),
-    .we    ( valid_in & ~full ),
-    .addr0 ( ptr_in_q         ),
-    .addr1 ( ptr_out_bram     ),
-    .d_i   ( data_in          ),
-    .d0_o  (                  ),
-    .d1_o  ( data_out_bram    )
-  );
-
-  // When reading from/writing to the same address on both ports ("Write-Read Collision"),
-  // the data on the read port is invalid (during the write cycle). In this implementation,
-  // this can happen only when the buffer is empty. Thus, we forward the data from a
-  // register in this case.
-  always @(posedge clk) begin
-    if (rstn == 1'b0) begin
-      data_out_q <= 'b0;
-    end else if ( (ptr_out_bram == ptr_in_q) && (valid_in && !full) ) begin
-      data_out_q <= data_in;
-    end
-  end
-
-  always @(posedge clk) begin
-    if (rstn == 1'b0) begin
-      valid_out_q <= 'b0;
-    end else begin
-      valid_out_q <= valid_out;
-    end
-  end
-
-  // Drive output data
-  always_comb begin
-    if (valid_out && !valid_out_q) begin // We have just written to an empty FIFO
-      data_out = data_out_q;
-    end else begin
-      data_out = data_out_bram;
-    end
-  end
-
-"""
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/axi_rab_cfg.py b/src/iommu/axi_rab/axi_rab_cfg.py
deleted file mode 100644
index 43843b95..00000000
--- a/src/iommu/axi_rab/axi_rab_cfg.py
+++ /dev/null
@@ -1,707 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_rab_cfg(Elaboratable):
-    # Port-only stub generated by sv2nmigen for the RAB's AXI4-Lite configuration and miss-handling interface.
-    def __init__(self):
-        self.Clk_CI = Signal()  # input: clock
-        self.Rst_RBI = Signal()  # input: reset, active low (the SV body resets on `!Rst_RBI`)
-        self.s_axi_awaddr = Signal(AXI_ADDR_WIDTH)  # input; NOTE(review): width names mirror the SV parameters, no Python definition visible
-        self.s_axi_awvalid = Signal()  # input
-        self.s_axi_awready = Signal()  # output
-        self.s_axi_wdata = Signal(AXI_DATA_WIDTH)  # input: SV packed [AXI_DATA_WIDTH/8-1:0][7:0], same total width as s_axi_rdata
-        self.s_axi_wstrb = Signal(AXI_DATA_WIDTH // 8)  # input: one strobe bit per data byte (SV [AXI_DATA_WIDTH/8-1:0])
-        self.s_axi_wvalid = Signal()  # input
-        self.s_axi_wready = Signal()  # output
-        self.s_axi_bresp = Signal(2)  # output: the SV body ties this to 2'b00
-        self.s_axi_bvalid = Signal()  # output
-        self.s_axi_bready = Signal()  # input
-        self.s_axi_araddr = Signal(AXI_ADDR_WIDTH)  # input
-        self.s_axi_arvalid = Signal()  # input
-        self.s_axi_arready = Signal()  # output
-        self.s_axi_rdata = Signal(AXI_DATA_WIDTH)  # output
-        self.s_axi_rresp = Signal(2)  # output: the SV body ties this to 2'b00
-        self.s_axi_rvalid = Signal()  # output
-        self.s_axi_rready = Signal()  # input
-        self.L1Cfg_DO = Signal()  # output; NOTE(review): SV port is [N_REGS-1:0][63:0], array width not translated
-        self.L1AllowMultiHit_SO = Signal()  # output: bit 1 of the SV configuration register
-        self.MissAddr_DI = Signal(ADDR_WIDTH_VIRT)  # input: address of a reported miss
-        self.MissMeta_DI = Signal(MISS_META_WIDTH)  # input: metadata of a reported miss
-        self.Miss_SI = Signal()  # input: registers a new miss
-        self.MhFifoFull_SO = Signal()  # output: a miss-handling FIFO was written while full
-        self.wdata_l2 = Signal()  # output; NOTE(review): SV port is [N_PORTS-1:0][AXI_DATA_WIDTH-1:0], array width not translated
-        self.waddr_l2 = Signal()  # output; NOTE(review): SV port is [N_PORTS-1:0][AXI_ADDR_WIDTH-1:0], array width not translated
-        self.wren_l2 = Signal(N_PORTS)  # output: one write-enable bit per port
-    # No logic is translated yet; the SystemVerilog body is kept below this class as reference text.
-    def elaborate(self, platform=None):
-        m = Module()  # empty module: no statements or submodules are added
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# // --=========================================================================--
-# //
-# //  ââââââ âââ  ââââââ    âââââââ  ââââââ âââââââ      âââââââââââââââ âââââââ
-# // âââââââââââââââââââ    ââââââââââââââââââââââââ    ââââââââââââââââââââââââ
-# // ââââââââ ââââââ âââ    ââââââââââââââââââââââââ    âââ     ââââââ  âââ  ââââ
-# // ââââââââ ââââââ âââ    ââââââââââââââââââââââââ    âââ     ââââââ  âââ   âââ
-# // âââ  âââââââ ââââââ    âââ  ââââââ  âââââââââââ    âââââââââââ     âââââââââ
-# // âââ  ââââââ  ââââââ    âââ  ââââââ  ââââââââââ      ââââââââââ      âââââââ
-# //
-# //
-# // Author: Pirmin Vogel - vogelpi@iis.ee.ethz.ch
-# //
-# // Purpose : AXI4-Lite configuration and miss handling interface for RAB
-# //
-# // --=========================================================================--
-#
-# //import CfMath::log2;
-#
-# module axi_rab_cfg
-#  #(
-#    parameter N_PORTS         =   3,
-#    parameter N_REGS          = 196,
-#    parameter N_L2_SETS       =  32,
-#    parameter N_L2_SET_ENTRIES=  32,
-#    parameter ADDR_WIDTH_PHYS =  40,
-#    parameter ADDR_WIDTH_VIRT =  32,
-#    parameter N_FLAGS         =   4,
-#    parameter AXI_DATA_WIDTH  =  64,
-#    parameter AXI_ADDR_WIDTH  =  32,
-#    parameter MISS_META_WIDTH =  10,  // <= FIFO_WIDTH
-#    parameter MH_FIFO_DEPTH   =  16
-#    )
-#   (
-#    input  logic                                    Clk_CI,
-#    input  logic                                    Rst_RBI,
-#
-#    // AXI Lite interface
-#    input  logic [AXI_ADDR_WIDTH-1:0]               s_axi_awaddr,
-#    input  logic                                    s_axi_awvalid,
-#    output logic                                    s_axi_awready,
-#    input  logic [AXI_DATA_WIDTH/8-1:0][7:0]        s_axi_wdata,
-#    input  logic [AXI_DATA_WIDTH/8-1:0]             s_axi_wstrb,
-#    input  logic                                    s_axi_wvalid,
-#    output logic                                    s_axi_wready,
-#    output logic [1:0]                              s_axi_bresp,
-#    output logic                                    s_axi_bvalid,
-#    input  logic                                    s_axi_bready,
-#    input  logic [AXI_ADDR_WIDTH-1:0]               s_axi_araddr,
-#    input  logic                                    s_axi_arvalid,
-#    output logic                                    s_axi_arready,
-#    output logic [AXI_DATA_WIDTH-1:0]               s_axi_rdata,
-#    output logic [1:0]                              s_axi_rresp,
-#    output logic                                    s_axi_rvalid,
-#    input  logic                                    s_axi_rready,
-#
-#    // Slice configuration
-#    output logic [N_REGS-1:0][63:0]                 L1Cfg_DO,
-#    output logic                                    L1AllowMultiHit_SO,
-#
-#    // Miss handling
-#    input  logic [ADDR_WIDTH_VIRT-1:0]              MissAddr_DI,
-#    input  logic [MISS_META_WIDTH-1:0]              MissMeta_DI,
-#    input  logic                                    Miss_SI,
-#    output logic                                    MhFifoFull_SO,
-#
-#    // L2 TLB
-#    output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] wdata_l2,
-#    output logic [N_PORTS-1:0] [AXI_ADDR_WIDTH-1:0] waddr_l2,
-#    output logic [N_PORTS-1:0]                      wren_l2
-#  );
-#
-"""  #docstring_begin
-
-  localparam ADDR_LSB = log2(64/8); // 64 even if the AXI Lite interface is 32,
-                                    // because RAB slices are 64 bit wide.
-  localparam ADDR_MSB = log2(N_REGS)+ADDR_LSB-1;
-
-  localparam L2SINGLE_AMAP_SIZE = 16'h4000; // Maximum 2048 TLB entries in L2
-
-  localparam integer N_L2_ENTRIES = N_L2_SETS * N_L2_SET_ENTRIES;
-
-  localparam logic [AXI_ADDR_WIDTH-1:0] L2_VA_MAX_ADDR = (N_L2_ENTRIES-1) << 2;
-
-  logic [AXI_DATA_WIDTH/8-1:0][7:0] L1Cfg_DP[N_REGS]; // [Byte][Bit]
-  genvar j;
-
-  //  ââââââ âââ  âââââââââ  âââ      âââ     ââââââââââââââââââââ
-  // ââââââââââââââââââââââ  âââ      âââ     ââââââââââââââââââââ
-  // ââââââââ ââââââ ââââââââââââââââââââ     âââ   âââ   ââââââ
-  // ââââââââ ââââââ ââââââââââââââââââââ     âââ   âââ   ââââââ
-  // âââ  âââââââ ââââââ     âââ      âââââââââââ   âââ   ââââââââ
-  // âââ  ââââââ  ââââââ     âââ      âââââââââââ   âââ   ââââââââ
-  //
-  logic [AXI_ADDR_WIDTH-1:0]        awaddr_reg;
-  logic                             awaddr_done_rise;
-  logic                             awaddr_done_reg;
-  logic                             awaddr_done_reg_dly;
-
-  logic [AXI_DATA_WIDTH/8-1:0][7:0] wdata_reg;
-  logic [AXI_DATA_WIDTH/8-1:0]      wstrb_reg;
-  logic                             wdata_done_rise;
-  logic                             wdata_done_reg;
-  logic                             wdata_done_reg_dly;
-
-  logic                             wresp_done_reg;
-  logic                             wresp_running_reg;
-
-  logic [AXI_ADDR_WIDTH-1:0]        araddr_reg;
-  logic                             araddr_done_reg;
-
-  logic [AXI_DATA_WIDTH-1:0]        rdata_reg;
-  logic                             rresp_done_reg;
-  logic                             rresp_running_reg;
-
-  logic                             awready;
-  logic                             wready;
-  logic                             bvalid;
-
-  logic                             arready;
-  logic                             rvalid;
-
-  logic                             wren;
-  logic                             wren_l1;
-
-  assign wren = ( wdata_done_rise & awaddr_done_reg ) | ( awaddr_done_rise & wdata_done_reg );
-  assign wdata_done_rise  = wdata_done_reg  & ~wdata_done_reg_dly;
-  assign awaddr_done_rise = awaddr_done_reg & ~awaddr_done_reg_dly;
-
-  // reg_dly
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            wdata_done_reg_dly  <= 1'b0;
-            awaddr_done_reg_dly <= 1'b0;
-         end
-       else
-         begin
-            wdata_done_reg_dly  <= wdata_done_reg;
-            awaddr_done_reg_dly <= awaddr_done_reg;
-         end
-    end
-
-  // AW Channel
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            awaddr_done_reg <= 1'b0;
-            awaddr_reg      <= '0;
-            awready         <= 1'b1;
-         end
-       else
-         begin
-            if (awready && s_axi_awvalid)
-              begin
-                 awready         <= 1'b0;
-                 awaddr_done_reg <= 1'b1;
-                 awaddr_reg      <= s_axi_awaddr;
-              end
-            else if (awaddr_done_reg && wresp_done_reg)
-              begin
-                 awready         <= 1'b1;
-                 awaddr_done_reg <= 1'b0;
-              end
-         end
-    end
-
-  // W Channel
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            wdata_done_reg <= 1'b0;
-            wready         <= 1'b1;
-            wdata_reg      <= '0;
-            wstrb_reg      <= '0;
-         end
-       else
-         begin
-            if (wready && s_axi_wvalid)
-              begin
-                 wready         <= 1'b0;
-                 wdata_done_reg <= 1'b1;
-                 wdata_reg      <= s_axi_wdata;
-                 wstrb_reg      <= s_axi_wstrb;
-              end
-            else if (wdata_done_reg && wresp_done_reg)
-              begin
-                 wready         <= 1'b1;
-                 wdata_done_reg <= 1'b0;
-              end
-         end
-    end
-
-  // B Channel
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            bvalid            <= 1'b0;
-            wresp_done_reg    <= 1'b0;
-            wresp_running_reg <= 1'b0;
-         end
-       else
-         begin
-            if (awaddr_done_reg && wdata_done_reg && !wresp_done_reg)
-              begin
-                 if (!wresp_running_reg)
-                   begin
-                      bvalid            <= 1'b1;
-                      wresp_running_reg <= 1'b1;
-                   end
-                 else if (s_axi_bready)
-                   begin
-                      bvalid            <= 1'b0;
-                      wresp_done_reg    <= 1'b1;
-                      wresp_running_reg <= 1'b0;
-                   end
-              end
-            else
-              begin
-                 bvalid            <= 1'b0;
-                 wresp_done_reg    <= 1'b0;
-                 wresp_running_reg <= 1'b0;
-              end
-         end
-    end
-
-  // AR Channel
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            araddr_done_reg <= 1'b0;
-            arready         <= 1'b1;
-            araddr_reg       <= '0;
-         end
-       else
-         begin
-            if (arready && s_axi_arvalid)
-              begin
-                 arready         <= 1'b0;
-                 araddr_done_reg <= 1'b1;
-                 araddr_reg      <= s_axi_araddr;
-              end
-            else if (araddr_done_reg && rresp_done_reg)
-              begin
-                 arready         <= 1'b1;
-                 araddr_done_reg <= 1'b0;
-              end
-         end
-    end
-
-  // R Channel
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            rresp_done_reg    <= 1'b0;
-            rvalid            <= 1'b0;
-            rresp_running_reg <= 1'b0;
-         end
-       else
-         begin
-            if (araddr_done_reg && !rresp_done_reg)
-              begin
-                 if (!rresp_running_reg)
-                   begin
-                      rvalid            <= 1'b1;
-                      rresp_running_reg <= 1'b1;
-                   end
-                 else if (s_axi_rready)
-                   begin
-                      rvalid            <= 1'b0;
-                      rresp_done_reg    <= 1'b1;
-                      rresp_running_reg <= 1'b0;
-                   end
-              end
-            else
-              begin
-                 rvalid            <= 1'b0;
-                 rresp_done_reg    <= 1'b0;
-                 rresp_running_reg <= 1'b0;
-              end
-         end
-    end
-
-  // âââ     âââ     âââââââââââââââ âââââââ     âââââââ ââââââââ âââââââ
-  // âââ    ââââ    ââââââââââââââââââââââââ     ââââââââââââââââââââââââ
-  // âââ    ââââ    âââ     ââââââ  âââ  ââââ    ââââââââââââââ  âââ  ââââ
-  // âââ     âââ    âââ     ââââââ  âââ   âââ    ââââââââââââââ  âââ   âââ
-  // âââââââââââ    âââââââââââ     âââââââââ    âââ  ââââââââââââââââââââ
-  // âââââââââââ     ââââââââââ      âââââââ     âââ  âââââââââââ âââââââ
-  //
-  assign wren_l1 = wren && (awaddr_reg < L2SINGLE_AMAP_SIZE);
-
-  always @( posedge Clk_CI or negedge Rst_RBI )
-    begin
-      var integer idx_reg, idx_byte;
-      if ( Rst_RBI == 1'b0 )
-        begin
-          for ( idx_reg = 0; idx_reg < N_REGS; idx_reg++ )
-            L1Cfg_DP[idx_reg] <= '0;
-        end
-      else if ( wren_l1 )
-          begin
-            if ( awaddr_reg[ADDR_LSB+1] == 1'b0 ) begin                     // VIRT_ADDR
-              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
-                if ( (idx_byte < ADDR_WIDTH_VIRT/8) ) begin
-                  if ( wstrb_reg[idx_byte] ) begin
-                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
-                  end
-                end
-                else begin  // Let synthesizer optimize away unused registers.
-                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
-                end
-              end
-            end
-            else if ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b10 ) begin      // PHYS_ADDR
-              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
-                if ( (idx_byte < ADDR_WIDTH_PHYS/8) ) begin
-                  if ( wstrb_reg[idx_byte] ) begin
-                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
-                  end
-                end
-                else begin  // Let synthesizer optimize away unused registers.
-                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
-                end
-              end
-            end
-            else begin // ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b11 )      // FLAGS
-              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
-                if ( (idx_byte < 1) ) begin
-                  if ( wstrb_reg[idx_byte] ) begin
-                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte] & { {{8-N_FLAGS}{1'b0}}, {{N_FLAGS}{1'b1}} };
-                  end
-                end
-                else begin  // Let synthesizer optimize away unused registers.
-                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
-                end
-              end
-            end
-          end
-    end // always @ ( posedge Clk_CI or negedge Rst_RBI )
-
-  generate
-    // Mask unused bits -> Synthesizer should optimize away unused registers
-    for( j=0; j<N_REGS; j++ ) begin
-      if ( j[1] == 1'b0 ) // VIRT_ADDR
-        assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_VIRT}{1'b0}},{ADDR_WIDTH_VIRT{1'b1}} } & L1Cfg_DP[j];
-      else if ( j[1:0] == 2'b10 ) // PHYS_ADDR
-        assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_PHYS}{1'b0}},{ADDR_WIDTH_PHYS{1'b1}} } & L1Cfg_DP[j];
-      else // if ( j[1:0] == 2'b11 ) // FLAGS
-        assign L1Cfg_DO[j] = { {{64-N_FLAGS}{1'b0}},{N_FLAGS{1'b1}} } & L1Cfg_DP[j];
-    end
-  endgenerate
-
-  always_comb
-    begin
-      if ( araddr_reg[ADDR_LSB-1] == 1'b1 ) // read upper 32 bit, for debugging over 32-bit interface
-        rdata_reg = { {32'h00000000},{L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]][63:32]} };
-      else
-        rdata_reg = L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]];
-    end
-
-  assign s_axi_awready = awready;
-  assign s_axi_wready  = wready;
-
-  assign s_axi_bresp   = 2'b00;
-  assign s_axi_bvalid  = bvalid;
-
-  assign s_axi_arready = arready;
-  assign s_axi_rresp   = 2'b00;
-  assign s_axi_rvalid  = rvalid;
-
-  // âââ     âââââââ      âââââââââââââââ âââââââ
-  // âââ     ââââââââ    ââââââââââââââââââââââââ
-  // âââ      âââââââ    âââ     ââââââ  âââ  ââââ
-  // âââ     âââââââ     âââ     ââââââ  âââ   âââ
-  // ââââââââââââââââ    âââââââââââ     âââââââââ
-  // ââââââââââââââââ     ââââââââââ      âââââââ
-  //
-  logic [N_PORTS-1:0] l2_addr_is_in_va_rams;
-  logic [N_PORTS-1:0] upper_word_is_written;
-  logic [N_PORTS-1:0] lower_word_is_written;
-  generate
-    for( j=0; j< N_PORTS; j++)
-      begin
-        if (AXI_DATA_WIDTH == 64) begin
-          assign l2_addr_is_in_va_rams[j] = (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg[log2(L2SINGLE_AMAP_SIZE)-1:0] <= L2_VA_MAX_ADDR);
-          assign upper_word_is_written[j] = (wstrb_reg[7:4] != 4'b0000);
-          assign lower_word_is_written[j] = (wstrb_reg[3:0] != 4'b0000);
-        end else begin
-          assign l2_addr_is_in_va_rams[j] = 1'b0;
-          assign upper_word_is_written[j] = 1'b0;
-          assign lower_word_is_written[j] = 1'b0;
-        end
-
-        always @( posedge Clk_CI or negedge Rst_RBI ) begin
-          var integer idx_byte, off_byte;
-          if ( Rst_RBI == 1'b0 )
-            begin
-              wren_l2[j]  <= 1'b0;
-              wdata_l2[j] <= '0;
-            end
-          else if (wren)
-            begin
-              if ( (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg < (j+2)*L2SINGLE_AMAP_SIZE) && (|wstrb_reg) )
-                wren_l2[j] <= 1'b1;
-              if      (AXI_DATA_WIDTH == 32) begin
-                for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ )
-                  wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte] & {8{wstrb_reg[idx_byte]}};
-              end
-              else if (AXI_DATA_WIDTH == 64) begin
-                if (lower_word_is_written[j] == 1'b1)
-                  off_byte = 0;
-                else
-                  off_byte = 4;
-                // always put the payload in the lower word and set upper word to 0
-                for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8/2; idx_byte++ )
-                    wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte+off_byte] & {8{wstrb_reg[idx_byte+off_byte]}};
-                wdata_l2[j][AXI_DATA_WIDTH-1:AXI_DATA_WIDTH/2] <= 'b0;
-              end
-              // pragma translate_off
-              else
-                $fatal(1, "Unsupported AXI_DATA_WIDTH!");
-              // pragma translate_on
-            end
-          else
-            wren_l2[j] <= '0;
-        end // always @ ( posedge Clk_CI or negedge Rst_RBI )
-
-        // Properly align the 32-bit word address when writing from 64-bit interface:
-        // Depending on the system, the incoming address is (non-)aligned to the 64-bit
-        // word when writing the upper 32-bit word.
-        always_comb begin
-          waddr_l2[j] = (awaddr_reg -(j+1)*L2SINGLE_AMAP_SIZE)/4;
-          if (wren_l2[j]) begin
-            if (AXI_DATA_WIDTH == 64) begin
-              if (upper_word_is_written[j] == 1'b1) begin
-                // address must be non-aligned
-                waddr_l2[j][0] = 1'b1;
-              end
-            end
-            // pragma translate_off
-            else if (AXI_DATA_WIDTH != 32) begin
-              $fatal(1, "Unsupported AXI_DATA_WIDTH!");
-            end
-            // pragma translate_on
-          end
-        end
-
-        // Assert that only one 32-bit word is ever written at a time to VA RAMs on 64-bit data
-        // systems.
-        // pragma translate_off
-        always_ff @ (posedge Clk_CI) begin
-          if (AXI_DATA_WIDTH == 64) begin
-            if  (l2_addr_is_in_va_rams[j]) begin
-              if (upper_word_is_written[j]) begin
-                assert (!lower_word_is_written[j])
-                  else $error("Unsupported write across two 32-bit words to VA RAMs!");
-              end
-              else if (lower_word_is_written[j]) begin
-                assert (!upper_word_is_written[j])
-                  else $error("Unsupported write across two 32-bit words to VA RAMs!");
-              end
-            end
-          end
-        end
-        // pragma translate_on
-
-      end // for (j=0; j< N_PORTS; j++)
-   endgenerate
-
-  // ââââ   âââââââ  âââ    âââââââââââââââââââ âââââââ ââââââââ
-  // âââââ ââââââââ  âââ    ââââââââââââââââââââââââââââââââââââ
-  // âââââââââââââââââââ    ââââââ  âââââââââ  âââ   âââââââââââ
-  // âââââââââââââââââââ    ââââââ  âââââââââ  âââ   âââââââââââ
-  // âââ âââ ââââââ  âââ    âââ     ââââââ     âââââââââââââââââ
-  // âââ     ââââââ  âââ    âââ     ââââââ      âââââââ ââââââââ
-  //
-  logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDin_D;
-  logic                       AddrFifoWen_S;
-  logic                       AddrFifoRen_S;
-  logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDout_D;
-  logic                       AddrFifoFull_S;
-  logic                       AddrFifoEmpty_S;
-  logic                       AddrFifoEmpty_SB;
-  logic                       AddrFifoFull_SB;
-
-  logic [MISS_META_WIDTH-1:0] MetaFifoDin_D;
-  logic                       MetaFifoWen_S;
-  logic                       MetaFifoRen_S;
-  logic [MISS_META_WIDTH-1:0] MetaFifoDout_D;
-  logic                       MetaFifoFull_S;
-  logic                       MetaFifoEmpty_S;
-  logic                       MetaFifoEmpty_SB;
-  logic                       MetaFifoFull_SB;
-
-  logic                       FifosDisabled_S;
-  logic                       ConfRegWen_S;
-  logic                 [1:0] ConfReg_DN;
-  logic                 [1:0] ConfReg_DP;
-
-  logic [AXI_DATA_WIDTH-1:0] wdata_reg_vec;
-
-  assign FifosDisabled_S    = ConfReg_DP[0];
-  assign L1AllowMultiHit_SO = ConfReg_DP[1];
-
-  assign AddrFifoEmpty_S = ~AddrFifoEmpty_SB;
-  assign MetaFifoEmpty_S = ~MetaFifoEmpty_SB;
-
-  assign AddrFifoFull_S = ~AddrFifoFull_SB;
-  assign MetaFifoFull_S = ~MetaFifoFull_SB;
-
-  assign MhFifoFull_SO = (AddrFifoWen_S & AddrFifoFull_S) | (MetaFifoWen_S & MetaFifoFull_S);
-
-  generate
-     for ( j=0; j<AXI_DATA_WIDTH/8; j++ )
-       assign wdata_reg_vec[(j+1)*8-1:j*8] = wdata_reg[j];
-  endgenerate
-
-  // write address FIFO
-  always_comb
-    begin
-       AddrFifoWen_S = 1'b0;
-       AddrFifoDin_D = 'b0;
-       if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
-         begin
-            AddrFifoWen_S = 1'b1;
-            AddrFifoDin_D = MissAddr_DI;
-         end
-       else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 'b0) && (FifosDisabled_S == 1'b0)) // write request from AXI interface
-         begin
-            AddrFifoWen_S = 1'b1;
-            AddrFifoDin_D = wdata_reg_vec[ADDR_WIDTH_VIRT-1:0];
-         end
-    end
-
-  // write meta FIFO
-  always_comb
-    begin
-       MetaFifoWen_S = 1'b0;
-       MetaFifoDin_D = 'b0;
-       if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
-         begin
-            MetaFifoWen_S                      = 1'b1;
-            MetaFifoDin_D[MISS_META_WIDTH-1:0] = MissMeta_DI;
-         end
-       else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 4'h8) && (FifosDisabled_S == 1'b0) ) // write request from AXI interface
-         begin
-            MetaFifoWen_S = 1'b1;
-            MetaFifoDin_D = wdata_reg_vec[MISS_META_WIDTH-1:0];
-         end
-    end
-
-  // write configuration register
-  always_comb
-    begin
-       ConfRegWen_S = 1'b0;
-       ConfReg_DN   = 1'b0;
-       if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 8'h10) ) // write request from AXI interface
-         begin
-            ConfRegWen_S = 1'b1;
-            ConfReg_DN   = wdata_reg_vec[$high(ConfReg_DN):0];
-         end
-    end
-
-  // AXI read data
-  always_comb
-    begin
-       s_axi_rdata   = rdata_reg; // read L1 config
-       AddrFifoRen_S = 1'b0;
-       MetaFifoRen_S = 1'b0;
-       if ( rvalid == 1'b1 )
-         begin
-            // read address FIFO
-            if ( araddr_reg[ADDR_MSB:0] == 'b0 )
-              begin
-                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
-                s_axi_rdata[ADDR_WIDTH_VIRT-1:0] = AddrFifoDout_D;
-                if ( AddrFifoEmpty_S == 1'b0 )
-                  AddrFifoRen_S = 1'b1;
-              end
-            // read meta FIFO
-            else if ( araddr_reg[ADDR_MSB:0] == 4'h8 )
-              begin
-                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
-                s_axi_rdata[31]                  = MetaFifoEmpty_S;
-                s_axi_rdata[MISS_META_WIDTH-1:0] = MetaFifoDout_D;
-                if ( MetaFifoEmpty_S == 1'b0 )
-                  MetaFifoRen_S = 1'b1;
-              end
-            // read configuration register
-            else if ( araddr_reg[ADDR_MSB:0] == 8'h10 )
-              begin
-                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
-                s_axi_rdata[$high(ConfReg_DP):0] = ConfReg_DP;
-              end
-         end // if ( rvalid == 1'b1 )
-    end // always_comb begin
-
-  // configuration register
-  always_ff @(posedge Clk_CI or negedge Rst_RBI) begin
-    if (Rst_RBI == 1'b0)
-      begin
-        ConfReg_DP <= 'b0;
-      end
-    else if (ConfRegWen_S == 1'b1)
-      begin
-        ConfReg_DP <= ConfReg_DN;
-      end
-  end
-
-  generic_fifo
-    #(
-      .DATA_WIDTH ( ADDR_WIDTH_VIRT ),
-      .DATA_DEPTH ( MH_FIFO_DEPTH   )
-      )
-    fifo_addr_i
-    (
-      .clk         ( Clk_CI                          ),
-      .rst_n       ( Rst_RBI                         ),
-      .data_i      ( AddrFifoDin_D                   ),
-      .valid_i     ( AddrFifoWen_S & AddrFifoFull_SB ),
-      .grant_o     ( AddrFifoFull_SB                 ),
-      .data_o      ( AddrFifoDout_D                  ),
-      .valid_o     ( AddrFifoEmpty_SB                ),
-      .grant_i     ( AddrFifoRen_S                   ),
-      .test_mode_i ( 1'b0                            )
-    );
-
-  generic_fifo
-    #(
-      .DATA_WIDTH ( MISS_META_WIDTH ),
-      .DATA_DEPTH ( MH_FIFO_DEPTH   )
-      )
-    fifo_meta_i
-    (
-      .clk         ( Clk_CI                          ),
-      .rst_n       ( Rst_RBI                         ),
-      .data_i      ( MetaFifoDin_D                   ),
-      .valid_i     ( MetaFifoWen_S & MetaFifoFull_SB ),
-      .grant_o     ( MetaFifoFull_SB                 ),
-      .data_o      ( MetaFifoDout_D                  ),
-      .valid_o     ( MetaFifoEmpty_SB                ),
-      .grant_i     ( MetaFifoRen_S                   ),
-      .test_mode_i ( 1'b0                            )
-    );
-"""
-#
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/axi_rab_top.py b/src/iommu/axi_rab/axi_rab_top.py
deleted file mode 100644
index ea1a802d..00000000
--- a/src/iommu/axi_rab/axi_rab_top.py
+++ /dev/null
@@ -1,2642 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_rab_top(Elaboratable):
-
-    def __init__(self):
-        self.Clk_CI = Signal()  # input
-        self.NonGatedClk_CI = Signal()  # input
-        self.Rst_RBI = Signal()  # input
-        self.s_axi4_awid = Signal()  # input
-        self.s_axi4_awaddr = Signal()  # input
-        self.s_axi4_awvalid = Signal(N_PORTS)  # input
-        self.s_axi4_awready = Signal(N_PORTS)  # output
-        self.s_axi4_awlen = Signal()  # input
-        self.s_axi4_awsize = Signal()  # input
-        self.s_axi4_awburst = Signal()  # input
-        self.s_axi4_awlock = Signal(N_PORTS)  # input
-        self.s_axi4_awprot = Signal()  # input
-        self.s_axi4_awcache = Signal()  # input
-        self.s_axi4_awregion = Signal()  # input
-        self.s_axi4_awqos = Signal()  # input
-        self.s_axi4_awuser = Signal()  # input
-        self.s_axi4_wdata = Signal()  # input
-        self.s_axi4_wvalid = Signal(N_PORTS)  # input
-        self.s_axi4_wready = Signal(N_PORTS)  # output
-        self.s_axi4_wstrb = Signal()  # input
-        self.s_axi4_wlast = Signal(N_PORTS)  # input
-        self.s_axi4_wuser = Signal()  # input
-        self.s_axi4_bid = Signal()  # output
-        self.s_axi4_bresp = Signal()  # output
-        self.s_axi4_bvalid = Signal(N_PORTS)  # output
-        self.s_axi4_buser = Signal()  # output
-        self.s_axi4_bready = Signal(N_PORTS)  # input
-        self.s_axi4_arid = Signal()  # input
-        self.s_axi4_araddr = Signal()  # input
-        self.s_axi4_arvalid = Signal(N_PORTS)  # input
-        self.s_axi4_arready = Signal(N_PORTS)  # output
-        self.s_axi4_arlen = Signal()  # input
-        self.s_axi4_arsize = Signal()  # input
-        self.s_axi4_arburst = Signal()  # input
-        self.s_axi4_arlock = Signal(N_PORTS)  # input
-        self.s_axi4_arprot = Signal()  # input
-        self.s_axi4_arcache = Signal()  # input
-        self.s_axi4_aruser = Signal()  # input
-        self.s_axi4_rid = Signal()  # output
-        self.s_axi4_rdata = Signal()  # output
-        self.s_axi4_rresp = Signal()  # output
-        self.s_axi4_rvalid = Signal(N_PORTS)  # output
-        self.s_axi4_rready = Signal(N_PORTS)  # input
-        self.s_axi4_rlast = Signal(N_PORTS)  # output
-        self.s_axi4_ruser = Signal()  # output
-        self.m0_axi4_awid = Signal()  # output
-        self.m0_axi4_awaddr = Signal()  # output
-        self.m0_axi4_awvalid = Signal(N_PORTS)  # output
-        self.m0_axi4_awready = Signal(N_PORTS)  # input
-        self.m0_axi4_awlen = Signal()  # output
-        self.m0_axi4_awsize = Signal()  # output
-        self.m0_axi4_awburst = Signal()  # output
-        self.m0_axi4_awlock = Signal(N_PORTS)  # output
-        self.m0_axi4_awprot = Signal()  # output
-        self.m0_axi4_awcache = Signal()  # output
-        self.m0_axi4_awregion = Signal()  # output
-        self.m0_axi4_awqos = Signal()  # output
-        self.m0_axi4_awuser = Signal()  # output
-        self.m0_axi4_wdata = Signal()  # output
-        self.m0_axi4_wvalid = Signal(N_PORTS)  # output
-        self.m0_axi4_wready = Signal(N_PORTS)  # input
-        self.m0_axi4_wstrb = Signal()  # output
-        self.m0_axi4_wlast = Signal(N_PORTS)  # output
-        self.m0_axi4_wuser = Signal()  # output
-        self.m0_axi4_bid = Signal()  # input
-        self.m0_axi4_bresp = Signal()  # input
-        self.m0_axi4_bvalid = Signal(N_PORTS)  # input
-        self.m0_axi4_buser = Signal()  # input
-        self.m0_axi4_bready = Signal(N_PORTS)  # output
-        self.m0_axi4_arid = Signal()  # output
-        self.m0_axi4_araddr = Signal()  # output
-        self.m0_axi4_arvalid = Signal(N_PORTS)  # output
-        self.m0_axi4_arready = Signal(N_PORTS)  # input
-        self.m0_axi4_arlen = Signal()  # output
-        self.m0_axi4_arsize = Signal()  # output
-        self.m0_axi4_arburst = Signal()  # output
-        self.m0_axi4_arlock = Signal(N_PORTS)  # output
-        self.m0_axi4_arprot = Signal()  # output
-        self.m0_axi4_arcache = Signal()  # output
-        self.m0_axi4_aruser = Signal()  # output
-        self.m0_axi4_rid = Signal()  # input
-        self.m0_axi4_rdata = Signal()  # input
-        self.m0_axi4_rresp = Signal()  # input
-        self.m0_axi4_rvalid = Signal(N_PORTS)  # input
-        self.m0_axi4_rready = Signal(N_PORTS)  # output
-        self.m0_axi4_rlast = Signal(N_PORTS)  # input
-        self.m0_axi4_ruser = Signal()  # input
-        self.m1_axi4_awid = Signal()  # output
-        self.m1_axi4_awaddr = Signal()  # output
-        self.m1_axi4_awvalid = Signal(N_PORTS)  # output
-        self.m1_axi4_awready = Signal(N_PORTS)  # input
-        self.m1_axi4_awlen = Signal()  # output
-        self.m1_axi4_awsize = Signal()  # output
-        self.m1_axi4_awburst = Signal()  # output
-        self.m1_axi4_awlock = Signal(N_PORTS)  # output
-        self.m1_axi4_awprot = Signal()  # output
-        self.m1_axi4_awcache = Signal()  # output
-        self.m1_axi4_awregion = Signal()  # output
-        self.m1_axi4_awqos = Signal()  # output
-        self.m1_axi4_awuser = Signal()  # output
-        self.m1_axi4_wdata = Signal()  # output
-        self.m1_axi4_wvalid = Signal(N_PORTS)  # output
-        self.m1_axi4_wready = Signal(N_PORTS)  # input
-        self.m1_axi4_wstrb = Signal()  # output
-        self.m1_axi4_wlast = Signal(N_PORTS)  # output
-        self.m1_axi4_wuser = Signal()  # output
-        self.m1_axi4_bid = Signal()  # input
-        self.m1_axi4_bresp = Signal()  # input
-        self.m1_axi4_bvalid = Signal(N_PORTS)  # input
-        self.m1_axi4_buser = Signal()  # input
-        self.m1_axi4_bready = Signal(N_PORTS)  # output
-        self.m1_axi4_arid = Signal()  # output
-        self.m1_axi4_araddr = Signal()  # output
-        self.m1_axi4_arvalid = Signal(N_PORTS)  # output
-        self.m1_axi4_arready = Signal(N_PORTS)  # input
-        self.m1_axi4_arlen = Signal()  # output
-        self.m1_axi4_arsize = Signal()  # output
-        self.m1_axi4_arburst = Signal()  # output
-        self.m1_axi4_arlock = Signal(N_PORTS)  # output
-        self.m1_axi4_arprot = Signal()  # output
-        self.m1_axi4_arcache = Signal()  # output
-        self.m1_axi4_aruser = Signal()  # output
-        self.m1_axi4_rid = Signal()  # input
-        self.m1_axi4_rdata = Signal()  # input
-        self.m1_axi4_rresp = Signal()  # input
-        self.m1_axi4_rvalid = Signal(N_PORTS)  # input
-        self.m1_axi4_rready = Signal(N_PORTS)  # output
-        self.m1_axi4_rlast = Signal(N_PORTS)  # input
-        self.m1_axi4_ruser = Signal()  # input
-        self.s_axi4lite_awaddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
-        self.s_axi4lite_awvalid = Signal()  # input
-        self.s_axi4lite_awready = Signal()  # output
-        self.s_axi4lite_wdata = Signal(AXI_LITE_DATA_WIDTH)  # input
-        self.s_axi4lite_wvalid = Signal()  # input
-        self.s_axi4lite_wready = Signal()  # output
-        self.s_axi4lite_wstrb = Signal(1+ERROR p_expression_25)  # input
-        self.s_axi4lite_bresp = Signal(2)  # output
-        self.s_axi4lite_bvalid = Signal()  # output
-        self.s_axi4lite_bready = Signal()  # input
-        self.s_axi4lite_araddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
-        self.s_axi4lite_arvalid = Signal()  # input
-        self.s_axi4lite_arready = Signal()  # output
-        self.s_axi4lite_rdata = Signal(AXI_LITE_DATA_WIDTH)  # output
-        self.s_axi4lite_rresp = Signal(2)  # output
-        self.s_axi4lite_rvalid = Signal()  # output
-        self.s_axi4lite_rready = Signal()  # input
-        self.int_miss = Signal(N_PORTS)  # output
-        self.int_multi = Signal(N_PORTS)  # output
-        self.int_prot = Signal(N_PORTS)  # output
-        self.int_mhf_full = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# // --=========================================================================--
-# //
-# //  ââââââ âââ  ââââââ    âââââââ  ââââââ âââââââ     âââââââââ âââââââ âââââââ
-# // âââââââââââââââââââ    ââââââââââââââââââââââââ    ââââââââââââââââââââââââââ
-# // ââââââââ ââââââ âââ    ââââââââââââââââââââââââ       âââ   âââ   âââââââââââ
-# // ââââââââ ââââââ âââ    ââââââââââââââââââââââââ       âââ   âââ   ââââââââââ
-# // âââ  âââââââ ââââââ    âââ  ââââââ  âââââââââââ       âââ   ââââââââââââ
-# // âââ  ââââââ  ââââââ    âââ  ââââââ  ââââââââââ        âââ    âââââââ âââ
-# //
-# // --=========================================================================--
-# /*
-# * axi_rab_top
-# *
-# * The remapping address block (RAB) performs address translation for AXI
-# * transactions arriving at the input port and forwards them to different
-# * downstream AXI ports.
-# *
-# * The five axi channels are each buffered on the input side using a FIFO,
-# * described in axi4_XX_buffer. The RAB lookup result is merged into the
-# * AXI transaction via the axi4_XX_sender instances, which manages upstream
-# * error signaling for failed lookups.
-# *
-# * Address translation is performed based on data stored in up to two
-# * translation lookaside buffers (TLBs), which are private per RAB port (each
-# * of which having two AXI master ports and one AXI slave port). These TLBs
-# * are managed in software through the AXI-Lite interface.
-# *
-# * If ACP is enabled, the `cache_coherent` flag in the TLBs is used to
-# * multiplex between the two ports. If ACP is disabled, only the first master
-# * port is used. In this case, the `cache_coherent` flag is used to set the
-# * AxCACHE signals of the AXI bus accordingly.
-# *
-# * Authors:
-# * Antonio Pullini <pullinia@iis.ee.ethz.ch>
-# * Conrad Burchert <bconrad@ethz.ch>
-# * Maheshwara Sharma <msharma@student.ethz.ch>
-# * Andreas Kurth <akurth@iis.ee.ethz.ch>
-# * Johannes Weinbuch <jweinbuch@student.ethz.ch>
-# * Pirmin Vogel <vogelpi@iis.ee.ethz.ch>
-# */
-#
-# //`include "pulp_soc_defines.sv"
-#
-# ////import CfMath::log2;
-#
-# module axi_rab_top
-#
-#  // Parameters {{{
-#  #(
-#    parameter N_PORTS             =  2,
-#    parameter N_L2_SETS           = 32,
-#    parameter N_L2_SET_ENTRIES    = 32,
-#    parameter AXI_DATA_WIDTH      = 64,
-#    parameter AXI_S_ADDR_WIDTH    = 32,
-#    parameter AXI_M_ADDR_WIDTH    = 40,
-#    parameter AXI_LITE_DATA_WIDTH = 64,
-#    parameter AXI_LITE_ADDR_WIDTH = 32,
-#    parameter AXI_ID_WIDTH        = 10,
-#    parameter AXI_USER_WIDTH      =  6,
-#    parameter MH_FIFO_DEPTH       = 16
-#  )
-#  // }}}
-#
-#  // Ports {{{
-#  (
-#
-#    input logic                                            Clk_CI,  // This clock may be gated.
-#    input logic                                            NonGatedClk_CI,
-#    input logic                                            Rst_RBI,
-#
-#    // For every slave port there are two master ports. The master
-#    // port to use can be set using the master_select flag of the protection
-#    // bits of a slice
-#
-#    // AXI4 Slave {{{
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_awid,
-#    input  logic    [N_PORTS-1:0]   [AXI_S_ADDR_WIDTH-1:0] s_axi4_awaddr,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_awvalid,
-#    output logic    [N_PORTS-1:0]                          s_axi4_awready,
-#    input  logic    [N_PORTS-1:0]                    [7:0] s_axi4_awlen,
-#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_awsize,
-#    input  logic    [N_PORTS-1:0]                    [1:0] s_axi4_awburst,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_awlock,
-#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_awprot,
-#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awcache,
-#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awregion,
-#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awqos,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_awuser,
-#
-#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_wvalid,
-#    output logic    [N_PORTS-1:0]                          s_axi4_wready,
-#    input  logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_wlast,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_wuser,
-#
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_bid,
-#    output logic    [N_PORTS-1:0]                    [1:0] s_axi4_bresp,
-#    output logic    [N_PORTS-1:0]                          s_axi4_bvalid,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_buser,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_bready,
-#
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_arid,
-#    input  logic    [N_PORTS-1:0]   [AXI_S_ADDR_WIDTH-1:0] s_axi4_araddr,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_arvalid,
-#    output logic    [N_PORTS-1:0]                          s_axi4_arready,
-#    input  logic    [N_PORTS-1:0]                    [7:0] s_axi4_arlen,
-#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_arsize,
-#    input  logic    [N_PORTS-1:0]                    [1:0] s_axi4_arburst,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_arlock,
-#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_arprot,
-#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_arcache,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_aruser,
-#
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_rid,
-#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
-#    output logic    [N_PORTS-1:0]                    [1:0] s_axi4_rresp,
-#    output logic    [N_PORTS-1:0]                          s_axi4_rvalid,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_rready,
-#    output logic    [N_PORTS-1:0]                          s_axi4_rlast,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_ruser,
-#    // }}}
-#
-#    // AXI4 Master 0 {{{
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_awid,
-#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m0_axi4_awaddr,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_awvalid,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_awready,
-#    output logic    [N_PORTS-1:0]                    [7:0] m0_axi4_awlen,
-#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_awsize,
-#    output logic    [N_PORTS-1:0]                    [1:0] m0_axi4_awburst,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_awlock,
-#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_awprot,
-#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awcache,
-#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awregion,
-#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awqos,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_awuser,
-#
-#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m0_axi4_wdata,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_wvalid,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_wready,
-#    output logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] m0_axi4_wstrb,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_wlast,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_wuser,
-#
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_bid,
-#    input  logic    [N_PORTS-1:0]                    [1:0] m0_axi4_bresp,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_bvalid,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_buser,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_bready,
-#
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_arid,
-#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m0_axi4_araddr,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_arvalid,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_arready,
-#    output logic    [N_PORTS-1:0]                    [7:0] m0_axi4_arlen,
-#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_arsize,
-#    output logic    [N_PORTS-1:0]                    [1:0] m0_axi4_arburst,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_arlock,
-#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_arprot,
-#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_arcache,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_aruser,
-#
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_rid,
-#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m0_axi4_rdata,
-#    input  logic    [N_PORTS-1:0]                    [1:0] m0_axi4_rresp,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_rvalid,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_rready,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_rlast,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_ruser,
-#    // }}}
-#
-#    // AXI4 Master 1 {{{
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_awid,
-#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m1_axi4_awaddr,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_awvalid,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_awready,
-#    output logic    [N_PORTS-1:0]                    [7:0] m1_axi4_awlen,
-#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_awsize,
-#    output logic    [N_PORTS-1:0]                    [1:0] m1_axi4_awburst,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_awlock,
-#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_awprot,
-#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awcache,
-#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awregion,
-#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awqos,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_awuser,
-#
-#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m1_axi4_wdata,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_wvalid,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_wready,
-#    output logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] m1_axi4_wstrb,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_wlast,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_wuser,
-#
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_bid,
-#    input  logic    [N_PORTS-1:0]                    [1:0] m1_axi4_bresp,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_bvalid,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_buser,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_bready,
-#
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_arid,
-#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m1_axi4_araddr,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_arvalid,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_arready,
-#    output logic    [N_PORTS-1:0]                    [7:0] m1_axi4_arlen,
-#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_arsize,
-#    output logic    [N_PORTS-1:0]                    [1:0] m1_axi4_arburst,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_arlock,
-#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_arprot,
-#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_arcache,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_aruser,
-#
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_rid,
-#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m1_axi4_rdata,
-#    input  logic    [N_PORTS-1:0]                    [1:0] m1_axi4_rresp,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_rvalid,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_rready,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_rlast,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_ruser,
-#    // }}}
-#
-#    // AXI 4 Lite Slave (Configuration Interface) {{{
-#    // AXI4-Lite port to setup the rab slices
-#    // use this to program the configuration registers
-#    input  logic                 [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_awaddr,
-#    input  logic                                           s_axi4lite_awvalid,
-#    output logic                                           s_axi4lite_awready,
-#
-#    input  logic                 [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_wdata,
-#    input  logic                                           s_axi4lite_wvalid,
-#    output logic                                           s_axi4lite_wready,
-#    input  logic               [AXI_LITE_DATA_WIDTH/8-1:0] s_axi4lite_wstrb,
-#
-#    output logic                                     [1:0] s_axi4lite_bresp,
-#    output logic                                           s_axi4lite_bvalid,
-#    input  logic                                           s_axi4lite_bready,
-#
-#    input  logic                 [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_araddr,
-#    input  logic                                           s_axi4lite_arvalid,
-#    output logic                                           s_axi4lite_arready,
-#
-#    output logic                 [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_rdata,
-#    output logic                                     [1:0] s_axi4lite_rresp,
-#    output logic                                           s_axi4lite_rvalid,
-#    input  logic                                           s_axi4lite_rready,
-#    // }}}
-#
-#    // BRAMs {{{
-# //`ifdef RAB_AX_LOG_EN
-# //    BramPort.Slave                                         ArBram_PS,
-# //    BramPort.Slave                                         AwBram_PS,
-# //`endif
-#    // }}}
-#
-#    // Logger Control {{{
-# //`ifdef RAB_AX_LOG_EN
-# //   input  logic                                           LogEn_SI,
-# //   input  logic                                           ArLogClr_SI,
-# //   input  logic                                           AwLogClr_SI,
-#  //  output logic                                           ArLogRdy_SO,
-#  //  output logic                                           AwLogRdy_SO,
-# //`endif
-#    // }}}
-#
-#    // Interrupt Outputs {{{
-#    // Interrupt lines to handle misses, collisions of slices/multiple hits,
-#    // protection faults and overflow of the miss handling fifo
-# //`ifdef RAB_AX_LOG_EN
-# //   output logic                                           int_ar_log_full,
-# //   output logic                                           int_aw_log_full,
-# //`endif
-#    output logic                             [N_PORTS-1:0] int_miss,
-#    output logic                             [N_PORTS-1:0] int_multi,
-#    output logic                             [N_PORTS-1:0] int_prot,
-#    output logic                                           int_mhf_full
-#    // }}}
-#
-#  );
-#
-"""#docstring_begin
-
-  // }}}
-
-  // Signals {{{
-  // âââââââââââ âââââââ ââââ   âââ ââââââ âââ     ââââââââ
-  // âââââââââââââââââââ âââââ  ââââââââââââââ     ââââââââ
-  // ââââââââââââââ  ââââââââââ ââââââââââââââ     ââââââââ
-  // ââââââââââââââ   ââââââââââââââââââââââââ     ââââââââ
-  // âââââââââââââââââââââââ âââââââââ  âââââââââââââââââââ
-  // âââââââââââ âââââââ âââ  ââââââââ  âââââââââââââââââââ
-  //
-
-  // Internal AXI4 lines, these connect buffers on the slave side to the rab core and
-  // multiplexers which switch between the two master outputs
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_awid;
-  logic [N_PORTS-1:0]  [AXI_S_ADDR_WIDTH-1:0] int_awaddr;
-  logic [N_PORTS-1:0]                         int_awvalid;
-  logic [N_PORTS-1:0]                         int_awready;
-  logic [N_PORTS-1:0]                   [7:0] int_awlen;
-  logic [N_PORTS-1:0]                   [2:0] int_awsize;
-  logic [N_PORTS-1:0]                   [1:0] int_awburst;
-  logic [N_PORTS-1:0]                         int_awlock;
-  logic [N_PORTS-1:0]                   [2:0] int_awprot;
-  logic [N_PORTS-1:0]                   [3:0] int_awcache;
-  logic [N_PORTS-1:0]                   [3:0] int_awregion;
-  logic [N_PORTS-1:0]                   [3:0] int_awqos;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_awuser;
-
-  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_wdata;
-  logic [N_PORTS-1:0]                         int_wvalid;
-  logic [N_PORTS-1:0]                         int_wready;
-  logic [N_PORTS-1:0]  [AXI_DATA_WIDTH/8-1:0] int_wstrb;
-  logic [N_PORTS-1:0]                         int_wlast;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_wuser;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_bid;
-  logic [N_PORTS-1:0]                   [1:0] int_bresp;
-  logic [N_PORTS-1:0]                         int_bvalid;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_buser;
-  logic [N_PORTS-1:0]                         int_bready;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_arid;
-  logic [N_PORTS-1:0]  [AXI_S_ADDR_WIDTH-1:0] int_araddr;
-  logic [N_PORTS-1:0]                         int_arvalid;
-  logic [N_PORTS-1:0]                         int_arready;
-  logic [N_PORTS-1:0]                   [7:0] int_arlen;
-  logic [N_PORTS-1:0]                   [2:0] int_arsize;
-  logic [N_PORTS-1:0]                   [1:0] int_arburst;
-  logic [N_PORTS-1:0]                         int_arlock;
-  logic [N_PORTS-1:0]                   [2:0] int_arprot;
-  logic [N_PORTS-1:0]                   [3:0] int_arcache;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_aruser;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_rid;
-  logic [N_PORTS-1:0]                   [1:0] int_rresp;
-  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_rdata;
-  logic [N_PORTS-1:0]                         int_rlast;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_ruser;
-  logic [N_PORTS-1:0]                         int_rvalid;
-  logic [N_PORTS-1:0]                         int_rready;
-
-  // rab_core outputs
-  logic [N_PORTS-1:0]  [AXI_M_ADDR_WIDTH-1:0] int_wtrans_addr;
-  logic [N_PORTS-1:0]                         int_wtrans_accept;
-  logic [N_PORTS-1:0]                         int_wtrans_drop;
-  logic [N_PORTS-1:0]                         int_wtrans_miss;
-  logic [N_PORTS-1:0]                         int_wtrans_sent;
-  logic [N_PORTS-1:0]                         int_wtrans_cache_coherent;
-  logic [N_PORTS-1:0]                         int_wmaster_select;
-
-  logic [N_PORTS-1:0]  [AXI_M_ADDR_WIDTH-1:0] int_rtrans_addr;
-  logic [N_PORTS-1:0]                         int_rtrans_accept;
-  logic [N_PORTS-1:0]                         int_rtrans_drop;
-  logic [N_PORTS-1:0]                         int_rtrans_miss;
-  logic [N_PORTS-1:0]                         int_rtrans_sent;
-  logic [N_PORTS-1:0]                         int_rtrans_cache_coherent;
-  logic [N_PORTS-1:0]                         int_rmaster_select;
-
-  logic [N_PORTS-1:0]                         w_master_select;
-
-  // Internal master0 AXI4 lines. These connect the first master port to the
-  // multiplexers
-  // For channels read address, write address and write data the other lines
-  // are ignored if valid is not set, therefore we only need to multiplex those
-  logic [N_PORTS-1:0]                         int_m0_awvalid;
-  logic [N_PORTS-1:0]                         int_m0_awready;
-
-  logic [N_PORTS-1:0]                         int_m0_wvalid;
-  logic [N_PORTS-1:0]                         int_m0_wready;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m0_bid;
-  logic [N_PORTS-1:0]                   [1:0] int_m0_bresp;
-  logic [N_PORTS-1:0]                         int_m0_bvalid;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m0_buser;
-  logic [N_PORTS-1:0]                         int_m0_bready;
-
-  logic [N_PORTS-1:0]                         int_m0_arvalid;
-  logic [N_PORTS-1:0]                         int_m0_arready;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m0_rid;
-  logic [N_PORTS-1:0]                   [1:0] int_m0_rresp;
-  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_m0_rdata;
-  logic [N_PORTS-1:0]                         int_m0_rlast;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m0_ruser;
-  logic [N_PORTS-1:0]                         int_m0_rready;
-  logic [N_PORTS-1:0]                         int_m0_rvalid;
-
-  logic [N_PORTS-1:0]                         l1_m0_ar_accept;
-  logic [N_PORTS-1:0]                         l1_m0_ar_drop;
-  logic [N_PORTS-1:0]                         l1_m0_ar_save;
-  logic [N_PORTS-1:0]                         l1_m0_ar_done;
-  logic [N_PORTS-1:0]                         l2_m0_ar_accept;
-  logic [N_PORTS-1:0]                         l2_m0_ar_drop;
-  logic [N_PORTS-1:0]                         l2_m0_ar_done;
-  logic [N_PORTS-1:0]                         l2_m0_ar_sending;
-
-  logic [N_PORTS-1:0]                         l1_m0_aw_accept;
-  logic [N_PORTS-1:0]                         l1_m0_aw_drop;
-  logic [N_PORTS-1:0]                         l1_m0_aw_save;
-  logic [N_PORTS-1:0]                         l1_m0_aw_done;
-  logic [N_PORTS-1:0]                         l2_m0_aw_accept;
-  logic [N_PORTS-1:0]                         l2_m0_aw_drop;
-  logic [N_PORTS-1:0]                         l2_m0_aw_done;
-  logic [N_PORTS-1:0]                         l2_m0_aw_sending;
-
-  // Internal master1 AXI4 lines. These connect the second master port to the
-  // multiplexers
-  // For channels read address, write address and write data the other lines
-  // are ignored if valid is not set, therefore we only need to multiplex those
-  logic [N_PORTS-1:0]                         int_m1_awvalid;
-  logic [N_PORTS-1:0]                         int_m1_awready;
-
-  logic [N_PORTS-1:0]                         int_m1_wvalid;
-  logic [N_PORTS-1:0]                         int_m1_wready;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m1_bid;
-  logic [N_PORTS-1:0]                   [1:0] int_m1_bresp;
-  logic [N_PORTS-1:0]                         int_m1_bvalid;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m1_buser;
-  logic [N_PORTS-1:0]                         int_m1_bready;
-
-  logic [N_PORTS-1:0]                         int_m1_arvalid;
-  logic [N_PORTS-1:0]                         int_m1_arready;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m1_rid;
-  logic [N_PORTS-1:0]                   [1:0] int_m1_rresp;
-  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_m1_rdata;
-  logic [N_PORTS-1:0]                         int_m1_rlast;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m1_ruser;
-  logic [N_PORTS-1:0]                         int_m1_rvalid;
-  logic [N_PORTS-1:0]                         int_m1_rready;
-
-  logic [N_PORTS-1:0]                         l1_m1_ar_accept;
-  logic [N_PORTS-1:0]                         l1_m1_ar_drop;
-  logic [N_PORTS-1:0]                         l1_m1_ar_save;
-  logic [N_PORTS-1:0]                         l1_m1_ar_done;
-  logic [N_PORTS-1:0]                         l2_m1_ar_accept;
-  logic [N_PORTS-1:0]                         l2_m1_ar_drop;
-  logic [N_PORTS-1:0]                         l2_m1_ar_done;
-
-  logic [N_PORTS-1:0]                         l1_m1_aw_accept;
-  logic [N_PORTS-1:0]                         l1_m1_aw_drop;
-  logic [N_PORTS-1:0]                         l1_m1_aw_save;
-  logic [N_PORTS-1:0]                         l1_m1_aw_done;
-  logic [N_PORTS-1:0]                         l2_m1_aw_accept;
-  logic [N_PORTS-1:0]                         l2_m1_aw_drop;
-  logic [N_PORTS-1:0]                         l2_m1_aw_done;
-
-  // L1 outputs
-  logic [N_PORTS-1:0]                         rab_miss; // L1 RAB miss
-  logic [N_PORTS-1:0]                         rab_prot;
-  logic [N_PORTS-1:0]                         rab_multi;
-  logic [N_PORTS-1:0]                         rab_prefetch;
-
-  //
-  // Signals used to support L2 TLB
-  //
-  // L2 RAM configuration signals
-  logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] L2CfgWData_D;
-  logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] L2CfgWAddr_D;
-  logic [N_PORTS-1:0]                           L2CfgWE_S;
-
-  // L1 output and drop Buffer
-  logic [N_PORTS-1:0]                           L1OutRwType_D, L1DropRwType_DP;
-  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L1OutUser_D, L1DropUser_DP;
-  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L1OutId_D, L1DropId_DP;
-  logic [N_PORTS-1:0]                     [7:0] L1OutLen_D, L1DropLen_DP;
-  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L1OutAddr_D, L1DropAddr_DP;
-  logic [N_PORTS-1:0]                           L1OutProt_D, L1DropProt_DP;
-  logic [N_PORTS-1:0]                           L1OutMulti_D, L1DropMulti_DP;
-  logic [N_PORTS-1:0]                           L1DropEn_S;
-  logic [N_PORTS-1:0]                           L1DropPrefetch_S;
-
-  logic [N_PORTS-1:0]                           L1DropValid_SN, L1DropValid_SP;
-
-  // L2 input Buffer
-  logic [N_PORTS-1:0]                           L2InRwType_DP;
-  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L2InUser_DP;
-  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L2InId_DP;
-  logic [N_PORTS-1:0]                     [7:0] L2InLen_DP;
-  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L2InAddr_DP;
-  logic [N_PORTS-1:0]                           L2InEn_S;
-
-  // L2 output Buffer
-  logic [N_PORTS-1:0]                           L2OutRwType_DP;
-  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L2OutUser_DP;
-  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L2OutId_DP;
-  logic [N_PORTS-1:0]                     [7:0] L2OutLen_DP;
-  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L2OutInAddr_DP;
-
-  logic [N_PORTS-1:0]                           L2OutHit_SN, L2OutHit_SP;
-  logic [N_PORTS-1:0]                           L2OutMiss_SN, L2OutMiss_SP;
-  logic [N_PORTS-1:0]                           L2OutProt_SN, L2OutProt_SP;
-  logic [N_PORTS-1:0]                           L2OutMulti_SN, L2OutMulti_SP;
-  logic [N_PORTS-1:0]                           L2OutCC_SN, L2OutCC_SP;
-  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] L2OutAddr_DN, L2OutAddr_DP;
-
-  logic [N_PORTS-1:0]                           L2OutValid_SN, L2OutValid_SP;
-  logic [N_PORTS-1:0]                           L2OutPrefetch_S;
-  logic [N_PORTS-1:0]                           L2OutReady_S;
-  logic [N_PORTS-1:0]                           L2OutEn_S;
-
-   // L2 outputs
-  logic [N_PORTS-1:0]                           L2Busy_S;
-  logic [N_PORTS-1:0]                           L2OutValid_S;
-
-  logic [N_PORTS-1:0]                           L2Miss_S;
-
-  // Signals for interfacing the AXI modules
-  logic [N_PORTS-1:0]                           l1_ar_accept;
-  logic [N_PORTS-1:0]                           l1_aw_accept;
-  logic [N_PORTS-1:0]                           l1_w_accept;
-  logic [N_PORTS-1:0]                           l1_xw_accept;
-
-  logic [N_PORTS-1:0]                           l1_ar_drop;
-  logic [N_PORTS-1:0]                           l1_aw_drop;
-  logic [N_PORTS-1:0]                           l1_w_drop;
-  logic [N_PORTS-1:0]                           l1_xw_drop;
-
-  logic [N_PORTS-1:0]                           l1_ar_save;
-  logic [N_PORTS-1:0]                           l1_aw_save;
-  logic [N_PORTS-1:0]                           l1_w_save;
-  logic [N_PORTS-1:0]                           l1_xw_save;
-
-  logic [N_PORTS-1:0]                           l1_ar_done;
-  logic [N_PORTS-1:0]                           l1_r_done;
-  logic [N_PORTS-1:0]                           l1_r_drop;
-  logic [N_PORTS-1:0]                           lx_r_drop;
-  logic [N_PORTS-1:0]                           lx_r_done;
-
-  logic [N_PORTS-1:0]                           l1_aw_done;
-  logic [N_PORTS-1:0]                           l1_w_done;
-  logic [N_PORTS-1:0]                           l1_xw_done;
-  logic [N_PORTS-1:0]                           l1_aw_done_SP;
-  logic [N_PORTS-1:0]                           l1_w_done_SP;
-
-  logic [N_PORTS-1:0]                           l2_ar_accept;
-  logic [N_PORTS-1:0]                           l2_aw_accept;
-  logic [N_PORTS-1:0]                           l2_w_accept;
-  logic [N_PORTS-1:0]                           l2_xw_accept;
-
-  logic [N_PORTS-1:0]                           l2_ar_drop;
-  logic [N_PORTS-1:0]                           l2_r_drop;
-  logic [N_PORTS-1:0]                           l2_xr_drop;
-  logic [N_PORTS-1:0]                           l2_aw_drop;
-  logic [N_PORTS-1:0]                           l2_w_drop;
-  logic [N_PORTS-1:0]                           l2_xw_drop;
-
-  logic [N_PORTS-1:0]                           l2_aw_done;
-  logic [N_PORTS-1:0]                           l2_w_done;
-  logic [N_PORTS-1:0]                           l2_xw_done;
-  logic [N_PORTS-1:0]                           l2_aw_done_SP;
-  logic [N_PORTS-1:0]                           l2_w_done_SP;
-
-  logic [N_PORTS-1:0]                           l2_ar_done;
-  logic [N_PORTS-1:0]                           l2_r_done;
-  logic [N_PORTS-1:0]                           l2_xr_done;
-  logic [N_PORTS-1:0]                           l2_ar_done_SP;
-  logic [N_PORTS-1:0]                           l2_r_done_SP;
-
-  logic [N_PORTS-1:0]                           l1_mx_aw_done;
-  logic [N_PORTS-1:0]                           l1_mx_ar_done;
-  logic [N_PORTS-1:0]                           l1_m0_aw_done_SP;
-  logic [N_PORTS-1:0]                           l1_m0_ar_done_SP;
-  logic [N_PORTS-1:0]                           l1_m1_aw_done_SP;
-  logic [N_PORTS-1:0]                           l1_m1_ar_done_SP;
-
-  logic [N_PORTS-1:0]                           l2_mx_aw_done;
-  logic [N_PORTS-1:0]                           l2_mx_ar_done;
-  logic [N_PORTS-1:0]                           l2_m0_aw_done_SP;
-  logic [N_PORTS-1:0]                           l2_m0_ar_done_SP;
-  logic [N_PORTS-1:0]                           l2_m1_aw_done_SP;
-  logic [N_PORTS-1:0]                           l2_m1_ar_done_SP;
-
-  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] l1_id_drop, lx_id_drop, b_id_drop;
-  logic [N_PORTS-1:0]                     [7:0] l1_len_drop, lx_len_drop;
-  logic [N_PORTS-1:0]                           l1_prefetch_drop, lx_prefetch_drop, b_prefetch_drop;
-  logic [N_PORTS-1:0]                           l1_hit_drop, lx_hit_drop, b_hit_drop;
-
-  logic [N_PORTS-1:0]                           b_drop;
-  logic [N_PORTS-1:0]                           b_done;
-
-  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] l2_aw_addr;
-  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] l2_ar_addr;
-
-  logic [N_PORTS-1:0]                           l2_cache_coherent;
-  logic [N_PORTS-1:0]                           l2_master_select;
-
-  logic [N_PORTS-1:0]                           aw_in_stall;
-  logic [N_PORTS-1:0]                           aw_out_stall;
-
-  genvar                                        i;
-
-  // RRESP FSM
-  typedef enum logic                    {IDLE, BUSY} r_resp_mux_ctrl_state_t;
-  r_resp_mux_ctrl_state_t [N_PORTS-1:0] RRespMuxCtrl_SN, RRespMuxCtrl_SP;
-  logic                   [N_PORTS-1:0] RRespSel_SN, RRespSel_SP;
-  logic                   [N_PORTS-1:0] RRespBurst_S;
-  logic                   [N_PORTS-1:0] RRespSelIm_S;
-
-  // }}}
-
-  // Local parameters {{{
-
-  // Enable L2 for select ports
-  localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
-
-  // L2TLB parameters
-  localparam integer HUM_BUFFER_DEPTH = (N_L2_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS)+13;
-
-  // }}}
-
-  // Derive `master_select` from cache coherency flag. {{{
-  `ifdef EN_ACP
-    assign int_wmaster_select = int_wtrans_cache_coherent;
-    assign int_rmaster_select = int_rtrans_cache_coherent;
-    assign l2_master_select   = l2_cache_coherent;
-  `else
-    assign int_wmaster_select = '0;
-    assign int_rmaster_select = '0;
-    assign l2_master_select   = '0;
-  `endif
-  // }}}
-
-  // Buf and Send {{{
-  // âââââââ âââ   âââââââââââ       âââ       ââââââââââââââââââââ   ââââââââââ
-  // âââââââââââ   âââââââââââ       âââ       âââââââââââââââââââââ  âââââââââââ
-  // âââââââââââ   âââââââââ      âââââââââ    ââââââââââââââ  ââââââ ââââââ  âââ
-  // âââââââââââ   âââââââââ      âââââââââ    ââââââââââââââ  âââââââââââââ  âââ
-  // ââââââââââââââââââââ         âââââââ      âââââââââââââââââââ ââââââââââââââ
-  // âââââââ  âââââââ âââ         âââââââ      âââââââââââââââââââ  ââââââââââââ
-  //
-  logic[N_PORTS-1:0] m0_write_is_burst, m0_read_is_burst;
-  logic[N_PORTS-1:0] m1_write_is_burst, m1_read_is_burst;
-
-  generate for (i = 0; i < N_PORTS; i++) begin : BUF_AND_SEND
-
-  // Write Address channel (aw) {{{
-  /*
-   * write address channel (aw)
-   *
-   * âââ    ââââââââââ ââââââââââââââââââââ     ââââââ âââââââ âââââââ âââââââ
-   * âââ    âââââââââââââââââââââââââââââââ    ââââââââââââââââââââââââââââââââ
-   * âââ ââ ââââââââââââââ   âââ   ââââââ      âââââââââââ  ââââââ  âââââââââââ
-   * âââââââââââââââââââââ   âââ   ââââââ      âââââââââââ  ââââââ  âââââââââââ
-   * âââââââââââââ  ââââââ   âââ   ââââââââ    âââ  ââââââââââââââââââââââ  âââ
-   *  ââââââââ âââ  ââââââ   âââ   ââââââââ    âââ  ââââââââââ âââââââ âââ  âââ
-   *
-   */
-
-  axi4_aw_buffer
-    #(
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_aw_buffer
-    (
-      .axi4_aclk       ( Clk_CI             ),
-      .axi4_arstn      ( Rst_RBI            ),
-      .s_axi4_awid     ( s_axi4_awid[i]     ),
-      .s_axi4_awaddr   ( s_axi4_awaddr[i]   ),
-      .s_axi4_awvalid  ( s_axi4_awvalid[i]  ),
-      .s_axi4_awready  ( s_axi4_awready[i]  ),
-      .s_axi4_awlen    ( s_axi4_awlen[i]    ),
-      .s_axi4_awsize   ( s_axi4_awsize[i]   ),
-      .s_axi4_awburst  ( s_axi4_awburst[i]  ),
-      .s_axi4_awlock   ( s_axi4_awlock[i]   ),
-      .s_axi4_awprot   ( s_axi4_awprot[i]   ),
-      .s_axi4_awcache  ( s_axi4_awcache[i]  ),
-      .s_axi4_awregion ( s_axi4_awregion[i] ),
-      .s_axi4_awqos    ( s_axi4_awqos[i]    ),
-      .s_axi4_awuser   ( s_axi4_awuser[i]   ),
-      .m_axi4_awid     ( int_awid[i]        ),
-      .m_axi4_awaddr   ( int_awaddr[i]      ),
-      .m_axi4_awvalid  ( int_awvalid[i]     ),
-      .m_axi4_awready  ( int_awready[i]     ),
-      .m_axi4_awlen    ( int_awlen[i]       ),
-      .m_axi4_awsize   ( int_awsize[i]      ),
-      .m_axi4_awburst  ( int_awburst[i]     ),
-      .m_axi4_awlock   ( int_awlock[i]      ),
-      .m_axi4_awprot   ( int_awprot[i]      ),
-      .m_axi4_awcache  ( int_awcache[i]     ),
-      .m_axi4_awregion ( int_awregion[i]    ),
-      .m_axi4_awqos    ( int_awqos[i]       ),
-      .m_axi4_awuser   ( int_awuser[i]      )
-    );
-
-  axi4_aw_sender
-    #(
-      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
-      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
-      )
-    u_aw_sender_m0
-    (
-      .axi4_aclk       ( Clk_CI                ),
-      .axi4_arstn      ( Rst_RBI               ),
-      .l1_done_o       ( l1_m0_aw_done[i]      ),
-      .l1_accept_i     ( l1_m0_aw_accept[i]    ),
-      .l1_drop_i       ( l1_m0_aw_drop[i]      ),
-      .l1_save_i       ( l1_m0_aw_save[i]      ),
-      .l2_done_o       ( l2_m0_aw_done[i]      ),
-      .l2_accept_i     ( l2_m0_aw_accept[i]    ),
-      .l2_drop_i       ( l2_m0_aw_drop[i]      ),
-      .l2_sending_o    ( l2_m0_aw_sending[i]   ),
-      .l1_awaddr_i     ( int_wtrans_addr[i]    ),
-      .l2_awaddr_i     ( l2_aw_addr[i]         ),
-      .s_axi4_awid     ( int_awid[i]           ),
-      .s_axi4_awvalid  ( int_m0_awvalid[i]     ),
-      .s_axi4_awready  ( int_m0_awready[i]     ),
-      .s_axi4_awlen    ( int_awlen[i]          ),
-      .s_axi4_awsize   ( int_awsize[i]         ),
-      .s_axi4_awburst  ( int_awburst[i]        ),
-      .s_axi4_awlock   ( int_awlock[i]         ),
-      .s_axi4_awprot   ( int_awprot[i]         ),
-      .s_axi4_awcache  ( int_awcache[i]        ),
-      .s_axi4_awregion ( int_awregion[i]       ),
-      .s_axi4_awqos    ( int_awqos[i]          ),
-      .s_axi4_awuser   ( int_awuser[i]         ),
-      .m_axi4_awid     ( m0_axi4_awid[i]       ),
-      .m_axi4_awaddr   ( m0_axi4_awaddr[i]     ),
-      .m_axi4_awvalid  ( m0_axi4_awvalid[i]    ),
-      .m_axi4_awready  ( m0_axi4_awready[i]    ),
-      .m_axi4_awlen    ( m0_axi4_awlen[i]      ),
-      .m_axi4_awsize   ( m0_axi4_awsize[i]     ),
-      .m_axi4_awburst  ( m0_axi4_awburst[i]    ),
-      .m_axi4_awlock   ( m0_axi4_awlock[i]     ),
-      .m_axi4_awprot   ( m0_axi4_awprot[i]     ),
-      .m_axi4_awcache  (                       ),
-      .m_axi4_awregion ( m0_axi4_awregion[i]   ),
-      .m_axi4_awqos    ( m0_axi4_awqos[i]      ),
-      .m_axi4_awuser   ( m0_axi4_awuser[i]     )
-    );
-
-  // The AXCACHE signals are set according to burstiness and cache coherence or statically
-  // when not connected to ACP on Zynq (implemented below).
-    assign m0_write_is_burst[i] = (m0_axi4_awlen[i] != {8{1'b0}}) && (m0_axi4_awburst[i] != 2'b00);
-  `ifndef EN_ACP
-    always_comb begin
-      if ( (l2_m0_aw_sending[i] & l2_cache_coherent[i]) | int_wtrans_cache_coherent[i]) begin
-        if (m0_write_is_burst[i]) begin
-          m0_axi4_awcache[i]  = 4'b0111;
-        end else begin
-          m0_axi4_awcache[i]  = 4'b1111;
-        end
-      end else begin
-        m0_axi4_awcache[i]    = 4'b0011;
-      end
-    end
-  `else
-    assign m0_axi4_awcache[i] = 4'b0011;
-  `endif
-
-  axi4_aw_sender
-    #(
-      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
-      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
-      )
-    u_aw_sender_m1
-    (
-      .axi4_aclk       ( Clk_CI                ),
-      .axi4_arstn      ( Rst_RBI               ),
-      .l1_accept_i     ( l1_m1_aw_accept[i]    ),
-      .l1_drop_i       ( l1_m1_aw_drop[i]      ),
-      .l1_save_i       ( l1_m1_aw_save[i]      ),
-      .l1_done_o       ( l1_m1_aw_done[i]      ),
-      .l2_accept_i     ( l2_m1_aw_accept[i]    ),
-      .l2_drop_i       ( l2_m1_aw_drop[i]      ),
-      .l2_done_o       ( l2_m1_aw_done[i]      ),
-      .l2_sending_o    (                       ), // just helps to set axcache
-      .l1_awaddr_i     ( int_wtrans_addr[i]    ),
-      .l2_awaddr_i     ( l2_aw_addr[i]         ),
-      .s_axi4_awid     ( int_awid[i]           ),
-      .s_axi4_awvalid  ( int_m1_awvalid[i]     ),
-      .s_axi4_awready  ( int_m1_awready[i]     ),
-      .s_axi4_awlen    ( int_awlen[i]          ),
-      .s_axi4_awsize   ( int_awsize[i]         ),
-      .s_axi4_awburst  ( int_awburst[i]        ),
-      .s_axi4_awlock   ( int_awlock[i]         ),
-      .s_axi4_awprot   ( int_awprot[i]         ),
-      .s_axi4_awcache  ( int_awcache[i]        ),
-      .s_axi4_awregion ( int_awregion[i]       ),
-      .s_axi4_awqos    ( int_awqos[i]          ),
-      .s_axi4_awuser   ( int_awuser[i]         ),
-      .m_axi4_awid     ( m1_axi4_awid[i]       ),
-      .m_axi4_awaddr   ( m1_axi4_awaddr[i]     ),
-      .m_axi4_awvalid  ( m1_axi4_awvalid[i]    ),
-      .m_axi4_awready  ( m1_axi4_awready[i]    ),
-      .m_axi4_awlen    ( m1_axi4_awlen[i]      ),
-      .m_axi4_awsize   ( m1_axi4_awsize[i]     ),
-      .m_axi4_awburst  ( m1_axi4_awburst[i]    ),
-      .m_axi4_awlock   ( m1_axi4_awlock[i]     ),
-      .m_axi4_awprot   ( m1_axi4_awprot[i]     ),
-      .m_axi4_awcache  (                       ),
-      .m_axi4_awregion ( m1_axi4_awregion[i]   ),
-      .m_axi4_awqos    ( m1_axi4_awqos[i]      ),
-      .m_axi4_awuser   ( m1_axi4_awuser[i]     )
-    );
-
-    // The AXCACHE signals are set according to burstiness and cache coherence or statically
-    // when not connected to ACP on Zynq (implemented below).
-      assign m1_write_is_burst[i] = (m1_axi4_awlen[i] != {8{1'b0}}) && (m1_axi4_awburst[i] != 2'b00);
-    `ifdef EN_ACP
-      always_comb begin
-        if (m1_write_is_burst[i]) begin
-          m1_axi4_awcache[i]    = 4'b1011;
-        end else begin
-          m1_axi4_awcache[i]    = 4'b1111;
-        end
-      end
-    `else
-      assign m1_axi4_awcache[i] = 4'b0011;
-    `endif
-
-  // }}}
-
-  // Write Data channel (w) {{{
-  /*
-   * write data channel (w)
-   *
-   * âââ    ââââââââââ ââââââââââââââââââââ    âââââââ  ââââââ âââââââââ ââââââ
-   * âââ    âââââââââââââââââââââââââââââââ    âââââââââââââââââââââââââââââââââ
-   * âââ ââ ââââââââââââââ   âââ   ââââââ      âââ  âââââââââââ   âââ   ââââââââ
-   * âââââââââââââââââââââ   âââ   ââââââ      âââ  âââââââââââ   âââ   ââââââââ
-   * âââââââââââââ  ââââââ   âââ   ââââââââ    âââââââââââ  âââ   âââ   âââ  âââ
-   *  ââââââââ âââ  ââââââ   âââ   ââââââââ    âââââââ âââ  âââ   âââ   âââ  âââ
-   *
-   */
-  axi4_w_buffer
-    #(
-      .AXI_DATA_WIDTH   ( AXI_DATA_WIDTH   ),
-      .AXI_ID_WIDTH     ( AXI_ID_WIDTH     ),
-      .AXI_USER_WIDTH   ( AXI_USER_WIDTH   ),
-      .ENABLE_L2TLB     ( ENABLE_L2TLB[i]  ),
-      .HUM_BUFFER_DEPTH ( HUM_BUFFER_DEPTH )
-      )
-    u_w_buffer
-    (
-      .axi4_aclk       ( Clk_CI                ),
-      .axi4_arstn      ( Rst_RBI               ),
-
-      // L1 interface
-      .l1_done_o       ( l1_w_done[i]          ),
-      .l1_accept_i     ( l1_w_accept[i]        ),
-      .l1_save_i       ( l1_w_save[i]          ),
-      .l1_drop_i       ( l1_w_drop[i]          ),
-      .l1_master_i     ( int_wmaster_select[i] ),
-      .l1_id_i         ( l1_id_drop[i]         ),
-      .l1_len_i        ( l1_len_drop[i]        ),
-      .l1_prefetch_i   ( l1_prefetch_drop[i]   ),
-      .l1_hit_i        ( l1_hit_drop[i]        ),
-
-      // L2 interface
-      .l2_done_o       ( l2_w_done[i]          ),
-      .l2_accept_i     ( l2_w_accept[i]        ),
-      .l2_drop_i       ( l2_w_drop[i]          ),
-      .l2_master_i     ( l2_master_select[i]   ),
-      .l2_id_i         ( lx_id_drop[i]         ),
-      .l2_len_i        ( lx_len_drop[i]        ),
-      .l2_prefetch_i   ( lx_prefetch_drop[i]   ),
-      .l2_hit_i        ( lx_hit_drop[i]        ),
-
-      // Top-level control outputs
-      .master_select_o ( w_master_select[i]    ),
-      .input_stall_o   ( aw_in_stall[i]        ), // stall L1 AW input if request buffers full
-      .output_stall_o  ( aw_out_stall[i]       ), // stall L1 AW hit forwarding if bypass not possible
-
-      // B sender interface
-      .b_drop_o        ( b_drop[i]             ),
-      .b_done_i        ( b_done[i]             ),
-      .id_o            ( b_id_drop[i]          ),
-      .prefetch_o      ( b_prefetch_drop[i]    ),
-      .hit_o           ( b_hit_drop[i]         ),
-
-      // AXI W channel interfaces
-      .s_axi4_wdata    ( s_axi4_wdata[i]       ),
-      .s_axi4_wvalid   ( s_axi4_wvalid[i]      ),
-      .s_axi4_wready   ( s_axi4_wready[i]      ),
-      .s_axi4_wstrb    ( s_axi4_wstrb[i]       ),
-      .s_axi4_wlast    ( s_axi4_wlast[i]       ),
-      .s_axi4_wuser    ( s_axi4_wuser[i]       ),
-      .m_axi4_wdata    ( int_wdata[i]          ),
-      .m_axi4_wvalid   ( int_wvalid[i]         ),
-      .m_axi4_wready   ( int_wready[i]         ),
-      .m_axi4_wstrb    ( int_wstrb[i]          ),
-      .m_axi4_wlast    ( int_wlast[i]          ),
-      .m_axi4_wuser    ( int_wuser[i]          )
-    );
-
-  axi4_w_sender
-    #(
-      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_w_sender_m0
-    (
-      .axi4_aclk       ( Clk_CI            ),
-      .axi4_arstn      ( Rst_RBI           ),
-      .s_axi4_wdata    ( int_wdata[i]      ),
-      .s_axi4_wvalid   ( int_m0_wvalid[i]  ),
-      .s_axi4_wready   ( int_m0_wready[i]  ),
-      .s_axi4_wstrb    ( int_wstrb[i]      ),
-      .s_axi4_wlast    ( int_wlast[i]      ),
-      .s_axi4_wuser    ( int_wuser[i]      ),
-      .m_axi4_wdata    ( m0_axi4_wdata[i]  ),
-      .m_axi4_wvalid   ( m0_axi4_wvalid[i] ),
-      .m_axi4_wready   ( m0_axi4_wready[i] ),
-      .m_axi4_wstrb    ( m0_axi4_wstrb[i]  ),
-      .m_axi4_wlast    ( m0_axi4_wlast[i]  ),
-      .m_axi4_wuser    ( m0_axi4_wuser[i]  )
-    );
-
-  axi4_w_sender
-    #(
-      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-
-      )
-    u_w_sender_m1
-    (
-      .axi4_aclk       ( Clk_CI            ),
-      .axi4_arstn      ( Rst_RBI           ),
-      .s_axi4_wdata    ( int_wdata[i]      ),
-      .s_axi4_wvalid   ( int_m1_wvalid[i]  ),
-      .s_axi4_wready   ( int_m1_wready[i]  ),
-      .s_axi4_wstrb    ( int_wstrb[i]      ),
-      .s_axi4_wlast    ( int_wlast[i]      ),
-      .s_axi4_wuser    ( int_wuser[i]      ),
-      .m_axi4_wdata    ( m1_axi4_wdata[i]  ),
-      .m_axi4_wvalid   ( m1_axi4_wvalid[i] ),
-      .m_axi4_wready   ( m1_axi4_wready[i] ),
-      .m_axi4_wstrb    ( m1_axi4_wstrb[i]  ),
-      .m_axi4_wlast    ( m1_axi4_wlast[i]  ),
-      .m_axi4_wuser    ( m1_axi4_wuser[i]  )
-    );
-
-  /*
-   * Multiplexer to switch between the two output master ports on the write data (w) channel
-   */
-  always_comb begin
-    /* Only one output can be selected at any time */
-    if (w_master_select[i] == 1'b0) begin
-      int_m0_wvalid[i] = int_wvalid[i];
-      int_m1_wvalid[i] = 1'b0;
-      int_wready[i]    = int_m0_wready[i];
-    end else begin
-      int_m0_wvalid[i] = 1'b0;
-      int_m1_wvalid[i] = int_wvalid[i];
-      int_wready[i]    = int_m1_wready[i];
-    end
-  end
-
-  // }}}
-
-  // Write Response channel (b) {{{
-  /*
-   * write response channel (b)
-   *
-   * âââ    ââââââââââ ââââââââââââââââââââ    âââââââ âââââââââââââââââââââââ
-   * âââ    âââââââââââââââââââââââââââââââ    ââââââââââââââââââââââââââââââââ
-   * âââ ââ ââââââââââââââ   âââ   ââââââ      ââââââââââââââ  ââââââââââââââââ
-   * âââââââââââââââââââââ   âââ   ââââââ      ââââââââââââââ  âââââââââââââââ
-   * âââââââââââââ  ââââââ   âââ   ââââââââ    âââ  ââââââââââââââââââââââ
-   *  ââââââââ âââ  ââââââ   âââ   ââââââââ    âââ  ââââââââââââââââââââââ
-   *
-   */
-  axi4_b_buffer
-    #(
-        .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-        .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_b_buffer_m0
-    (
-      .axi4_aclk     ( Clk_CI            ),
-      .axi4_arstn    ( Rst_RBI           ),
-      .s_axi4_bid    ( int_m0_bid[i]     ),
-      .s_axi4_bresp  ( int_m0_bresp[i]   ),
-      .s_axi4_bvalid ( int_m0_bvalid[i]  ),
-      .s_axi4_buser  ( int_m0_buser[i]   ),
-      .s_axi4_bready ( int_m0_bready[i]  ),
-      .m_axi4_bid    ( m0_axi4_bid[i]    ),
-      .m_axi4_bresp  ( m0_axi4_bresp[i]  ),
-      .m_axi4_bvalid ( m0_axi4_bvalid[i] ),
-      .m_axi4_buser  ( m0_axi4_buser[i]  ),
-      .m_axi4_bready ( m0_axi4_bready[i] )
-    );
-
-  axi4_b_buffer
-    #(
-        .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-        .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_b_buffer_m1
-    (
-      .axi4_aclk      ( Clk_CI            ),
-      .axi4_arstn     ( Rst_RBI           ),
-      .s_axi4_bid     ( int_m1_bid[i]     ),
-      .s_axi4_bresp   ( int_m1_bresp[i]   ),
-      .s_axi4_bvalid  ( int_m1_bvalid[i]  ),
-      .s_axi4_buser   ( int_m1_buser[i]   ),
-      .s_axi4_bready  ( int_m1_bready[i]  ),
-      .m_axi4_bid     ( m1_axi4_bid[i]    ),
-      .m_axi4_bresp   ( m1_axi4_bresp[i]  ),
-      .m_axi4_bvalid  ( m1_axi4_bvalid[i] ),
-      .m_axi4_buser   ( m1_axi4_buser[i]  ),
-      .m_axi4_bready  ( m1_axi4_bready[i] )
-    );
-
-  axi4_b_sender
-    #(
-        .AXI_ID_WIDTH   ( AXI_ID_WIDTH    ),
-        .AXI_USER_WIDTH ( AXI_USER_WIDTH  )
-      )
-    u_b_sender
-    (
-      .axi4_aclk      ( Clk_CI             ),
-      .axi4_arstn     ( Rst_RBI            ),
-      .drop_i         ( b_drop[i]          ),
-      .done_o         ( b_done[i]          ),
-      .id_i           ( b_id_drop[i]       ),
-      .prefetch_i     ( b_prefetch_drop[i] ),
-      .hit_i          ( b_hit_drop[i]      ),
-      .s_axi4_bid     ( s_axi4_bid[i]      ),
-      .s_axi4_bresp   ( s_axi4_bresp[i]    ),
-      .s_axi4_bvalid  ( s_axi4_bvalid[i]   ),
-      .s_axi4_buser   ( s_axi4_buser[i]    ),
-      .s_axi4_bready  ( s_axi4_bready[i]   ),
-      .m_axi4_bid     ( int_bid[i]         ),
-      .m_axi4_bresp   ( int_bresp[i]       ),
-      .m_axi4_bvalid  ( int_bvalid[i]      ),
-      .m_axi4_buser   ( int_buser[i]       ),
-      .m_axi4_bready  ( int_bready[i]      )
-    );
-
-  /*
-   * Multiplexer to switch between the two output master ports on the write response (b) channel
-   */
-  always_comb begin
-     /* Output 1 always gets priority, so if it has something to send connect
-      it and let output 0 wait using bready = 0 */
-    if (int_m1_bvalid[i] == 1'b1) begin
-      int_m0_bready[i] = 1'b0;              // hold back whatever master 0 has buffered
-      int_m1_bready[i] = int_bready[i];     // b sender handshakes with master 1
-
-      int_bid[i]       = int_m1_bid[i];
-      int_bresp[i]     = int_m1_bresp[i];
-      int_buser[i]     = int_m1_buser[i];
-      int_bvalid[i]    = int_m1_bvalid[i];
-    end else begin
-      int_m0_bready[i] = int_bready[i];     // b sender handshakes with master 0
-      int_m1_bready[i] = 1'b0;
-
-      int_bid[i]       = int_m0_bid[i];
-      int_bresp[i]     = int_m0_bresp[i];
-      int_buser[i]     = int_m0_buser[i];
-      int_bvalid[i]    = int_m0_bvalid[i];  // may be 0: then neither master has a response
-    end
-  end
-
-  // }}}
-
-  // Read Address channel (ar) {{{
-  /*
-   * read address channel (ar)
-   *
-   * âââââââ ââââââââ ââââââ âââââââ      ââââââ âââââââ âââââââ âââââââ
-   * ââââââââââââââââââââââââââââââââ    ââââââââââââââââââââââââââââââââ
-   * ââââââââââââââ  âââââââââââ  âââ    âââââââââââ  ââââââ  âââââââââââ
-   * ââââââââââââââ  âââââââââââ  âââ    âââââââââââ  ââââââ  âââââââââââ
-   * âââ  ââââââââââââââ  âââââââââââ    âââ  ââââââââââââââââââââââ  âââ
-   * âââ  ââââââââââââââ  ââââââââââ     âââ  ââââââââââ âââââââ âââ  âââ
-   *
-   */
-  axi4_ar_buffer
-    #(
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_ar_buffer
-    (
-      .axi4_aclk      ( Clk_CI            ),
-      .axi4_arstn     ( Rst_RBI           ),
-      .s_axi4_arid    ( s_axi4_arid[i]    ),
-      .s_axi4_araddr  ( s_axi4_araddr[i]  ),
-      .s_axi4_arvalid ( s_axi4_arvalid[i] ),
-      .s_axi4_arready ( s_axi4_arready[i] ),
-      .s_axi4_arlen   ( s_axi4_arlen[i]   ),
-      .s_axi4_arsize  ( s_axi4_arsize[i]  ),
-      .s_axi4_arburst ( s_axi4_arburst[i] ),
-      .s_axi4_arlock  ( s_axi4_arlock[i]  ),
-      .s_axi4_arprot  ( s_axi4_arprot[i]  ),
-      .s_axi4_arcache ( s_axi4_arcache[i] ),
-      .s_axi4_aruser  ( s_axi4_aruser[i]  ),
-      .m_axi4_arid    ( int_arid[i]       ),
-      .m_axi4_araddr  ( int_araddr[i]     ),
-      .m_axi4_arvalid ( int_arvalid[i]    ),
-      .m_axi4_arready ( int_arready[i]    ),
-      .m_axi4_arlen   ( int_arlen[i]      ),
-      .m_axi4_arsize  ( int_arsize[i]     ),
-      .m_axi4_arburst ( int_arburst[i]    ),
-      .m_axi4_arlock  ( int_arlock[i]     ),
-      .m_axi4_arprot  ( int_arprot[i]     ),
-      .m_axi4_arcache ( int_arcache[i]    ),
-      .m_axi4_aruser  ( int_aruser[i]     )
-    );
-
-  axi4_ar_sender
-    #(
-      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
-      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
-      )
-    u_ar_sender_m0
-    (
-      .axi4_aclk       ( Clk_CI                ),
-      .axi4_arstn      ( Rst_RBI               ),
-      .l1_done_o       ( l1_m0_ar_done[i]      ),
-      .l1_accept_i     ( l1_m0_ar_accept[i]    ),
-      .l1_drop_i       ( l1_m0_ar_drop[i]      ),
-      .l1_save_i       ( l1_m0_ar_save[i]      ),
-      .l2_done_o       ( l2_m0_ar_done[i]      ),
-      .l2_accept_i     ( l2_m0_ar_accept[i]    ),
-      .l2_drop_i       ( l2_m0_ar_drop[i]      ),
-      .l2_sending_o    ( l2_m0_ar_sending[i]   ),
-      .l1_araddr_i     ( int_rtrans_addr[i]    ),
-      .l2_araddr_i     ( l2_ar_addr[i]         ),
-      .s_axi4_arid     ( int_arid[i]           ),
-      .s_axi4_arvalid  ( int_m0_arvalid[i]     ),
-      .s_axi4_arready  ( int_m0_arready[i]     ),
-      .s_axi4_arlen    ( int_arlen[i]          ),
-      .s_axi4_arsize   ( int_arsize[i]         ),
-      .s_axi4_arburst  ( int_arburst[i]        ),
-      .s_axi4_arlock   ( int_arlock[i]         ),
-      .s_axi4_arprot   ( int_arprot[i]         ),
-      .s_axi4_arcache  ( int_arcache[i]        ),
-      .s_axi4_aruser   ( int_aruser[i]         ),
-      .m_axi4_arid     ( m0_axi4_arid[i]       ),
-      .m_axi4_araddr   ( m0_axi4_araddr[i]     ),
-      .m_axi4_arvalid  ( m0_axi4_arvalid[i]    ),
-      .m_axi4_arready  ( m0_axi4_arready[i]    ),
-      .m_axi4_arlen    ( m0_axi4_arlen[i]      ),
-      .m_axi4_arsize   ( m0_axi4_arsize[i]     ),
-      .m_axi4_arburst  ( m0_axi4_arburst[i]    ),
-      .m_axi4_arlock   ( m0_axi4_arlock[i]     ),
-      .m_axi4_arprot   ( m0_axi4_arprot[i]     ),
-      .m_axi4_arcache  (                       ),
-      .m_axi4_aruser   ( m0_axi4_aruser[i]     )
-    );
-
-    // ARCACHE of master 0 is generated here. Without EN_ACP it depends on cache coherence and on
-    // whether the read is a burst (arlen != 0 and arburst != 2'b00); with EN_ACP it is static.
-      assign m0_read_is_burst[i] = (m0_axi4_arlen[i] != {8{1'b0}}) && (m0_axi4_arburst[i] != 2'b00);
-    `ifndef EN_ACP
-      always_comb begin
-        if ( (l2_m0_ar_sending[i] & l2_cache_coherent[i]) | int_rtrans_cache_coherent[i]) begin
-          if (m0_read_is_burst[i]) begin
-            m0_axi4_arcache[i]  = 4'b1011; // coherent burst
-          end else begin
-            m0_axi4_arcache[i]  = 4'b1111; // coherent non-burst
-          end
-        end else begin
-          m0_axi4_arcache[i]    = 4'b0011; // NOTE(review): non-cacheable bufferable per AXI4 - confirm
-        end
-      end
-    `else
-      assign m0_axi4_arcache[i] = 4'b0011; // static value when EN_ACP is defined
-    `endif
-
-  axi4_ar_sender
-    #(
-      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
-      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
-      )
-    u_ar_sender_m1
-    (
-      .axi4_aclk       ( Clk_CI                ),
-      .axi4_arstn      ( Rst_RBI               ),
-      .l1_done_o       ( l1_m1_ar_done[i]      ),
-      .l1_accept_i     ( l1_m1_ar_accept[i]    ),
-      .l1_drop_i       ( l1_m1_ar_drop[i]      ),
-      .l1_save_i       ( l1_m1_ar_save[i]      ),
-      .l2_done_o       ( l2_m1_ar_done[i]      ),
-      .l2_accept_i     ( l2_m1_ar_accept[i]    ),
-      .l2_drop_i       ( l2_m1_ar_drop[i]      ),
-      .l2_sending_o    (                       ), // just helps to set axcache
-      .l1_araddr_i     ( int_rtrans_addr[i]    ),
-      .l2_araddr_i     ( l2_ar_addr[i]         ),
-      .s_axi4_arid     ( int_arid[i]           ),
-      .s_axi4_arvalid  ( int_m1_arvalid[i]     ),
-      .s_axi4_arready  ( int_m1_arready[i]     ),
-      .s_axi4_arlen    ( int_arlen[i]          ),
-      .s_axi4_arsize   ( int_arsize[i]         ),
-      .s_axi4_arburst  ( int_arburst[i]        ),
-      .s_axi4_arlock   ( int_arlock[i]         ),
-      .s_axi4_arprot   ( int_arprot[i]         ),
-      .s_axi4_arcache  ( int_arcache[i]        ),
-      .s_axi4_aruser   ( int_aruser[i]         ),
-      .m_axi4_arid     ( m1_axi4_arid[i]       ),
-      .m_axi4_araddr   ( m1_axi4_araddr[i]     ),
-      .m_axi4_arvalid  ( m1_axi4_arvalid[i]    ),
-      .m_axi4_arready  ( m1_axi4_arready[i]    ),
-      .m_axi4_arlen    ( m1_axi4_arlen[i]      ),
-      .m_axi4_arsize   ( m1_axi4_arsize[i]     ),
-      .m_axi4_arburst  ( m1_axi4_arburst[i]    ),
-      .m_axi4_arlock   ( m1_axi4_arlock[i]     ),
-      .m_axi4_arprot   ( m1_axi4_arprot[i]     ),
-      .m_axi4_arcache  (                       ),
-      .m_axi4_aruser   ( m1_axi4_aruser[i]     )
-    );
-
-    // ARCACHE of master 1 is generated here. With EN_ACP it depends only on whether the read is a
-    // burst (arlen != 0 and arburst != 2'b00); without EN_ACP it is statically 4'b0011.
-      assign m1_read_is_burst[i] = (m1_axi4_arlen[i] != {8{1'b0}}) && (m1_axi4_arburst[i] != 2'b00);
-    `ifdef EN_ACP
-      always_comb begin
-        if (m1_read_is_burst[i]) begin
-          m1_axi4_arcache[i]    = 4'b1011; // burst
-        end else begin
-          m1_axi4_arcache[i]    = 4'b1111; // non-burst
-        end
-      end
-    `else
-      assign m1_axi4_arcache[i] = 4'b0011; // NOTE(review): non-cacheable bufferable per AXI4 - confirm
-    `endif
-
-  // }}}
-
-  // Read Response channel (r) {{{
-  /*
-   * read response channel (r)
-   *
-   * âââââââ ââââââââ ââââââ âââââââ     âââââââ âââââââââââââââââââââââ
-   * ââââââââââââââââââââââââââââââââ    ââââââââââââââââââââââââââââââââ
-   * ââââââââââââââ  âââââââââââ  âââ    ââââââââââââââ  ââââââââââââââââ
-   * ââââââââââââââ  âââââââââââ  âââ    ââââââââââââââ  âââââââââââââââ
-   * âââ  ââââââââââââââ  âââââââââââ    âââ  ââââââââââââââââââââââ
-   * âââ  ââââââââââââââ  ââââââââââ     âââ  ââââââââââââââââââââââ
-   *
-   */
-  axi4_r_buffer
-    #(
-      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_r_buffer_m0
-    (
-      .axi4_aclk     ( Clk_CI            ),
-      .axi4_arstn    ( Rst_RBI           ),
-      .s_axi4_rid    ( int_m0_rid[i]     ),
-      .s_axi4_rresp  ( int_m0_rresp[i]   ),
-      .s_axi4_rdata  ( int_m0_rdata[i]   ),
-      .s_axi4_rlast  ( int_m0_rlast[i]   ),
-      .s_axi4_rvalid ( int_m0_rvalid[i]  ),
-      .s_axi4_ruser  ( int_m0_ruser[i]   ),
-      .s_axi4_rready ( int_m0_rready[i]  ),
-      .m_axi4_rid    ( m0_axi4_rid[i]    ),
-      .m_axi4_rresp  ( m0_axi4_rresp[i]  ),
-      .m_axi4_rdata  ( m0_axi4_rdata[i]  ),
-      .m_axi4_rlast  ( m0_axi4_rlast[i]  ),
-      .m_axi4_rvalid ( m0_axi4_rvalid[i] ),
-      .m_axi4_ruser  ( m0_axi4_ruser[i]  ),
-      .m_axi4_rready ( m0_axi4_rready[i] )
-    );
-
-  axi4_r_buffer
-    #(
-      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_r_buffer_m1
-    (
-      .axi4_aclk     ( Clk_CI            ),
-      .axi4_arstn    ( Rst_RBI           ),
-      .s_axi4_rid    ( int_m1_rid[i]     ),
-      .s_axi4_rresp  ( int_m1_rresp[i]   ),
-      .s_axi4_rdata  ( int_m1_rdata[i]   ),
-      .s_axi4_rlast  ( int_m1_rlast[i]   ),
-      .s_axi4_rvalid ( int_m1_rvalid[i]  ),
-      .s_axi4_ruser  ( int_m1_ruser[i]   ),
-      .s_axi4_rready ( int_m1_rready[i]  ),
-      .m_axi4_rid    ( m1_axi4_rid[i]    ),
-      .m_axi4_rresp  ( m1_axi4_rresp[i]  ),
-      .m_axi4_rdata  ( m1_axi4_rdata[i]  ),
-      .m_axi4_rlast  ( m1_axi4_rlast[i]  ),
-      .m_axi4_rvalid ( m1_axi4_rvalid[i] ),
-      .m_axi4_ruser  ( m1_axi4_ruser[i]  ),
-      .m_axi4_rready ( m1_axi4_rready[i] )
-    );
-
-  axi4_r_sender
-    #(
-      .AXI_DATA_WIDTH  ( AXI_DATA_WIDTH ),
-      .AXI_ID_WIDTH    ( AXI_ID_WIDTH   ),
-      .AXI_USER_WIDTH  ( AXI_USER_WIDTH )
-      )
-    u_r_sender
-    (
-      .axi4_aclk     ( Clk_CI              ),
-      .axi4_arstn    ( Rst_RBI             ),
-      .drop_i        ( lx_r_drop[i]        ),
-      .drop_len_i    ( lx_len_drop[i]      ),
-      .done_o        ( lx_r_done[i]        ),
-      .id_i          ( lx_id_drop[i]       ),
-      .prefetch_i    ( lx_prefetch_drop[i] ),
-      .hit_i         ( lx_hit_drop[i]      ),
-      .s_axi4_rid    ( s_axi4_rid[i]       ),
-      .s_axi4_rresp  ( s_axi4_rresp[i]     ),
-      .s_axi4_rdata  ( s_axi4_rdata[i]     ),
-      .s_axi4_rlast  ( s_axi4_rlast[i]     ),
-      .s_axi4_rvalid ( s_axi4_rvalid[i]    ),
-      .s_axi4_ruser  ( s_axi4_ruser[i]     ),
-      .s_axi4_rready ( s_axi4_rready[i]    ),
-      .m_axi4_rid    ( int_rid[i]          ),
-      .m_axi4_rresp  ( int_rresp[i]        ),
-      .m_axi4_rdata  ( int_rdata[i]        ),
-      .m_axi4_rlast  ( int_rlast[i]        ),
-      .m_axi4_rvalid ( int_rvalid[i]       ),
-      .m_axi4_ruser  ( int_ruser[i]        ),
-      .m_axi4_rready ( int_rready[i]       )
-    );
-
-  /*
-   * Multiplexer to switch between the two output master ports on the read response(r) channel
-   *
-   * Do not perform read burst interleaving as the DMA does not support it. This means we can only
-   * switch between the two masters upon sending rlast or when idle.
-   *
-   * However, if the downstream already performs burst interleaving, this cannot be undone here.
-   * Also, the downstream may interleave a burst response with a single-beat transaction. In this
-   * case, the FSM below falls out of the burst mode. To avoid it performing burst interleaving
-   * after such an event, it gives priority to the master which received the last burst in case
-   * both have a burst ready (rvalid).
-   *
-   * Order of priority:
-   * 1. Ongoing burst transaction
-   * 2. Single-beat transaction on Master 1.
-   * 3. Single-beat transaction on Master 0.
-   * 4. Burst transaction on master that received the last burst.
-   */
-  // Select register: master (0/1) chosen by the FSM for the current/last burst; reset to master 0
-  always_ff @(posedge Clk_CI) begin
-    if (Rst_RBI == 0) begin                 // active-low reset, sampled on the clock edge
-      RRespSel_SP[i] <= 1'b0;
-    end else begin
-      RRespSel_SP[i] <= RRespSel_SN[i];
-    end
-  end
-
-  // FSM: IDLE forwards single beats directly and starts bursts; BUSY holds the mux until rlast.
-  always_comb begin : RRespMuxFsm
-    RRespMuxCtrl_SN[i] = RRespMuxCtrl_SP[i]; // default: stay in the current state
-    RRespSel_SN[i]     = RRespSel_SP[i];     // default: keep the registered select
-
-    RRespBurst_S[i]    = 1'b0;               // set only while in BUSY
-    RRespSelIm_S[i]    = 1'b0;               // immediate select, defaults to master 0
-
-    unique case (RRespMuxCtrl_SP[i])
-
-      IDLE: begin
-        // immediately forward single-beat transactions (rvalid with rlast), master 1 first
-        if      (int_m1_rvalid[i] && int_m1_rlast[i])
-          RRespSelIm_S[i] = 1'b1;
-        else if (int_m0_rvalid[i] && int_m0_rlast[i])
-          RRespSelIm_S[i] = 1'b0;
-
-        // bursts - they also start immediately
-        else if (int_m1_rvalid[i] || int_m0_rvalid[i]) begin
-          RRespMuxCtrl_SN[i] = BUSY;
-
-          // in case both are ready, continue with the master that had the last burst
-          if    (int_m1_rvalid[i] && int_m0_rvalid[i]) begin
-            RRespSel_SN[i]  = RRespSel_SP[i];
-            RRespSelIm_S[i] = RRespSel_SP[i];
-          end else if (int_m1_rvalid[i]) begin
-            RRespSel_SN[i]  = 1'b1;          // only master 1 has data: remember and select it
-            RRespSelIm_S[i] = 1'b1;
-          end else begin
-            RRespSel_SN[i]  = 1'b0;          // only master 0 has data: remember and select it
-            RRespSelIm_S[i] = 1'b0;
-          end
-        end
-      end
-
-      BUSY: begin
-        RRespBurst_S[i] = 1'b1;              // multiplexer now follows RRespSel_SP
-        // detect last handshake of currently ongoing transfer
-        if (int_rvalid[i] && int_rready[i] && int_rlast[i])
-          RRespMuxCtrl_SN[i] = IDLE;
-      end
-
-      default: begin
-        RRespMuxCtrl_SN[i] = IDLE;           // recover from any unexpected state encoding
-      end
-
-    endcase
-  end
-
-  // FSM state register: holds IDLE/BUSY of the read response mux, reset to IDLE
-  always_ff @(posedge Clk_CI) begin
-    if (Rst_RBI == 0) begin                 // active-low reset, sampled on the clock edge
-      RRespMuxCtrl_SP[i] <= IDLE;
-    end else begin
-      RRespMuxCtrl_SP[i] <= RRespMuxCtrl_SN[i];
-    end
-  end
-
-  // Actual multiplexer: in a burst follow the registered select, otherwise the immediate select
-  always_comb begin
-    if ( (RRespBurst_S[i] && RRespSel_SP[i]) || (!RRespBurst_S[i] && RRespSelIm_S[i]) ) begin
-      int_m0_rready[i] = 1'b0;              // master 1 selected: stall master 0
-      int_m1_rready[i] = int_rready[i];
-
-      int_rid[i]    = int_m1_rid[i];
-      int_rresp[i]  = int_m1_rresp[i];
-      int_rdata[i]  = int_m1_rdata[i];
-      int_rlast[i]  = int_m1_rlast[i];
-      int_ruser[i]  = int_m1_ruser[i];
-      int_rvalid[i] = int_m1_rvalid[i];
-    end else begin
-      int_m0_rready[i] = int_rready[i];
-      int_m1_rready[i] = 1'b0;              // master 0 selected: stall master 1
-
-      int_rid[i]    = int_m0_rid[i];
-      int_rresp[i]  = int_m0_rresp[i];
-      int_rdata[i]  = int_m0_rdata[i];
-      int_rlast[i]  = int_m0_rlast[i];
-      int_ruser[i]  = int_m0_ruser[i];
-      int_rvalid[i] = int_m0_rvalid[i];
-    end
-  end
-
-  end // BUF & SEND
-
-  // }}}
-
-  endgenerate // BUF & SEND }}}
-
-  // Log {{{
-
-`ifdef RAB_AX_LOG_EN
-  AxiBramLogger
-    #(
-      .AXI_ID_BITW     ( AXI_ID_WIDTH        ),
-      .AXI_ADDR_BITW   ( AXI_S_ADDR_WIDTH    ),
-      .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
-    )
-    u_aw_logger
-    (
-      .Clk_CI          ( NonGatedClk_CI    ),
-      .TimestampClk_CI ( Clk_CI            ),
-      .Rst_RBI         ( Rst_RBI           ),
-      .AxiValid_SI     ( s_axi4_awvalid[1] ),
-      .AxiReady_SI     ( s_axi4_awready[1] ),
-      .AxiId_DI        ( s_axi4_awid[1]    ),
-      .AxiAddr_DI      ( s_axi4_awaddr[1]  ),
-      .AxiLen_DI       ( s_axi4_awlen[1]   ),
-      .Clear_SI        ( AwLogClr_SI       ),
-      .LogEn_SI        ( LogEn_SI          ),
-      .Full_SO         ( int_aw_log_full   ),
-      .Ready_SO        ( AwLogRdy_SO       ),
-      .Bram_PS         ( AwBram_PS         )
-    );
-
-  AxiBramLogger
-    #(
-      .AXI_ID_BITW     ( AXI_ID_WIDTH        ),
-      .AXI_ADDR_BITW   ( AXI_S_ADDR_WIDTH    ),
-      .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
-    )
-    u_ar_logger
-    (
-      .Clk_CI          ( NonGatedClk_CI    ),
-      .TimestampClk_CI ( Clk_CI            ),
-      .Rst_RBI         ( Rst_RBI           ),
-      .AxiValid_SI     ( s_axi4_arvalid[1] ),
-      .AxiReady_SI     ( s_axi4_arready[1] ),
-      .AxiId_DI        ( s_axi4_arid[1]    ),
-      .AxiAddr_DI      ( s_axi4_araddr[1]  ),
-      .AxiLen_DI       ( s_axi4_arlen[1]   ),
-      .Clear_SI        ( ArLogClr_SI       ),
-      .LogEn_SI        ( LogEn_SI          ),
-      .Full_SO         ( int_ar_log_full   ),
-      .Ready_SO        ( ArLogRdy_SO       ),
-      .Bram_PS         ( ArBram_PS         )
-    );
-`endif
-
-  // }}}
-
-  // RAB Core {{{
-  // âââââââ  ââââââ âââââââ      âââââââ âââââââ âââââââ ââââââââ
-  // ââââââââââââââââââââââââ    âââââââââââââââââââââââââââââââââ
-  // ââââââââââââââââââââââââ    âââ     âââ   âââââââââââââââââ
-  // ââââââââââââââââââââââââ    âââ     âââ   âââââââââââââââââ
-  // âââ  ââââââ  âââââââââââ    ââââââââââââââââââââ  âââââââââââ
-  // âââ  ââââââ  ââââââââââ      âââââââ âââââââ âââ  âââââââââââ
-  //
-  /*
-   * rab_core
-   *
-   * The rab core translates addresses. It has two ports, which can be used
-   * independently, however they will compete for time internally, as lookups
-   * are serialized.
-   *
-   * type is the read(0) or write(1) used to check the protection flags. If they
-   * don't match an interrupt is created on the int_prot line.
-   */
-
-  rab_core
-    #(
-      .N_PORTS             ( N_PORTS             ),
-      .N_L2_SETS           ( N_L2_SETS           ),
-      .N_L2_SET_ENTRIES    ( N_L2_SET_ENTRIES    ),
-      .AXI_DATA_WIDTH      ( AXI_DATA_WIDTH      ),
-      .AXI_S_ADDR_WIDTH    ( AXI_S_ADDR_WIDTH    ),
-      .AXI_M_ADDR_WIDTH    ( AXI_M_ADDR_WIDTH    ),
-      .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ),
-      .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ),
-      .AXI_ID_WIDTH        ( AXI_ID_WIDTH        ),
-      .AXI_USER_WIDTH      ( AXI_USER_WIDTH      ),
-      .MH_FIFO_DEPTH       ( MH_FIFO_DEPTH       )
-    )
-    u_rab_core
-    (
-      .Clk_CI               ( Clk_CI                     ),
-      .Rst_RBI              ( Rst_RBI                    ),
-
-      // Config IF
-      .s_axi_awaddr         ( s_axi4lite_awaddr          ),
-      .s_axi_awvalid        ( s_axi4lite_awvalid         ),
-      .s_axi_awready        ( s_axi4lite_awready         ),
-      .s_axi_wdata          ( s_axi4lite_wdata           ),
-      .s_axi_wstrb          ( s_axi4lite_wstrb           ),
-      .s_axi_wvalid         ( s_axi4lite_wvalid          ),
-      .s_axi_wready         ( s_axi4lite_wready          ),
-      .s_axi_bresp          ( s_axi4lite_bresp           ),
-      .s_axi_bvalid         ( s_axi4lite_bvalid          ),
-      .s_axi_bready         ( s_axi4lite_bready          ),
-      .s_axi_araddr         ( s_axi4lite_araddr          ),
-      .s_axi_arvalid        ( s_axi4lite_arvalid         ),
-      .s_axi_arready        ( s_axi4lite_arready         ),
-      .s_axi_rready         ( s_axi4lite_rready          ),
-      .s_axi_rdata          ( s_axi4lite_rdata           ),
-      .s_axi_rresp          ( s_axi4lite_rresp           ),
-      .s_axi_rvalid         ( s_axi4lite_rvalid          ),
-
-      // L1 miss info outputs -> L2 TLB arbitration
-      .int_miss             ( rab_miss                   ),
-      .int_multi            ( rab_multi                  ),
-      .int_prot             ( rab_prot                   ),
-      .int_prefetch         ( rab_prefetch               ),
-      .int_mhf_full         ( int_mhf_full               ),
-
-      // L1 transaction info outputs -> L2 TLB arbitration
-      .int_axaddr_o         ( L1OutAddr_D                ),
-      .int_axid_o           ( L1OutId_D                  ),
-      .int_axlen_o          ( L1OutLen_D                 ),
-      .int_axuser_o         ( L1OutUser_D                ),
-
-      // Write Req IF
-      .port1_addr           ( int_awaddr                 ),
-      .port1_id             ( int_awid                   ),
-      .port1_len            ( int_awlen                  ),
-      .port1_size           ( int_awsize                 ),
-      .port1_addr_valid     ( int_awvalid & ~aw_in_stall ), // avoid the FSM accepting new AW requests
-      .port1_type           ( {N_PORTS{1'b1}}            ),
-      .port1_user           ( int_awuser                 ),
-      .port1_sent           ( int_wtrans_sent            ), // signal done to L1 FSM
-      .port1_out_addr       ( int_wtrans_addr            ),
-      .port1_cache_coherent ( int_wtrans_cache_coherent  ),
-      .port1_accept         ( int_wtrans_accept          ),
-      .port1_drop           ( int_wtrans_drop            ),
-      .port1_miss           ( int_wtrans_miss            ),
-
-      // Read Req IF
-      .port2_addr           ( int_araddr                 ),
-      .port2_id             ( int_arid                   ),
-      .port2_len            ( int_arlen                  ),
-      .port2_size           ( int_arsize                 ),
-      .port2_addr_valid     ( int_arvalid                ),
-      .port2_type           ( {N_PORTS{1'b0}}            ),
-      .port2_user           ( int_aruser                 ),
-      .port2_sent           ( int_rtrans_sent            ), // signal done to L1 FSM
-      .port2_out_addr       ( int_rtrans_addr            ),
-      .port2_cache_coherent ( int_rtrans_cache_coherent  ),
-      .port2_accept         ( int_rtrans_accept          ),
-      .port2_drop           ( int_rtrans_drop            ),
-      .port2_miss           ( int_rtrans_miss            ),
-
-      // L2 miss info inputs -> axi_rab_cfg
-      .miss_l2_i            ( L2Miss_S                   ),
-      .miss_l2_addr_i       ( L2OutInAddr_DP             ),
-      .miss_l2_id_i         ( L2OutId_DP                 ),
-      .miss_l2_user_i       ( L2OutUser_DP               ),
-
-      // L2 config outputs
-      .wdata_l2_o           ( L2CfgWData_D               ),
-      .waddr_l2_o           ( L2CfgWAddr_D               ),
-      .wren_l2_o            ( L2CfgWE_S                  )
-    );
-
-  // }}}
-
-  // AX SPLITS {{{
-  //  ââââââ âââ  âââ    âââââââââââââââ âââ     ââââââââââââ
-  // ââââââââââââââââ    âââââââââââââââââââ     ââââââââââââ
-  // ââââââââ ââââââ     âââââââââââââââââââ     âââ   âââ
-  // ââââââââ ââââââ     âââââââââââââââ âââ     âââ   âââ
-  // âââ  âââââââ âââ    âââââââââââ     âââââââââââ   âââ
-  // âââ  ââââââ  âââ    âââââââââââ     âââââââââââ   âââ
-  //
-  /**
-   * Multiplex the two output master ports of the Read Address and Write Address (AR/AW) channels.
-   *
-   * Use the `int_xmaster_select` signal to route the signals to either Master 0 (to memory) or
-   * Master 1 (to ACP). In case of an L1 miss: Route the signals to both masters. They shall be
-   * saved until the L2 outputs are available.
-   */
-  generate for (i = 0; i < N_PORTS; i++) begin : AX_SPLIT
-
-    /*
-     * When accepting L1 transactions, we must just do so on the selected master. Drop requests
-     * need only one master (Master 0 is used below). Save requests must be performed by both.
-     */
-    always_comb begin : AW_L1_SPLIT
-
-      // TLB handshake
-      l1_m0_aw_accept[i] = 1'b0;
-      l1_m1_aw_accept[i] = 1'b0;
-      l1_m0_aw_drop[i]   = 1'b0;
-      l1_m1_aw_drop[i]   = 1'b0;            // never set in this block: drops go to Master 0
-      l1_m0_aw_save[i]   = 1'b0;
-      l1_m1_aw_save[i]   = 1'b0;
-
-      l1_mx_aw_done[i]   = 1'b0;
-
-      // AXI sender input handshake
-      int_m0_awvalid[i]  = 1'b0;
-      int_m1_awvalid[i]  = 1'b0;
-      int_awready[i]     = 1'b0;
-
-      // accept on selected master only
-      if (l1_aw_accept[i]) begin
-        if (int_wmaster_select[i]) begin
-          l1_m1_aw_accept[i] = 1'b1;
-          l1_mx_aw_done[i]   = l1_m1_aw_done[i];
-
-          int_m1_awvalid[i]  = int_awvalid[i];
-          int_awready[i]     = int_m1_awready[i];
-
-        end else begin
-          l1_m0_aw_accept[i] = 1'b1;
-          l1_mx_aw_done[i]   = l1_m0_aw_done[i];
-
-          int_m0_awvalid[i]  = int_awvalid[i];
-          int_awready[i]     = int_m0_awready[i];
-        end
-
-      // drop on Master 0 only
-      end else if (l1_aw_drop[i]) begin
-        l1_m0_aw_drop[i]     = 1'b1;
-        l1_mx_aw_done[i]     = l1_m0_aw_done[i];
-
-        int_m0_awvalid[i]    = int_awvalid[i];
-        int_awready[i]       = l1_m0_aw_done[i]; // request is consumed when the drop is done
-
-      // save on both masters
-      end else if (l1_aw_save[i]) begin
-        // split save: stop requesting from a master once its done flag is registered
-        l1_m0_aw_save[i]     = ~l1_m0_aw_done_SP[i];
-        l1_m1_aw_save[i]     = ~l1_m1_aw_done_SP[i];
-
-        // combine done: uses the registered flags, so both masters must have finished
-        l1_mx_aw_done[i]     = l1_m0_aw_done_SP[i] & l1_m1_aw_done_SP[i];
-
-        int_m0_awvalid[i]    = int_awvalid[i];
-        int_m1_awvalid[i]    = int_awvalid[i];
-        int_awready[i]       = l1_mx_aw_done[i]; // consumed only after both saves completed
-      end
-    end
-
-    // signal back to handshake splitter
-    assign l1_aw_done[i]     = l1_mx_aw_done[i];
-
-    always_ff @(posedge Clk_CI) begin : L1_MX_AW_DONE_REG
-      if (Rst_RBI == 0) begin               // active-low reset, sampled on the clock edge
-        l1_m0_aw_done_SP[i] <= 1'b0;
-        l1_m1_aw_done_SP[i] <= 1'b0;
-      end else if (l1_mx_aw_done[i]) begin  // combined done asserted: clear both flags
-        l1_m0_aw_done_SP[i] <= 1'b0;
-        l1_m1_aw_done_SP[i] <= 1'b0;
-      end else begin                        // sticky: remember each master's done until cleared
-        l1_m0_aw_done_SP[i] <= l1_m0_aw_done_SP[i] | l1_m0_aw_done[i];
-        l1_m1_aw_done_SP[i] <= l1_m1_aw_done_SP[i] | l1_m1_aw_done[i];
-      end
-    end
-
-    /*
-     * When accepting L2 transactions, we must drop the corresponding transaction from the other
-     * master to make it available again for save requests from L1_DROP_SAVE.
-     */
-    always_comb begin : AW_L2_SPLIT
-
-      l2_m0_aw_accept[i] = 1'b0;
-      l2_m1_aw_accept[i] = 1'b0;
-      l2_m0_aw_drop[i]   = 1'b0;
-      l2_m1_aw_drop[i]   = 1'b0;
-
-      // de-assert request signals individually upon handshakes
-      if (l2_aw_accept[i]) begin
-        if (l2_master_select[i]) begin       // accept on master 1, drop the copy on master 0
-          l2_m1_aw_accept[i] = ~l2_m1_aw_done_SP[i];
-          l2_m0_aw_drop[i]   = ~l2_m0_aw_done_SP[i];
-
-        end else begin                       // accept on master 0, drop the copy on master 1
-          l2_m0_aw_accept[i] = ~l2_m0_aw_done_SP[i];
-          l2_m1_aw_drop[i]   = ~l2_m1_aw_done_SP[i];
-
-        end
-      end else begin                         // no accept: forward a drop to masters not yet done
-        l2_m0_aw_drop[i]     = ~l2_m0_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
-        l2_m1_aw_drop[i]     = ~l2_m1_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
-
-      end
-
-      // combine done: both registered per-master flags must be set
-      l2_mx_aw_done[i] = l2_m0_aw_done_SP[i] & l2_m1_aw_done_SP[i];
-
-      l2_aw_done[i]    = l2_mx_aw_done[i];
-    end
-
-    always_ff @(posedge Clk_CI) begin : L2_MX_AW_DONE_REG
-      if (Rst_RBI == 0) begin
-        l2_m0_aw_done_SP[i] <= 1'b0;
-        l2_m1_aw_done_SP[i] <= 1'b0;
-      end else if (l2_mx_aw_done[i]) begin
-        l2_m0_aw_done_SP[i] <= 1'b0;
-        l2_m1_aw_done_SP[i] <= 1'b0;
-      end else begin
-        l2_m0_aw_done_SP[i] <= l2_m0_aw_done_SP[i] | l2_m0_aw_done[i];
-        l2_m1_aw_done_SP[i] <= l2_m1_aw_done_SP[i] | l2_m1_aw_done[i];
-      end
-    end
-
-    /*
-     * When accepting L1 transactions, we must just do so on the selected master. Drop requests must
-     * be performed on any one of the two masters. Save requests must be performed by both masters.
-     */
-    always_comb begin : AR_L1_SPLIT
-
-      // TLB handshake
-      l1_m0_ar_accept[i] = 1'b0;
-      l1_m1_ar_accept[i] = 1'b0;
-      l1_m0_ar_drop[i]   = 1'b0;
-      l1_m1_ar_drop[i]   = 1'b0;
-      l1_m0_ar_save[i]   = 1'b0;
-      l1_m1_ar_save[i]   = 1'b0;
-
-      l1_mx_ar_done[i]   = 1'b0;
-
-      // AXI sender input handshake
-      int_m0_arvalid[i]  = 1'b0;
-      int_m1_arvalid[i]  = 1'b0;
-      int_arready[i]     = 1'b0;
-
-      // accept on selected master only
-      if (l1_ar_accept[i]) begin
-        if (int_rmaster_select[i]) begin
-          l1_m1_ar_accept[i] = 1'b1;
-          l1_mx_ar_done[i]   = l1_m1_ar_done[i];
-
-          int_m1_arvalid[i]  = int_arvalid[i];
-          int_arready[i]     = int_m1_arready[i];
-
-        end else begin
-          l1_m0_ar_accept[i] = 1'b1;
-          l1_mx_ar_done[i]   = l1_m0_ar_done[i];
-
-          int_m0_arvalid[i]  = int_arvalid[i];
-          int_arready[i]     = int_m0_arready[i];
-        end
-
-      // drop on Master 0 only
-      end else if (l1_ar_drop[i]) begin
-        l1_m0_ar_drop[i]     = 1'b1;
-        l1_mx_ar_done[i]     = l1_m0_ar_done[i];
-
-        int_m0_arvalid[i]    = int_arvalid[i];
-        int_arready[i]       = l1_m0_ar_done[i];
-
-      // save on both masters
-      end else if (l1_ar_save[i]) begin
-        // split save
-        l1_m0_ar_save[i]     = ~l1_m0_ar_done_SP[i];
-        l1_m1_ar_save[i]     = ~l1_m1_ar_done_SP[i];
-
-        // combine done
-        l1_mx_ar_done[i]     = l1_m0_ar_done_SP[i] & l1_m1_ar_done_SP[i];
-
-        int_m0_arvalid[i]    = int_arvalid[i];
-        int_m1_arvalid[i]    = int_arvalid[i];
-        int_arready[i]       = l1_mx_ar_done[i];
-      end
-    end
-
-    // signal back to handshake splitter
-    assign l1_ar_done[i]     = l1_mx_ar_done[i];
-
-    always_ff @(posedge Clk_CI) begin : L1_MX_AR_DONE_REG
-      if (Rst_RBI == 0) begin
-        l1_m0_ar_done_SP[i] <= 1'b0;
-        l1_m1_ar_done_SP[i] <= 1'b0;
-      end else if (l1_mx_ar_done[i]) begin
-        l1_m0_ar_done_SP[i] <= 1'b0;
-        l1_m1_ar_done_SP[i] <= 1'b0;
-      end else begin
-        l1_m0_ar_done_SP[i] <= l1_m0_ar_done_SP[i] | l1_m0_ar_done[i];
-        l1_m1_ar_done_SP[i] <= l1_m1_ar_done_SP[i] | l1_m1_ar_done[i];
-      end
-    end
-
-    /*
-     * When accepting L2 transactions, we must drop the corresponding transaction from the other
-     * master to make it available again for save requests from L1_DROP_SAVE.
-     */
-    always_comb begin : AR_L2_SPLIT
-
-      l2_m0_ar_accept[i] = 1'b0;
-      l2_m1_ar_accept[i] = 1'b0;
-      l2_m0_ar_drop[i]   = 1'b0;
-      l2_m1_ar_drop[i]   = 1'b0;
-
-      // de-assert request signals individually upon handshakes
-      if (l2_ar_accept[i]) begin
-        if (l2_master_select[i]) begin
-          l2_m1_ar_accept[i] = ~l2_m1_ar_done_SP[i];
-          l2_m0_ar_drop[i]   = ~l2_m0_ar_done_SP[i];
-
-        end else begin
-          l2_m0_ar_accept[i] = ~l2_m0_ar_done_SP[i];
-          l2_m1_ar_drop[i]   = ~l2_m1_ar_done_SP[i];
-
-        end
-      end else if (l2_ar_drop[i]) begin
-        l2_m0_ar_drop[i]     = ~l2_m0_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
-        l2_m1_ar_drop[i]     = ~l2_m1_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
-
-      end
-
-      // combine done
-      l2_mx_ar_done[i] = l2_m0_ar_done_SP[i] & l2_m1_ar_done_SP[i];
-
-      l2_ar_done[i]    = l2_mx_ar_done[i];
-    end
-
-    always_ff @(posedge Clk_CI) begin : L2_MX_AR_DONE_REG
-      if (Rst_RBI == 0) begin
-        l2_m0_ar_done_SP[i] <= 1'b0;
-        l2_m1_ar_done_SP[i] <= 1'b0;
-      end else if (l2_mx_ar_done[i]) begin
-        l2_m0_ar_done_SP[i] <= 1'b0;
-        l2_m1_ar_done_SP[i] <= 1'b0;
-      end else begin
-        l2_m0_ar_done_SP[i] <= l2_m0_ar_done_SP[i] | l2_m0_ar_done[i];
-        l2_m1_ar_done_SP[i] <= l2_m1_ar_done_SP[i] | l2_m1_ar_done[i];
-      end
-    end
-
-  end // AX_SPLIT
-  endgenerate // AX_SPLIT
-
-  // }}}
-
-  // HANDSHAKE SPLITS {{{
-  // ██╗  ██╗███████╗    ███████╗██████╗ ██╗     ██╗████████╗
-  // ██║  ██║██╔════╝    ██╔════╝██╔══██╗██║     ██║╚══██╔══╝
-  // ███████║███████╗    ███████╗██████╔╝██║     ██║   ██║
-  // ██╔══██║╚════██║    ╚════██║██╔═══╝ ██║     ██║   ██║
-  // ██║  ██║███████║    ███████║██║     ███████╗██║   ██║
-  // ╚═╝  ╚═╝╚══════╝    ╚══════╝╚═╝     ╚══════╝╚═╝   ╚═╝
-  //
-  /*
-   * We need to perform combined handshakes with multiple AXI modules
-   * upon transactions drops, accepts, saves etc. from two TLBs.
-   */
-  generate for (i = 0; i < N_PORTS; i++) begin : HANDSHAKE_SPLIT
-
-    assign l1_xw_accept[i]    = int_wtrans_accept[i] & ~aw_out_stall[i];
-    assign int_wtrans_sent[i] = l1_xw_done[i];
-
-    assign l1_ar_accept[i]    = int_rtrans_accept[i];
-    assign int_rtrans_sent[i] = l1_ar_done[i];
-
-    /*
-     * L1 AW sender + W buffer handshake split
-     */
-    // forward
-    assign l1_aw_accept[i] = l1_xw_accept[i] & ~l1_aw_done_SP[i];
-    assign l1_w_accept[i]  = l1_xw_accept[i] & ~l1_w_done_SP[i];
-
-    assign l1_aw_save[i]   = l1_xw_save[i]   & ~l1_aw_done_SP[i];
-    assign l1_w_save[i]    = l1_xw_save[i]   & ~l1_w_done_SP[i];
-
-    assign l1_aw_drop[i]   = l1_xw_drop[i]   & ~l1_aw_done_SP[i];
-    assign l1_w_drop[i]    = l1_xw_drop[i]   & ~l1_w_done_SP[i];
-
-    // backward
-    assign l1_xw_done[i]   = l1_aw_done_SP[i] & l1_w_done_SP[i];
-
-    always_ff @(posedge Clk_CI) begin : L1_XW_HS_SPLIT
-      if (Rst_RBI == 0) begin
-        l1_aw_done_SP[i] <= 1'b0;
-        l1_w_done_SP[i]  <= 1'b0;
-      end else if (l1_xw_done[i]) begin
-        l1_aw_done_SP[i] <= 1'b0;
-        l1_w_done_SP[i]  <= 1'b0;
-      end else begin
-        l1_aw_done_SP[i] <= l1_aw_done_SP[i] | l1_aw_done[i];
-        l1_w_done_SP[i]  <= l1_w_done_SP[i]  | l1_w_done[i];
-      end
-    end
-
-    if (ENABLE_L2TLB[i] == 1) begin : L2_HS_SPLIT
-
-      /*
-       * L1 AR sender + R sender handshake split
-       *
-       * AR and R do not need to be strictly in sync. We thus use separate handshakes.
-       * But the handshake signals for the R sender are multiplexed with those for
-       * the L2. However, L2_ACCEPT_DROP_SAVE always has higher priority.
-       */
-      assign lx_r_drop[i] = l2_r_drop[i] | l1_r_drop[i];
-      assign l1_r_done[i] = l2_r_drop[i] ? 1'b0         : lx_r_done[i];
-      assign l2_r_done[i] = l2_r_drop[i] ? lx_r_done[i] : 1'b0;
-
-      /*
-       * L2 AW sender + W buffer handshake split
-       */
-      // forward
-      assign l2_aw_accept[i] = l2_xw_accept[i] & ~l2_aw_done_SP[i];
-      assign l2_w_accept[i]  = l2_xw_accept[i] & ~l2_w_done_SP[i];
-
-      assign l2_aw_drop[i]   = l2_xw_drop[i]   & ~l2_aw_done_SP[i];
-      assign l2_w_drop[i]    = l2_xw_drop[i]   & ~l2_w_done_SP[i];
-
-      // backward
-      assign l2_xw_done[i]   = l2_aw_done_SP[i] & l2_w_done_SP[i];
-
-      always_ff @(posedge Clk_CI) begin : L2_XW_HS_SPLIT
-        if (Rst_RBI == 0) begin
-          l2_aw_done_SP[i] <= 1'b0;
-          l2_w_done_SP[i]  <= 1'b0;
-        end else if (l2_xw_done[i]) begin
-          l2_aw_done_SP[i] <= 1'b0;
-          l2_w_done_SP[i]  <= 1'b0;
-        end else begin
-          l2_aw_done_SP[i] <= l2_aw_done_SP[i] | l2_aw_done[i];
-          l2_w_done_SP[i]  <= l2_w_done_SP[i]  | l2_w_done[i];
-        end
-      end
-
-      /*
-       * L2 AR + R sender handshake split
-       */
-      // forward
-      assign l2_ar_drop[i]   = l2_xr_drop[i]   & ~l2_ar_done_SP[i];
-      assign l2_r_drop[i]    = l2_xr_drop[i]   & ~l2_r_done_SP[i];
-
-      // backward - make sure to always clear L2_XR_HS_SPLIT
-      always_comb begin
-        if (l2_xr_drop[i]) begin
-          l2_xr_done[i]      = l2_ar_done_SP[i] & l2_r_done_SP[i];
-        end else begin
-          l2_xr_done[i]      = l2_ar_done_SP[i];
-        end
-      end
-
-      always_ff @(posedge Clk_CI) begin : L2_XR_HS_SPLIT
-        if (Rst_RBI == 0) begin
-          l2_ar_done_SP[i] <= 1'b0;
-          l2_r_done_SP[i]  <= 1'b0;
-        end else if (l2_xr_done[i]) begin
-          l2_ar_done_SP[i] <= 1'b0;
-          l2_r_done_SP[i]  <= 1'b0;
-        end else begin
-          l2_ar_done_SP[i] <= l2_ar_done_SP[i] | l2_ar_done[i];
-          l2_r_done_SP[i]  <= l2_r_done_SP[i]  | l2_r_done[i];
-        end
-      end
-
-    end else begin // if (ENABLE_L2TLB[i] == 1)
-
-      assign lx_r_drop[i]     = l1_r_drop[i];
-      assign l1_r_done[i]     = lx_r_done[i];
-
-      assign l2_aw_accept[i]  = 1'b0;
-      assign l2_w_accept[i]   = 1'b0;
-      assign l2_aw_drop[i]    = 1'b0;
-      assign l2_w_drop[i]     = 1'b0;
-      assign l2_xw_done[i]    = 1'b0;
-      assign l2_aw_done_SP[i] = 1'b0;
-      assign l2_w_done_SP[i]  = 1'b0;
-
-      assign l2_ar_accept[i]  = 1'b0;
-      assign l2_ar_drop[i]    = 1'b0;
-      assign l2_r_drop[i]     = 1'b0;
-      assign l2_xr_done[i]    = 1'b0;
-      assign l2_r_done[i]     = 1'b0;
-      assign l2_ar_done_SP[i] = 1'b0;
-      assign l2_r_done_SP[i]  = 1'b0;
-
-    end // if (ENABLE_L2TLB[i] == 1)
-
-  end // HANDSHAKE_SPLIT
-  endgenerate // HANDSHAKE_SPLIT
-
-  // }}}
-
-  // L2 TLB {{{
-  // ██╗     ██████╗     ████████╗██╗     ██████╗
-  // ██║     ╚════██╗    ╚══██╔══╝██║     ██╔══██╗
-  // ██║      █████╔╝       ██║   ██║     ██████╔╝
-  // ██║     ██╔═══╝        ██║   ██║     ██╔══██╗
-  // ███████╗███████╗       ██║   ███████╗██████╔╝
-  // ╚══════╝╚══════╝       ╚═╝   ╚══════╝╚═════╝
-  //
-  /*
-   * l2_tlb
-   *
-   * The L2 TLB translates addresses upon misses in the L1 TLB (rab_core).
-   *
-   * It supports one ongoing translation at a time. If an L1 miss occurs while the L2 is busy,
-   * the L1 is stalled until the L2 is available again.
-   *
-   */
-  generate for (i = 0; i < N_PORTS; i++) begin : L2_TLB
-    if (ENABLE_L2TLB[i] == 1) begin : L2_TLB
-
-      /*
-       * L1 output selector
-       */
-      assign L1OutRwType_D[i] = int_wtrans_drop[i] ? 1'b1 : 1'b0;
-      assign L1OutProt_D[i]   = rab_prot[i];
-      assign L1OutMulti_D[i]  = rab_multi[i];
-
-      /*
-       * L1 output control + L1_DROP_BUF, L2_IN_BUF management
-       *
-       * Forward the L1 drop request to AR/AW sender modules if
-       * 1. the transaction needs to be dropped (L1 multi, prot, prefetch), or
-       * 2. if a lookup in the L2 TLB is required (L1 miss) and the input buffer is not full.
-       *
-       * The AR/AW senders do not support more than 1 outstanding L1 miss. The push back towards
-       * the upstream is realized by not accepting the save request (saving the L1 transaction)
-       * in the senders as long as the L2 TLB is busy or has valid output. This ultimately
-       * blocks the L1 TLB.
-       *
-       * Together with the AW drop/save, we also perform the W drop/save as AW and W need to
-       * absolutely remain in order. In contrast, the R drop is performed by L2_ACCEPT_DROP_SAVE.
-       */
-      always_comb begin : L1_DROP_SAVE
-
-        l1_ar_drop[i]       = 1'b0;
-        l1_ar_save[i]       = 1'b0;
-        l1_xw_drop[i]       = 1'b0;
-        l1_xw_save[i]       = 1'b0;
-
-        l1_id_drop[i]       = L1OutId_D[i];
-        l1_len_drop[i]      = L1OutLen_D[i];
-        l1_prefetch_drop[i] = rab_prefetch[i];
-        l1_hit_drop[i]      = 1'b1; // there are no drops for L1 misses
-
-        L1DropEn_S[i]       = 1'b0;
-        L2InEn_S[i]         = 1'b0;
-
-        if ( rab_prot[i] | rab_multi[i] | rab_prefetch[i] ) begin
-          // 1. Drop
-          l1_ar_drop[i] = int_rtrans_drop[i] & ~L1DropValid_SP[i];
-          l1_xw_drop[i] = int_wtrans_drop[i] & ~L1DropValid_SP[i];
-
-          // Store to L1_DROP_BUF upon handshake
-          L1DropEn_S[i] = (l1_ar_drop[i] & l1_ar_done[i]) |
-                          (l1_xw_drop[i] & l1_xw_done[i]);
-
-        end else if ( rab_miss[i] ) begin
-          // 2. Save - Make sure L2 is really available.
-          l1_ar_save[i] = int_rtrans_drop[i] & ~L2Busy_S[i];
-          l1_xw_save[i] = int_wtrans_drop[i] & ~L2Busy_S[i];
-
-          // Store to L2_IN_BUF upon handshake - triggers the L2 TLB
-          L2InEn_S[i]   = (l1_ar_save[i] & l1_ar_done[i]) |
-                          (l1_xw_save[i] & l1_xw_done[i]);
-        end
-      end
-
-      /*
-       * L2 output control + L2_OUT_BUF management + R/B sender control + W buffer control
-       *
-       * Perform L1 R transaction drops unless the L2 output buffer holds valid data. The AXI specs
-       * require the B response to be sent only after consuming/discarding the corresponding data
-       * in the W channel. Thus, we only send L2 drop request to the W buffer here. The drop
-       * request to the B sender is then sent by the W buffer autonomously.
-       *
-       * L1 AW/W drop requests are managed by L1_DROP_SAVE.
-       */
-      always_comb begin : L2_ACCEPT_DROP_SAVE
-
-        l2_ar_addr[i]       =  'b0;
-        l2_aw_addr[i]       =  'b0;
-        l2_ar_accept[i]     = 1'b0;
-        l2_xr_drop[i]       = 1'b0;
-        l2_xw_accept[i]     = 1'b0;
-        l2_xw_drop[i]       = 1'b0;
-
-        l1_r_drop[i]        = 1'b0;
-
-        lx_id_drop[i]       =  'b0;
-        lx_len_drop[i]      =  'b0;
-        lx_prefetch_drop[i] = 1'b0;
-        lx_hit_drop[i]      = 1'b0;
-
-        L1DropValid_SN[i]   = L1DropValid_SP[i] | L1DropEn_S[i];
-        L2OutValid_SN[i]    = L2OutValid_SP[i];
-        L2OutReady_S[i]     = 1'b0;
-        L2OutEn_S[i]        = 1'b0;
-
-        L2Miss_S[i]         = 1'b0;
-        int_multi[i]        = 1'b0;
-        int_prot[i]         = 1'b0;
-
-        if (L2OutValid_SP[i] == 1'b0) begin
-
-          // Drop L1 from R senders
-          if (L1DropValid_SP[i] == 1'b1) begin
-
-            // Only perform the R sender drop here.
-            if (~L1DropRwType_DP[i]) begin
-
-              l1_r_drop[i]        = 1'b1;
-              lx_id_drop[i]       = L1DropId_DP[i];
-              lx_len_drop[i]      = L1DropLen_DP[i];
-              lx_prefetch_drop[i] = L1DropPrefetch_S[i];
-              lx_hit_drop[i]      = 1'b1; // there are no drops for L1 misses
-
-              // Invalidate L1_DROP_BUF upon handshake
-              if ( l1_r_drop[i] & l1_r_done[i] ) begin
-
-                L1DropValid_SN[i] = 1'b0;
-                int_prot[i]       = L1DropProt_DP[i];
-                int_multi[i]      = L1DropMulti_DP[i];
-              end
-
-            end else begin
-              // Invalidate L1_DROP_BUF
-              L1DropValid_SN[i]   = 1'b0;
-              int_prot[i]         = L1DropProt_DP[i];
-              int_multi[i]        = L1DropMulti_DP[i];
-            end
-          end
-
-        end else begin // L2_OUT_BUF has valid data
-
-          if ( L2OutHit_SP[i] & ~(L2OutPrefetch_S[i] | L2OutProt_SP[i] | L2OutMulti_SP[i]) ) begin
-
-            l2_ar_addr[i]       = L2OutAddr_DP[i];
-            l2_aw_addr[i]       = L2OutAddr_DP[i];
-
-            l2_ar_accept[i]     = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
-            l2_xw_accept[i]     = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
-
-            // Invalidate L2_OUT_BUF upon handshake
-            L2OutValid_SN[i] = ~( (l2_ar_accept[i] & l2_ar_done[i]) |
-                                  (l2_xw_accept[i] & l2_xw_done[i]) );
-          end else begin
-
-            lx_id_drop[i]       = L2OutId_DP[i];
-            lx_len_drop[i]      = L2OutLen_DP[i];
-            lx_prefetch_drop[i] = L2OutPrefetch_S[i];
-            lx_hit_drop[i]      = L2OutHit_SP[i];
-
-            // The l2_xr_drop will also perform the handshake with the R sender
-            l2_xr_drop[i]       = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
-            l2_xw_drop[i]       = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
-
-            // Invalidate L2_OUT_BUF upon handshake
-            if ( (l2_xr_drop[i] & l2_xr_done[i]) | (l2_xw_drop[i] & l2_xw_done[i]) ) begin
-
-              L2OutValid_SN[i]  = 1'b0;
-              L2Miss_S[i]       = ~L2OutHit_SP[i];
-              int_prot[i]       = L2OutProt_SP[i];
-              int_multi[i]      = L2OutMulti_SP[i];
-            end
-          end
-        end
-
-        // Only accept new L2 output after ongoing drops have finished.
-        if ( (l2_xr_drop[i] == l2_xr_done[i]) &
-             (l2_xw_drop[i] == l2_xw_done[i]) &
-             (l1_r_drop[i]  == l1_r_done[i] ) ) begin
-          // Store to L2_OUT_BUF upon handshake with L2 TLB module
-          if ( (L2OutValid_SP[i] == 1'b0) && (L2OutValid_S[i] == 1'b1) ) begin
-            L2OutValid_SN[i]   = 1'b1;
-            L2OutReady_S[i]    = 1'b1;
-            L2OutEn_S[i]       = 1'b1;
-          end
-        end
-      end
-
-      /*
-       * L1 drop buffer
-       *
-       * Used in case of multi, prot and prefetch hits in the L1 TLB.
-       */
-      always_ff @(posedge Clk_CI) begin : L1_DROP_BUF
-         if (Rst_RBI == 0) begin
-            L1DropProt_DP[i]   <= 1'b0;
-            L1DropMulti_DP[i]  <= 1'b0;
-            L1DropRwType_DP[i] <= 1'b0;
-            L1DropUser_DP[i]   <=  'b0;
-            L1DropId_DP[i]     <=  'b0;
-            L1DropLen_DP[i]    <=  'b0;
-            L1DropAddr_DP[i]   <=  'b0;
-         end else if (L1DropEn_S[i] == 1'b1) begin
-            L1DropProt_DP[i]   <= L1OutProt_D[i]  ;
-            L1DropMulti_DP[i]  <= L1OutMulti_D[i] ;
-            L1DropRwType_DP[i] <= L1OutRwType_D[i];
-            L1DropUser_DP[i]   <= L1OutUser_D[i]  ;
-            L1DropId_DP[i]     <= L1OutId_D[i]    ;
-            L1DropLen_DP[i]    <= L1OutLen_D[i]   ;
-            L1DropAddr_DP[i]   <= L1OutAddr_D[i]  ;
-         end
-      end // always_ff @ (posedge Clk_CI)
-
-      /*
-       * L2 input buffer
-       *
-       * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
-       */
-      always_ff @(posedge Clk_CI) begin : L2_IN_BUF
-         if (Rst_RBI == 0) begin
-            L2InRwType_DP[i] <= 1'b0;
-            L2InUser_DP[i]   <=  'b0;
-            L2InId_DP[i]     <=  'b0;
-            L2InLen_DP[i]    <=  'b0;
-            L2InAddr_DP[i]   <=  'b0;
-         end else if (L2InEn_S[i] == 1'b1) begin
-            L2InRwType_DP[i] <= L1OutRwType_D[i];
-            L2InUser_DP[i]   <= L1OutUser_D[i]  ;
-            L2InId_DP[i]     <= L1OutId_D[i]    ;
-            L2InLen_DP[i]    <= L1OutLen_D[i]   ;
-            L2InAddr_DP[i]   <= L1OutAddr_D[i]  ;
-         end
-      end // always_ff @ (posedge Clk_CI)
-
-      l2_tlb
-        #(
-          .AXI_S_ADDR_WIDTH       ( AXI_S_ADDR_WIDTH                                    ),
-          .AXI_M_ADDR_WIDTH       ( AXI_M_ADDR_WIDTH                                    ),
-          .AXI_LITE_DATA_WIDTH    ( AXI_LITE_DATA_WIDTH                                 ),
-          .AXI_LITE_ADDR_WIDTH    ( AXI_LITE_ADDR_WIDTH                                 ),
-          .N_SETS                 ( `RAB_L2_N_SETS                                      ),
-          .N_OFFSETS              ( `RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS       ),
-          .N_PAR_VA_RAMS          ( `RAB_L2_N_PAR_VA_RAMS                               ),
-          .HIT_OFFSET_STORE_WIDTH ( log2(`RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS) )
-          )
-      u_l2_tlb
-        (
-          .clk_i              ( Clk_CI           ),
-          .rst_ni             ( Rst_RBI          ),
-
-          // Config inputs
-          .we_i               ( L2CfgWE_S[i]     ),
-          .waddr_i            ( L2CfgWAddr_D[i]  ),
-          .wdata_i            ( L2CfgWData_D[i]  ),
-
-          // Request input
-          .start_i            ( L2InEn_S[i]      ),
-          .busy_o             ( L2Busy_S[i]      ),
-          .rw_type_i          ( L2InRwType_DP[i] ),
-          .in_addr_i          ( L2InAddr_DP[i]   ),
-
-          // Response output
-          .out_ready_i        ( L2OutReady_S[i]  ),
-          .out_valid_o        ( L2OutValid_S[i]  ),
-          .hit_o              ( L2OutHit_SN[i]   ),
-          .miss_o             ( L2OutMiss_SN[i]  ),
-          .prot_o             ( L2OutProt_SN[i]  ),
-          .multi_o            ( L2OutMulti_SN[i] ),
-          .cache_coherent_o   ( L2OutCC_SN[i]    ),
-          .out_addr_o         ( L2OutAddr_DN[i]  )
-        );
-
-      /*
-       * L2 output buffer
-       *
-       * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
-       */
-      always_ff @(posedge Clk_CI) begin : L2_OUT_BUF
-         if (Rst_RBI == 0) begin
-            L2OutRwType_DP[i] <= 1'b0;
-            L2OutUser_DP[i]   <=  'b0;
-            L2OutLen_DP[i]    <=  'b0;
-            L2OutId_DP[i]     <=  'b0;
-            L2OutInAddr_DP[i] <=  'b0;
-
-            L2OutHit_SP[i]    <= 1'b0;
-            L2OutMiss_SP[i]   <= 1'b0;
-            L2OutProt_SP[i]   <= 1'b0;
-            L2OutMulti_SP[i]  <= 1'b0;
-            L2OutCC_SP[i]     <= 1'b0;
-            L2OutAddr_DP[i]   <=  'b0;
-         end else if (L2OutEn_S[i] == 1'b1) begin
-            L2OutRwType_DP[i] <= L2InRwType_DP[i];
-            L2OutUser_DP[i]   <= L2InUser_DP[i]  ;
-            L2OutLen_DP[i]    <= L2InLen_DP[i]   ;
-            L2OutId_DP[i]     <= L2InId_DP[i]    ;
-            L2OutInAddr_DP[i] <= L2InAddr_DP[i]  ;
-
-            L2OutHit_SP[i]    <= L2OutHit_SN[i]  ;
-            L2OutMiss_SP[i]   <= L2OutMiss_SN[i] ;
-            L2OutProt_SP[i]   <= L2OutProt_SN[i] ;
-            L2OutMulti_SP[i]  <= L2OutMulti_SN[i];
-            L2OutCC_SP[i]     <= L2OutCC_SN[i]   ;
-            L2OutAddr_DP[i]   <= L2OutAddr_DN[i] ;
-         end
-      end // always_ff @ (posedge Clk_CI)
-
-      always_ff @(posedge Clk_CI) begin : BUF_VALID
-        if (Rst_RBI == 0) begin
-          L1DropValid_SP[i] = 1'b0;
-          L2OutValid_SP[i]  = 1'b0;
-        end else begin
-          L1DropValid_SP[i] = L1DropValid_SN[i];
-          L2OutValid_SP[i]  = L2OutValid_SN[i];
-        end
-      end
-
-      always_comb begin : BUF_TO_PREFETCH
-        // L1 Drop Buf
-        if (L1DropUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
-          L1DropPrefetch_S[i] = 1'b1;
-        else
-          L1DropPrefetch_S[i] = 1'b0;
-
-        // L2 Out Buf
-        if (L2OutUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
-          L2OutPrefetch_S[i]  = 1'b1;
-        else
-          L2OutPrefetch_S[i]  = 1'b0;
-      end
-
-      assign l2_cache_coherent[i] = L2OutCC_SP[i];
-      assign int_miss[i]          = L2Miss_S[i];
-
-    end else begin : L2_TLB_STUB // if (ENABLE_L2TLB[i] == 1)
-
-      assign l1_ar_drop[i]        = int_rtrans_drop[i];
-      assign l1_r_drop[i]         = int_rtrans_drop[i];
-      assign l1_xw_drop[i]        = int_wtrans_drop[i];
-
-      assign l1_ar_save[i]        = 1'b0;
-      assign l1_xw_save[i]        = 1'b0;
-      assign l2_xw_accept[i]      = 1'b0;
-      assign l2_xr_drop[i]        = 1'b0;
-      assign l2_xw_drop[i]        = 1'b0;
-
-      assign l2_ar_addr[i]        =  'b0;
-      assign l2_aw_addr[i]        =  'b0;
-
-      assign l1_id_drop[i]        = int_wtrans_drop[i] ? int_awid[i] :
-                                    int_rtrans_drop[i] ? int_arid[i] :
-                                    '0;
-      assign l1_len_drop[i]       = int_wtrans_drop[i] ? int_awlen[i] :
-                                    int_rtrans_drop[i] ? int_arlen[i] :
-                                    '0;
-      assign l1_prefetch_drop[i]  = rab_prefetch[i];
-      assign l1_hit_drop[i]       = ~rab_miss[i];
-
-      assign lx_id_drop[i]        = int_wtrans_drop[i] ? int_awid[i] :
-                                    int_rtrans_drop[i] ? int_arid[i] :
-                                    '0;
-      assign lx_len_drop[i]       = int_wtrans_drop[i] ? int_awlen[i] :
-                                    int_rtrans_drop[i] ? int_arlen[i] :
-                                    '0;
-      assign lx_prefetch_drop[i]  = rab_prefetch[i];
-      assign lx_hit_drop[i]       = ~rab_miss[i];
-
-      assign l2_cache_coherent[i] = 1'b0;
-
-      assign int_miss[i]          = rab_miss[i];
-      assign int_prot[i]          = rab_prot[i];
-      assign int_multi[i]         = rab_multi[i];
-
-      // unused signals
-      assign L2Miss_S[i]          = 1'b0;
-
-      assign L1OutRwType_D[i]     = 1'b0;
-      assign L1OutProt_D[i]       = 1'b0;
-      assign L1OutMulti_D[i]      = 1'b0;
-
-      assign L1DropRwType_DP[i]   = 1'b0;
-      assign L1DropUser_DP[i]     =  'b0;
-      assign L1DropId_DP[i]       =  'b0;
-      assign L1DropLen_DP[i]      =  'b0;
-      assign L1DropAddr_DP[i]     =  'b0;
-      assign L1DropProt_DP[i]     = 1'b0;
-      assign L1DropMulti_DP[i]    = 1'b0;
-
-      assign L1DropEn_S[i]        = 1'b0;
-      assign L1DropPrefetch_S[i]  = 1'b0;
-      assign L1DropValid_SN[i]    = 1'b0;
-      assign L1DropValid_SP[i]    = 1'b0;
-
-      assign L2InRwType_DP[i]     = 1'b0;
-      assign L2InUser_DP[i]       =  'b0;
-      assign L2InId_DP[i]         =  'b0;
-      assign L2InLen_DP[i]        =  'b0;
-      assign L2InAddr_DP[i]       =  'b0;
-
-      assign L2InEn_S[i]          = 1'b0;
-
-      assign L2OutHit_SN[i]       = 1'b0;
-      assign L2OutMiss_SN[i]      = 1'b0;
-      assign L2OutProt_SN[i]      = 1'b0;
-      assign L2OutMulti_SN[i]     = 1'b0;
-      assign L2OutCC_SN[i]        = 1'b0;
-      assign L2OutAddr_DN[i]      =  'b0;
-
-      assign L2OutRwType_DP[i]    = 1'b0;
-      assign L2OutUser_DP[i]      =  'b0;
-      assign L2OutId_DP[i]        =  'b0;
-      assign L2OutLen_DP[i]       =  'b0;
-      assign L2OutInAddr_DP[i]    =  'b0;
-      assign L2OutHit_SP[i]       = 1'b0;
-      assign L2OutMiss_SP[i]      = 1'b0;
-      assign L2OutProt_SP[i]      = 1'b0;
-      assign L2OutMulti_SP[i]     = 1'b0;
-      assign L2OutCC_SP[i]        = 1'b0;
-      assign L2OutAddr_DP[i]      =  'b0;
-
-      assign L2OutEn_S[i]         = 1'b0;
-      assign L2OutPrefetch_S[i]   = 1'b0;
-      assign L2Busy_S[i]          = 1'b0;
-      assign L2OutValid_S[i]      = 1'b0;
-      assign L2OutValid_SN[i]     = 1'b0;
-      assign L2OutValid_SP[i]     = 1'b0;
-      assign L2OutReady_S[i]      = 1'b0;
-
-    end // !`ifdef ENABLE_L2TLB
-  end // for (i = 0; i < N_PORTS; i++)
-  endgenerate
-
-// }}}
-"""
-# endmodule
-#
-#
-# // vim: ts=2 sw=2 sts=2 et nosmartindent autoindent foldmethod=marker
-#
-#
diff --git a/src/iommu/axi_rab/check_ram.py b/src/iommu/axi_rab/check_ram.py
deleted file mode 100644
index 31bf32ea..00000000
--- a/src/iommu/axi_rab/check_ram.py
+++ /dev/null
@@ -1,240 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class check_ram(Elaboratable):
-
-    def __init__(self):
-        self.clk_i = Signal()  # input
-        self.rst_ni = Signal()  # input
-        self.in_addr = Signal(ADDR_WIDTH)  # input
-        self.rw_type = Signal()  # input
-        self.ram_we = Signal()  # input
-        self.port0_addr = Signal(1+ERROR p_expression_25)  # input
-        self.port1_addr = Signal(1+ERROR p_expression_25)  # input
-        self.ram_wdata = Signal(RAM_DATA_WIDTH)  # input
-        self.output_sent = Signal()  # input
-        self.output_valid = Signal()  # input
-        self.offset_addr_d = Signal(OFFSET_WIDTH)  # input
-        self.hit_addr = Signal(1+ERROR p_expression_25)  # output
-        self.master = Signal()  # output
-        self.hit = Signal()  # output
-        self.multi_hit = Signal()  # output
-        self.prot = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //import CfMath::log2;
-#
-# //`define MULTI_HIT_FULL_SET
-#
-# module check_ram
-#  //#(
-#  //  parameter ADDR_WIDTH     = 32,
-#   // parameter RAM_DATA_WIDTH = 32,
-#   // parameter PAGE_SIZE      = 4096, // 4kB
-#  //  parameter SET_WIDTH      = 5,
-#   // parameter OFFSET_WIDTH   = 4
-#   // )
-#  (
-#   input  logic                                clk_i,
-#   input  logic                                rst_ni,
-#   input  logic [ADDR_WIDTH-1:0]               in_addr,
-#   input  logic                                rw_type, // 1 => write, 0=> read
-#   input  logic                                ram_we,
-#   input  logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr,
-#   input  logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr,
-#   input  logic [RAM_DATA_WIDTH-1:0]           ram_wdata,
-#   input  logic                                output_sent,
-#   input  logic                                output_valid,
-#   input  logic [OFFSET_WIDTH-1:0]             offset_addr_d,
-#   output logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr,
-#   output logic                                master,
-#   output logic                                hit,
-#   output logic                                multi_hit,
-#   output logic                                prot
-#   );
-#
-"""   #docstring_begin
-
-   localparam IGNORE_LSB = log2(PAGE_SIZE); // 12
-
-   logic [RAM_DATA_WIDTH-1:0]           port0_data_o, port1_data_o; // RAM read data outputs
-   logic                                port0_hit, port1_hit; // Ram output matches in_addr
-
-    logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr_saved, port1_addr_saved;
-
-   // Hit FSM Signals
-   typedef enum                         logic {SEARCH, HIT} hit_state_t;
-   hit_state_t                          hit_SP; // Hit FSM state
-   hit_state_t                          hit_SN; // Hit FSM next state
-
-   // Multi Hit FSM signals
-`ifdef MULTI_HIT_FULL_SET
-   typedef enum                         logic[1:0] {NO_HITS, ONE_HIT, MULTI_HIT} multi_state_t;
-   multi_state_t                        multi_SP; // Multi Hit FSM state
-   multi_state_t                        multi_SN; // Multi Hit FSM next state
-
-   logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_saved;
-   logic                                master_saved;
-`endif
-
-  //// --------------- Block RAM (Dual Port) -------------- ////
-
-  // The outputs of the BRAMs are only valid if in the previous cycle:
-  // 1. the inputs were valid, and
-  // 2. the BRAM was not written to.
-  // Otherwise, the outputs must be ignored which is controlled by the output_valid signal.
-  // This signal is driven by the uppler level L2 TLB module.
-  ram_tp_no_change #(
-      .ADDR_WIDTH( SET_WIDTH+OFFSET_WIDTH+1 ),
-      .DATA_WIDTH( RAM_DATA_WIDTH           )
-    )
-    ram_tp_no_change_0
-    (
-      .clk   ( clk_i         ),
-      .we    ( ram_we        ),
-      .addr0 ( port0_addr    ),
-      .addr1 ( port1_addr    ),
-      .d_i   ( ram_wdata     ),
-      .d0_o  ( port0_data_o  ),
-      .d1_o  ( port1_data_o  )
-    );
-
-   //// Check Ram Outputs
-   assign port0_hit = (port0_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port0_data_o[RAM_DATA_WIDTH-1:4]);
-   assign port1_hit = (port1_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port1_data_o[RAM_DATA_WIDTH-1:4]);
-   //// ----------------------------------------------------- /////
-
-   //// ------------------- Check if Hit ------------------------ ////
-   // FSM
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         hit_SP <= SEARCH;
-      end else begin
-         hit_SP <= hit_SN;
-      end
-   end
-
-   always_ff @(posedge clk_i, negedge rst_ni) begin
-       if (!rst_ni) begin
-           port0_addr_saved <= '0;
-           port1_addr_saved <= '0;
-       end else begin
-           port0_addr_saved <= port0_addr;
-           port1_addr_saved <= port1_addr;
-       end
-   end
-
-   always_comb begin
-      hit_SN   = hit_SP;
-      hit      = 1'b0;
-      hit_addr = 0;
-      master   = 1'b0;
-      unique case(hit_SP)
-        SEARCH :
-          if (output_valid)
-            if (port0_hit || port1_hit) begin
-               hit_SN   = HIT;
-               hit      = 1'b1;
-               hit_addr = port0_hit ? {port0_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
-                          port1_hit ? {port1_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
-                          0;
-               master   = port0_hit ? port0_data_o[3] :
-                          port1_hit ? port1_data_o[3] :
-                          1'b0;
-            end
-
-        HIT : begin
-`ifdef MULTI_HIT_FULL_SET // Since the search continues after the first hit, it needs to be saved to be accessed later.
-           hit      = 1'b1;
-           hit_addr = hit_addr_saved;
-           master   = master_saved;
-`endif
-           if (output_sent)
-             hit_SN = SEARCH;
-        end
-
-        default : begin
-           hit_SN = SEARCH;
-        end
-      endcase // case (hit_SP)
-   end // always_comb begin
-
-   //// ------------------------------------------- ////
-
-   assign prot = output_valid && port0_hit ? ((~port0_data_o[2] && rw_type) || (~port0_data_o[1] && ~rw_type)) :
-                 output_valid && port1_hit ? ((~port1_data_o[2] && rw_type) || (~port1_data_o[1] && ~rw_type)) :
-                 1'b0;
-
-   //// ------------------- Multi ------------------- ////
-`ifdef MULTI_HIT_FULL_SET
-
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         hit_addr_saved <= 0;
-         master_saved   <= 1'b0;
-      end else if (output_valid) begin
-         hit_addr_saved <= hit_addr;
-         master_saved   <= master;
-      end
-   end
-
-   // FSM
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         multi_SP <= NO_HITS;
-      end else begin
-         multi_SP <= multi_SN;
-      end
-   end
-
-   always_comb begin
-      multi_SN  = multi_SP;
-      multi_hit = 1'b0;
-      unique case(multi_SP)
-        NO_HITS :
-          if(output_valid && (port0_hit && port1_hit)) begin
-             multi_SN  = MULTI_HIT;
-             multi_hit = 1'b1;
-          end else if(output_valid && (port0_hit || port1_hit))
-            multi_SN = ONE_HIT;
-
-        ONE_HIT :
-          if(output_valid && (port0_hit || port1_hit)) begin
-             multi_SN  = MULTI_HIT;
-             multi_hit = 1'b1;
-          end else if (output_sent)
-            multi_SN = NO_HITS;
-
-        MULTI_HIT : begin
-          multi_hit = 1'b1;
-           if (output_sent)
-             multi_SN = NO_HITS;
-        end
-
-      endcase // case (multi_SP)
-   end // always_comb begin
-
-`else // !`ifdef MULTI_HIT_FULL_SET
-   assign multi_hit = output_valid && port0_hit && port1_hit;
-`endif // !`ifdef MULTI_HIT_FULL_SET
-   //// ------------------------------------------- ////
-"""
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/coreconfig.py b/src/iommu/axi_rab/coreconfig.py
deleted file mode 100644
index 247d0ce3..00000000
--- a/src/iommu/axi_rab/coreconfig.py
+++ /dev/null
@@ -1,6 +0,0 @@
-class CoreConfig:
-    def __init__(self):
-        self.N_SLICES = 16
-        self.N_REGS = 4*self.N_SLICES
-        self.ADDR_WIDTH_PHYS = 40
-        self.ADDR_WIDTH_VIRT = 32
diff --git a/src/iommu/axi_rab/fsm.py b/src/iommu/axi_rab/fsm.py
deleted file mode 100644
index d64b1cb4..00000000
--- a/src/iommu/axi_rab/fsm.py
+++ /dev/null
@@ -1,243 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class fsm(Elaboratable):
-
-    def __init__(self):
-        self.Clk_CI = Signal()  # input
-        self.Rst_RBI = Signal()  # input
-        self.port1_addr_valid_i = Signal()  # input
-        self.port2_addr_valid_i = Signal()  # input
-        self.port1_sent_i = Signal()  # input
-        self.port2_sent_i = Signal()  # input
-        self.select_i = Signal()  # input
-        self.no_hit_i = Signal()  # input
-        self.multi_hit_i = Signal()  # input
-        self.no_prot_i = Signal()  # input
-        self.prefetch_i = Signal()  # input
-        self.out_addr_i = Signal(AXI_M_ADDR_WIDTH)  # input
-        self.cache_coherent_i = Signal()  # input
-        self.port1_accept_o = Signal()  # output
-        self.port1_drop_o = Signal()  # output
-        self.port1_miss_o = Signal()  # output
-        self.port2_accept_o = Signal()  # output
-        self.port2_drop_o = Signal()  # output
-        self.port2_miss_o = Signal()  # output
-        self.out_addr_o = Signal(AXI_M_ADDR_WIDTH)  # output
-        self.cache_coherent_o = Signal()  # output
-        self.miss_o = Signal()  # output
-        self.multi_o = Signal()  # output
-        self.prot_o = Signal()  # output
-        self.prefetch_o = Signal()  # output
-        self.in_addr_i = Signal(AXI_S_ADDR_WIDTH)  # input
-        self.in_id_i = Signal(AXI_ID_WIDTH)  # input
-        self.in_len_i = Signal(8)  # input
-        self.in_user_i = Signal(AXI_USER_WIDTH)  # input
-        self.in_addr_o = Signal(AXI_S_ADDR_WIDTH)  # output
-        self.in_id_o = Signal(AXI_ID_WIDTH)  # output
-        self.in_len_o = Signal(8)  # output
-        self.in_user_o = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //`timescale 1ns / 1ps
-#
-# module fsm
-#  #(
-#    parameter AXI_M_ADDR_WIDTH = 40,
-#    parameter AXI_S_ADDR_WIDTH = 32,
-#    parameter AXI_ID_WIDTH     = 8,
-#    parameter AXI_USER_WIDTH   = 6
-#  )
-#  (
-#    input  logic                        Clk_CI,
-#    input  logic                        Rst_RBI,
-#
-#    input  logic                        port1_addr_valid_i,
-#    input  logic                        port2_addr_valid_i,
-#    input  logic                        port1_sent_i,
-#    input  logic                        port2_sent_i,
-#    input  logic                        select_i,
-#    input  logic                        no_hit_i,
-#    input  logic                        multi_hit_i,
-#    input  logic                        no_prot_i,
-#    input  logic                        prefetch_i,
-#    input  logic [AXI_M_ADDR_WIDTH-1:0] out_addr_i,
-#    input  logic                        cache_coherent_i,
-#    output logic                        port1_accept_o,
-#    output logic                        port1_drop_o,
-#    output logic                        port1_miss_o,
-#    output logic                        port2_accept_o,
-#    output logic                        port2_drop_o,
-#    output logic                        port2_miss_o,
-#    output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o,
-#    output logic                        cache_coherent_o,
-#    output logic                        miss_o,
-#    output logic                        multi_o,
-#    output logic                        prot_o,
-#    output logic                        prefetch_o,
-#    input  logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
-#    input  logic     [AXI_ID_WIDTH-1:0] in_id_i,
-#    input  logic                  [7:0] in_len_i,
-#    input  logic   [AXI_USER_WIDTH-1:0] in_user_i,
-#    output logic [AXI_S_ADDR_WIDTH-1:0] in_addr_o,
-#    output logic     [AXI_ID_WIDTH-1:0] in_id_o,
-#    output logic                  [7:0] in_len_o,
-#    output logic   [AXI_USER_WIDTH-1:0] in_user_o
-#  );
-#
-"""  #docstring_begin
-
-  //-------------Internal Signals----------------------
-
-  typedef enum logic           {IDLE, WAIT} state_t;
-  logic                        state_SP; // Present state
-  logic                        state_SN; // Next State
-
-  logic                        port1_accept_SN;
-  logic                        port1_drop_SN;
-  logic                        port1_miss_SN;
-  logic                        port2_accept_SN;
-  logic                        port2_drop_SN;
-  logic                        port2_miss_SN;
-  logic                        miss_SN;
-  logic                        multi_SN;
-  logic                        prot_SN;
-  logic                        prefetch_SN;
-  logic                        cache_coherent_SN;
-  logic [AXI_M_ADDR_WIDTH-1:0] out_addr_DN;
-
-  logic                        out_reg_en_S;
-
-  //----------FSM comb------------------------------
-
-  always_comb begin: FSM_COMBO
-    state_SN          = state_SP;
-
-    port1_accept_SN   = 1'b0;
-    port1_drop_SN     = 1'b0;
-    port1_miss_SN     = 1'b0;
-    port2_accept_SN   = 1'b0;
-    port2_drop_SN     = 1'b0;
-    port2_miss_SN     = 1'b0;
-    miss_SN           = 1'b0;
-    multi_SN          = 1'b0;
-    prot_SN           = 1'b0;
-    prefetch_SN       = 1'b0;
-    cache_coherent_SN = 1'b0;
-    out_addr_DN       =   '0;
-
-    out_reg_en_S      = 1'b0; // by default hold register output
-
-    unique case(state_SP)
-        IDLE :
-          if ( (port1_addr_valid_i & select_i) | (port2_addr_valid_i & ~select_i) ) begin
-            out_reg_en_S = 1'b1;
-            state_SN     = WAIT;
-
-            // Select inputs for output registers
-            if          (port1_addr_valid_i & select_i) begin
-              port1_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
-              port1_drop_SN   =  (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
-              port1_miss_SN   =   no_hit_i;
-              port2_accept_SN = 1'b0;
-              port2_drop_SN   = 1'b0;
-              port2_miss_SN   = 1'b0;
-            end else if (port2_addr_valid_i & ~select_i) begin
-              port1_accept_SN = 1'b0;
-              port1_drop_SN   = 1'b0;
-              port1_miss_SN   = 1'b0;
-              port2_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
-              port2_drop_SN   =  (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
-              port2_miss_SN   =   no_hit_i;
-            end
-
-            miss_SN           = port1_miss_SN | port2_miss_SN;
-            multi_SN          = multi_hit_i;
-            prot_SN           = ~no_prot_i;
-            prefetch_SN       = ~no_hit_i & prefetch_i;
-
-            cache_coherent_SN = cache_coherent_i;
-            out_addr_DN       = out_addr_i;
-          end
-
-        WAIT :
-          if ( port1_sent_i | port2_sent_i ) begin
-            out_reg_en_S = 1'b1; // "clear" the register
-            state_SN     = IDLE;
-          end
-
-        default : begin
-           state_SN      = IDLE;
-        end
-      endcase
-    end
-
-  //----------FSM seq-------------------------------
-
-  always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: FSM_SEQ
-    if (Rst_RBI == 1'b0)
-      state_SP <= IDLE;
-    else
-      state_SP <= state_SN;
-  end
-
-  //----------Output seq--------------------------
-
-  always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: OUTPUT_SEQ
-    if (Rst_RBI == 1'b0) begin
-      port1_accept_o   = 1'b0;
-      port1_drop_o     = 1'b0;
-      port1_miss_o     = 1'b0;
-      port2_accept_o   = 1'b0;
-      port2_drop_o     = 1'b0;
-      port2_miss_o     = 1'b0;
-      miss_o           = 1'b0;
-      multi_o          = 1'b0;
-      prot_o           = 1'b0;
-      prefetch_o       = 1'b0;
-      cache_coherent_o = 1'b0;
-      out_addr_o       =   '0;
-      in_addr_o        =   '0;
-      in_id_o          =   '0;
-      in_len_o         =   '0;
-      in_user_o        =   '0;
-    end else if (out_reg_en_S == 1'b1) begin
-      port1_accept_o   = port1_accept_SN;
-      port1_drop_o     = port1_drop_SN;
-      port1_miss_o     = port1_miss_SN;
-      port2_accept_o   = port2_accept_SN;
-      port2_drop_o     = port2_drop_SN;
-      port2_miss_o     = port2_miss_SN;
-      miss_o           = miss_SN;
-      multi_o          = multi_SN;
-      prot_o           = prot_SN;
-      prefetch_o       = prefetch_SN;
-      cache_coherent_o = cache_coherent_SN;
-      out_addr_o       = out_addr_DN;
-      in_addr_o        = in_addr_i;
-      in_id_o          = in_id_i;
-      in_len_o         = in_len_i;
-      in_user_o        = in_user_i;
-    end
-  end // block: OUTPUT_SEQ
-"""
-#
-# endmodule
-#
-#
diff --git a/src/iommu/axi_rab/l2_tlb.py b/src/iommu/axi_rab/l2_tlb.py
deleted file mode 100644
index 11983f64..00000000
--- a/src/iommu/axi_rab/l2_tlb.py
+++ /dev/null
@@ -1,550 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class l2_tlb(Elaboratable):
-
-    def __init__(self):
-        self.clk_i = Signal()  # input
-        self.rst_ni = Signal()  # input
-        self.we_i = Signal()  # input
-        self.waddr_i = Signal(AXI_LITE_ADDR_WIDTH)  # input
-        self.wdata_i = Signal(AXI_LITE_DATA_WIDTH)  # input
-        self.start_i = Signal()  # input
-        self.busy_o = Signal()  # output
-        self.in_addr_i = Signal(AXI_S_ADDR_WIDTH)  # input
-        self.rw_type_i = Signal()  # input
-        self.out_ready_i = Signal()  # input
-        self.out_valid_o = Signal()  # output
-        self.hit_o = Signal()  # output
-        self.miss_o = Signal()  # output
-        self.prot_o = Signal()  # output
-        self.multi_o = Signal()  # output
-        self.cache_coherent_o = Signal()  # output
-        self.out_addr_o = Signal(AXI_M_ADDR_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //`include "pulp_soc_defines.sv"
-#
-# ////import CfMath::log2;
-#
-# //`define MULTI_HIT_FULL_SET  // Enable full multi hit detection. Always the entire set is searched.
-# //`define MULTI_HIT_CUR_CYCLE // Enable partial multi hit detection. Only multi hits in the same search cycle are detected.
-#
-# //`ifdef MULTI_HIT_FULL_SET
-# //  `ifndef MULTI_HIT_CUR_CYCLE
-# //    `define MULTI_HIT_CUR_CYCLE
-# //  `endif
-# //`endif
-#
-# module l2_tlb
-#  //#(
-#  //  parameter AXI_S_ADDR_WIDTH       = 32,
-#   // parameter AXI_M_ADDR_WIDTH       = 40,
-#  //  parameter AXI_LITE_DATA_WIDTH    = 64,
-#   // parameter AXI_LITE_ADDR_WIDTH    = 32,
-#   // parameter N_SETS                 = 32,
-#   // parameter N_OFFSETS              = 4, //per port. There are 2 ports.
-#  //  parameter PAGE_SIZE              = 4096, // 4kB
-#  //  parameter N_PAR_VA_RAMS          = 4,
-#  //  parameter HIT_OFFSET_STORE_WIDTH = 2 // Num of bits of VA RAM offset stored. This should not be greater than OFFSET_WIDTH
-#  //  )
-#   (
-#    input  logic                           clk_i,
-#    input  logic                           rst_ni,
-#
-#    input  logic                           we_i,
-#    input  logic [AXI_LITE_ADDR_WIDTH-1:0] waddr_i,
-#    input  logic [AXI_LITE_DATA_WIDTH-1:0] wdata_i,
-#
-#    input  logic                           start_i,
-#    output logic                           busy_o,
-#    input  logic    [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
-#    input  logic                           rw_type_i, //1 => write, 0=> read
-#
-#    input  logic                           out_ready_i,
-#    output logic                           out_valid_o,
-#    output logic                           hit_o,
-#    output logic                           miss_o,
-#    output logic                           prot_o,
-#    output logic                           multi_o,
-#    output logic                           cache_coherent_o,
-#    output logic    [AXI_M_ADDR_WIDTH-1:0] out_addr_o
-#    );
-#
-"""    #docstring_begin
-
-   localparam VA_RAM_DEPTH      = N_SETS * N_OFFSETS * 2;
-   localparam PA_RAM_DEPTH      = VA_RAM_DEPTH * N_PAR_VA_RAMS;
-   localparam VA_RAM_ADDR_WIDTH = log2(VA_RAM_DEPTH);
-   localparam PA_RAM_ADDR_WIDTH = log2(PA_RAM_DEPTH);
-   localparam SET_WIDTH         = log2(N_SETS);
-   localparam OFFSET_WIDTH      = log2(N_OFFSETS);
-   localparam LL_WIDTH          = log2(N_PAR_VA_RAMS);
-   localparam IGNORE_LSB        = log2(PAGE_SIZE);
-
-   localparam VA_RAM_DATA_WIDTH = AXI_S_ADDR_WIDTH - IGNORE_LSB + 4;
-   localparam PA_RAM_DATA_WIDTH = AXI_M_ADDR_WIDTH - IGNORE_LSB;
-
-   logic                               [N_PAR_VA_RAMS-1:0] hit, prot, multi_hit, cache_coherent;
-   logic                               [N_PAR_VA_RAMS-1:0] ram_we;
-   logic                                                   last_search, last_search_next;
-   logic                                                   first_search, first_search_next;
-   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] ram_waddr;
-   logic [N_PAR_VA_RAMS-1:0][SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr;
-   logic                                                   pa_ram_we;
-   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr, pa_port0_waddr; // PA RAM read, Write addr;
-   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr_reg_SN, pa_port0_raddr_reg_SP; // registered addresses, needed for WAIT_ON_WRITE;
-   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_addr; // PA RAM addr
-   logic                           [PA_RAM_DATA_WIDTH-1:0] pa_port0_data, pa_data, pa_port0_data_reg; // PA RAM data
-   logic                                                   pa_ram_store_data_SN, pa_ram_store_data_SP;
-   logic                                                   hit_top, prot_top, multi_hit_top, first_hit_top;
-   logic                                                   output_sent;
-   int                                                     hit_block_num;
-
-   logic                                                   searching, search_done;
-   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr, port0_raddr; // VA RAM port0 addr
-   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr; // VA RAM port1 addr
-   logic                                [OFFSET_WIDTH-1:0] offset_addr, offset_addr_d;
-   logic                                [OFFSET_WIDTH-1:0] offset_start_addr, offset_end_addr;
-   logic                                   [SET_WIDTH-1:0] set_num;
-
-   logic                                                   va_output_valid;
-   logic                                                   searching_q;
-
-   genvar                                                  z;
-
-   // Search FSM
-   typedef enum logic                                [1:0] {IDLE, SEARCH, DONE} search_state_t;
-   search_state_t                                          search_SP; // Present state
-   search_state_t                                          search_SN; // Next State
-
-   // Output FSM
-   typedef enum logic                                [1:0] {OUT_IDLE, SEND_OUTPUT, WAIT_ON_WRITE} out_state_t;
-   out_state_t                                             out_SP; // Present state
-   out_state_t                                             out_SN; // Next State
-
-   logic                                                   miss_next;
-   logic                                                   hit_next;
-   logic                                                   prot_next;
-   logic                                                   multi_next;
-   logic                                                   cache_coherent_next;
-
-   // Generate the VA Block rams and their surrounding logic
-   generate
-      for (z = 0; z < N_PAR_VA_RAMS; z++) begin : VA_RAMS
-         check_ram
-           #(
-             .ADDR_WIDTH     ( AXI_S_ADDR_WIDTH  ),
-             .RAM_DATA_WIDTH ( VA_RAM_DATA_WIDTH ),
-             .PAGE_SIZE      ( PAGE_SIZE         ),
-             .SET_WIDTH      ( SET_WIDTH         ),
-             .OFFSET_WIDTH   ( OFFSET_WIDTH      )
-             )
-         u_check_ram
-             (
-              .clk_i         ( clk_i                          ),
-              .rst_ni        ( rst_ni                         ),
-              .in_addr       ( in_addr_i                      ),
-              .rw_type       ( rw_type_i                      ),
-              .ram_we        ( ram_we[z]                      ),
-              .port0_addr    ( port0_addr                     ),
-              .port1_addr    ( port1_addr                     ),
-              .ram_wdata     ( wdata_i[VA_RAM_DATA_WIDTH-1:0] ),
-              .output_sent   ( output_sent                    ),
-              .output_valid  ( va_output_valid                ),
-              .offset_addr_d ( offset_addr_d                  ),
-              .hit_addr      ( hit_addr[z]                    ),
-              .master        ( cache_coherent[z]              ),
-              .hit           ( hit[z]                         ),
-              .multi_hit     ( multi_hit[z]                   ),
-              .prot          ( prot[z]                        )
-              );
-      end // for (z = 0; z < N_PORTS; z++)
-   endgenerate
-
-   ////////////////// ---------------- Control and Address --------------- ////////////////////////
-   // FSM
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         search_SP <= IDLE;
-      end else begin
-         search_SP <= search_SN;
-      end
-   end
-
-   always_comb begin : SEARCH_FSM
-      search_SN         = search_SP;
-      busy_o            = 1'b0;
-      searching         = 1'b0;
-      search_done       = 1'b0;
-      last_search_next  = 1'b0;
-      first_search_next = first_search;
-
-      unique case (search_SP)
-        IDLE : begin
-          if (start_i) begin
-            search_SN         = SEARCH;
-            first_search_next = 1'b1;
-          end
-        end
-
-        SEARCH : begin
-          busy_o = 1'b1;
-
-          // detect last search cycle
-          if ( (first_search == 1'b0) && (offset_addr == offset_end_addr) )
-             last_search_next  = 1'b1;
-
-          // pause search during VA RAM reconfigration
-          if (|ram_we) begin
-             searching         = 1'b0;
-          end else begin
-             searching         = 1'b1;
-             first_search_next = 1'b0;
-          end
-
-          if (va_output_valid) begin
-            // stop search
-`ifdef MULTI_HIT_FULL_SET
-            if (last_search | prot_top | multi_hit_top) begin
-`else
-            if (last_search | prot_top | multi_hit_top | hit_top ) begin
-`endif
-              search_SN      = DONE;
-              search_done    = 1'b1;
-            end
-          end
-        end
-
-        DONE : begin
-          busy_o = 1'b1;
-          if (out_valid_o & out_ready_i)
-            search_SN = IDLE;
-        end
-
-        default : begin
-          search_SN = IDLE;
-        end
-      endcase // case (prot_SP)
-   end // always_comb begin
-
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         last_search  <= 1'b0;
-         first_search <= 1'b0;
-      end else begin
-         last_search  <= last_search_next;
-         first_search <= first_search_next;
-      end
-   end
-
-   /*
-    * VA RAM address generation
-    *
-    * The input address and set number, and thus the offset start address, are available in the
-    * cycle after the start signal. The buffered offset_addr becomes available one cycle later.
-    * During the first search cycle, we therefore directly use offset_addr_start for the lookup.
-    */
-   assign set_num = in_addr_i[SET_WIDTH+IGNORE_LSB -1 : IGNORE_LSB];
-
-   assign port0_raddr[OFFSET_WIDTH] = 1'b0;
-   assign port1_addr [OFFSET_WIDTH] = 1'b1;
-
-   assign port0_raddr[OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
-   assign port1_addr [OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
-
-   assign port0_raddr[SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
-   assign port1_addr [SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
-
-   assign port0_addr = ram_we ? ram_waddr : port0_raddr;
-
-   // The outputs of the BRAMs are only valid if in the previous cycle:
-   // 1. the inputs were valid, and
-   // 2. the BRAMs were not written to.
-   // Otherwise, the outputs must be ignored.
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         searching_q <= 1'b0;
-      end else begin
-         searching_q <= searching;
-      end
-   end
-   assign va_output_valid = searching_q;
-
-   // Address offset for looking up the VA RAMs
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         offset_addr   <= 0;
-      end else if (first_search) begin
-         offset_addr <= offset_start_addr + 1'b1;
-      end else if (searching) begin
-         offset_addr <= offset_addr + 1'b1;
-      end
-   end
-
-   // Delayed address offest for looking up the PA RAM upon a hit in the VA RAMs
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         offset_addr_d <= 0;
-      end else if (first_search) begin
-         offset_addr_d <= offset_start_addr;
-      end else if (searching) begin
-         offset_addr_d <= offset_addr_d + 1'b1;
-      end
-   end
-
-   // Store the offset addr for hit to reduce latency for next search.
-   generate
-      if (HIT_OFFSET_STORE_WIDTH > 0) begin : OFFSET_STORE
-`ifndef MULTI_HIT_FULL_SET
-         logic [N_SETS-1:0][HIT_OFFSET_STORE_WIDTH-1:0] hit_offset_addr; // Contains offset addr for previous hit for every SET.
-         logic [SET_WIDTH+OFFSET_WIDTH+1-1:0]           hit_addr_reg;
-
-         assign offset_start_addr = { hit_offset_addr[set_num] , {{OFFSET_WIDTH-HIT_OFFSET_STORE_WIDTH}{1'b0}} };
-         assign offset_end_addr   =   hit_offset_addr[set_num]-1'b1;
-
-         // Register the hit addr
-         always_ff @(posedge clk_i) begin
-            if (rst_ni == 0) begin
-               hit_addr_reg <= 0;
-            end else if (hit_top) begin
-               hit_addr_reg <= hit_addr[hit_block_num];
-            end
-         end
-
-         // Store hit addr for each set. The next search in the same set will start from the saved addr.
-         always_ff @(posedge clk_i) begin
-            if (rst_ni == 0) begin
-               hit_offset_addr <= 0;
-            end else if (hit_o) begin
-               hit_offset_addr[set_num][HIT_OFFSET_STORE_WIDTH-1:0] <= hit_addr_reg[OFFSET_WIDTH-1 : (OFFSET_WIDTH - HIT_OFFSET_STORE_WIDTH)];
-            end
-         end
-`else // No need to store offset if full multi hit detection is enabled because the entire SET is searched.
-         assign offset_start_addr = 0;
-         assign offset_end_addr   = {OFFSET_WIDTH{1'b1}};
-`endif
-      end else begin // if (HIT_OFFSET_STORE_WIDTH > 0)
-         assign offset_start_addr = 0;
-         assign offset_end_addr   = {OFFSET_WIDTH{1'b1}};
-      end
-   endgenerate
-
-   assign prot_top = |prot;
-
-   //////////////////////////////////////////////////////////////////////////////////////
-   // check for hit, multi hit
-   // In case of a multi hit, the hit_block_num indicates the lowest VA RAM with a hit.
-   // In case of a multi hit in the same VA RAM, Port 0 is given priority.
-   always_comb begin : HIT_CHECK
-      hit_top       = |hit;
-      hit_block_num = 0;
-      first_hit_top = 1'b0;
-      multi_hit_top = 1'b0;
-      for (int i=N_PAR_VA_RAMS-1; i>=0; i--) begin
-        if (hit[i] == 1'b1) begin
-`ifdef MULTI_HIT_CUR_CYCLE
-          if (multi_hit[i] | first_hit_top ) begin
-            multi_hit_top = 1'b1;
-          end
-`endif
-          first_hit_top = 1'b1;
-          hit_block_num = i;
-        end
-      end // for (int i=N_PAR_VA_RAMS-1; i>=0; i--)
-   end // always_comb begin
-
-   ///////////////////// ------------- Outputs ------------ //////////////////////////////////
-   //// FSM
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         out_SP                     <= OUT_IDLE;
-         pa_ram_store_data_SP       <= 1'b0;
-         pa_port0_raddr_reg_SP      <=  'b0;
-      end else begin
-         out_SP                     <= out_SN;
-         pa_ram_store_data_SP       <= pa_ram_store_data_SN;
-         pa_port0_raddr_reg_SP      <= pa_port0_raddr_reg_SN;
-      end
-   end
-
-   always_comb begin : OUTPUT_FSM
-      out_SN                   = out_SP;
-
-      miss_next                = miss_o;
-      prot_next                = prot_o;
-      multi_next               = multi_o;
-      hit_next                 = hit_o;
-      cache_coherent_next      = cache_coherent_o;
-      pa_port0_raddr_reg_SN    = pa_port0_raddr_reg_SP;
-
-      pa_port0_raddr           =  'b0;
-      pa_ram_store_data_SN     = 1'b0;
-
-      out_valid_o              = 1'b0;
-      output_sent              = 1'b0;
-
-      unique case (out_SP)
-        OUT_IDLE : begin
-           hit_next            = 1'b0;
-           miss_next           = 1'b0;
-           prot_next           = 1'b0;
-           multi_next          = 1'b0;
-           cache_coherent_next = 1'b0;
-
-          // abort transaction
-          if         ((search_done & ~hit_top) | prot_top | multi_hit_top) begin
-             out_SN = SEND_OUTPUT;
-
-             if (search_done & ~hit_top) begin
-                miss_next  = 1'b1;
-             end
-             if (prot_top) begin
-                prot_next  = 1'b1;
-                hit_next   = 1'b1;
-             end
-             if (multi_hit_top) begin
-                multi_next = 1'b1;
-                hit_next   = 1'b1;
-             end
-
-          // read PA RAM
-          end else if (search_done & hit_top) begin
-             hit_next              = 1'b1;
-             cache_coherent_next   = cache_coherent[hit_block_num];
-             pa_port0_raddr        = (N_PAR_VA_RAMS * hit_addr[hit_block_num]) + hit_block_num;
-             pa_port0_raddr_reg_SN = pa_port0_raddr;
-
-             // read PA RAM now
-             if (~pa_ram_we) begin
-                out_SN               = SEND_OUTPUT;
-                pa_ram_store_data_SN = 1'b1;
-
-             // read PA RAM after PA RAM reconfiguration
-             end else begin // pa_ram_we
-                out_SN               = WAIT_ON_WRITE;
-
-             end
-          end
-        end
-
-        WAIT_ON_WRITE : begin
-          if ( ~pa_ram_we ) begin
-             out_SN               = SEND_OUTPUT;
-             pa_port0_raddr       = pa_port0_raddr_reg_SP;
-             pa_ram_store_data_SN = 1'b1;
-          end
-        end
-
-        SEND_OUTPUT : begin
-           out_valid_o  = 1'b1;
-           if (out_ready_i) begin
-              out_SN      = OUT_IDLE;
-              output_sent = 1'b1;
-           end
-        end
-
-        default : begin
-           out_SN = OUT_IDLE;
-        end
-
-      endcase // case (out_SP)
-   end // always_comb begin
-
-   //// Output signals
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         miss_o           <= 1'b0;
-         prot_o           <= 1'b0;
-         multi_o          <= 1'b0;
-         hit_o            <= 1'b0;
-         cache_coherent_o <= 1'b0;
-      end else begin
-         miss_o           <= miss_next;
-         prot_o           <= prot_next;
-         multi_o          <= multi_next;
-         hit_o            <= hit_next;
-         cache_coherent_o <= cache_coherent_next;
-      end
-   end
-
-   ///////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-  ///////////////////// --------------- Physical Address -------------- ////////////////////////////
-
-  /// PA Block RAM
-  ram_tp_no_change #(
-        .ADDR_WIDTH( PA_RAM_ADDR_WIDTH ),
-        .DATA_WIDTH( PA_RAM_DATA_WIDTH )
-        )
-  pa_ram
-    (
-      .clk   ( clk_i                          ),
-      .we    ( pa_ram_we                      ),
-      .addr0 ( pa_port0_addr                  ),
-      .addr1 ( '0                             ),
-      .d_i   ( wdata_i[PA_RAM_DATA_WIDTH-1:0] ),
-      .d0_o  ( pa_port0_data                  ),
-      .d1_o  (                                )
-    );
-
-   assign out_addr_o[IGNORE_LSB-1:0]                = in_addr_i[IGNORE_LSB-1:0];
-   assign out_addr_o[AXI_M_ADDR_WIDTH-1:IGNORE_LSB] = pa_data;
-
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         pa_port0_data_reg <= 0;
-      end else if (pa_ram_store_data_SP) begin
-         pa_port0_data_reg <= pa_port0_data;
-      end
-   end
-
-   assign pa_data = pa_ram_store_data_SP ? pa_port0_data : pa_port0_data_reg;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-///// Write enable for all block rams
-generate if (LL_WIDTH != 0) begin
-   always_comb begin
-      var reg[LL_WIDTH:0] para;
-      var int             para_int;
-      for (para = 0; para < N_PAR_VA_RAMS; para=para+1'b1) begin
-        para_int         = int'(para);
-        ram_we[para_int] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0) && (waddr_i[LL_WIDTH-1:0] == para);
-      end
-   end
-end else begin
-   assign ram_we[0] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0);
-end
-
-endgenerate
-
-// Addresses are word, not byte addresses
-assign pa_ram_we      = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b1); //waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] will be 0 for all VA writes and 1 for all PA writes
-assign ram_waddr      = waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH-1:LL_WIDTH];
-assign pa_port0_waddr = waddr_i[PA_RAM_ADDR_WIDTH-1:0];
-assign pa_port0_addr  = pa_ram_we ? pa_port0_waddr : pa_port0_raddr;
-
-"""
-# endmodule
-#
-# // vim: ts=3 sw=3 sts=3 et nosmartindent autoindent foldmethod=marker tw=100
-#
-#
diff --git a/src/iommu/axi_rab/rab_core.py b/src/iommu/axi_rab/rab_core.py
deleted file mode 100644
index 7d7494aa..00000000
--- a/src/iommu/axi_rab/rab_core.py
+++ /dev/null
@@ -1,539 +0,0 @@
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-
-# this file has been generated by sv2nmigen
-
-#
-# //`include "pulp_soc_defines.sv"
-#
-# ////import CfMath::log2;
-#
-# //`define MY_ARRAY_SUM(MY_ARRAY,ARRAY_SIZE) ( (ARRAY_SIZE==1) ? MY_ARRAY[0] : (ARRAY_SIZE==2) ? MY_ARRAY[0] + MY_ARRAY[1] : (ARRAY_SIZE==3) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] : (ARRAY_SIZE==4) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] + MY_ARRAY[3] : 0 )
-#
-
-# module rab_core
-#  #(
-#    parameter N_PORTS             =  3,
-#    parameter N_L2_SETS           = 32,
-#    parameter N_L2_SET_ENTRIES    = 32,
-#    parameter AXI_DATA_WIDTH      = 64,
-#    parameter AXI_S_ADDR_WIDTH    = 32,
-#    parameter AXI_M_ADDR_WIDTH    = 40,
-#    parameter AXI_LITE_DATA_WIDTH = 64,
-#    parameter AXI_LITE_ADDR_WIDTH = 32,
-#    parameter AXI_ID_WIDTH        =  8,
-#    parameter AXI_USER_WIDTH      =  6,
-#    parameter MH_FIFO_DEPTH       = 16
-#    )
-#   (
-#    input  logic                                         Clk_CI,
-#    input  logic                                         Rst_RBI,
-#
-#    input  logic               [AXI_LITE_ADDR_WIDTH-1:0] s_axi_awaddr,
-#    input  logic                                         s_axi_awvalid,
-#    output logic                                         s_axi_awready,
-#
-#    input  logic               [AXI_LITE_DATA_WIDTH-1:0] s_axi_wdata,
-#    input  logic             [AXI_LITE_DATA_WIDTH/8-1:0] s_axi_wstrb,
-#    input  logic                                         s_axi_wvalid,
-#    output logic                                         s_axi_wready,
-#
-#    input  logic               [AXI_LITE_ADDR_WIDTH-1:0] s_axi_araddr,
-#    input  logic                                         s_axi_arvalid,
-#    output logic                                         s_axi_arready,
-#
-#    input  logic                                         s_axi_rready,
-#    output logic               [AXI_LITE_DATA_WIDTH-1:0] s_axi_rdata,
-#    output logic                                   [1:0] s_axi_rresp,
-#    output logic                                         s_axi_rvalid,
-#
-#    output logic                                   [1:0] s_axi_bresp,
-#    output logic                                         s_axi_bvalid,
-#    input  logic                                         s_axi_bready,
-#
-#    output logic [N_PORTS-1:0]                           int_miss,
-#    output logic [N_PORTS-1:0]                           int_prot,
-#    output logic [N_PORTS-1:0]                           int_multi,
-#    output logic [N_PORTS-1:0]                           int_prefetch,
-#    output logic                                         int_mhf_full,
-#
-#    output logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] int_axaddr_o,
-#    output logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] int_axid_o,
-#    output logic [N_PORTS-1:0]                     [7:0] int_axlen_o,
-#    output logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] int_axuser_o,
-#
-#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] port1_addr,
-#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] port1_id,
-#    input  logic [N_PORTS-1:0]                     [7:0] port1_len,
-#    input  logic [N_PORTS-1:0]                     [2:0] port1_size,
-#    input  logic [N_PORTS-1:0]                           port1_addr_valid,
-#    input  logic [N_PORTS-1:0]                           port1_type,
-#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] port1_user,
-#    input  logic [N_PORTS-1:0]                           port1_sent,
-#    output logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] port1_out_addr,
-#    output logic [N_PORTS-1:0]                           port1_cache_coherent,
-#    output logic [N_PORTS-1:0]                           port1_accept,
-#    output logic [N_PORTS-1:0]                           port1_drop,
-#    output logic [N_PORTS-1:0]                           port1_miss,
-#
-#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] port2_addr,
-#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] port2_id,
-#    input  logic [N_PORTS-1:0]                     [7:0] port2_len,
-#    input  logic [N_PORTS-1:0]                     [2:0] port2_size,
-#    input  logic [N_PORTS-1:0]                           port2_addr_valid,
-#    input  logic [N_PORTS-1:0]                           port2_type,
-#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] port2_user,
-#    input  logic [N_PORTS-1:0]                           port2_sent,
-#    output logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] port2_out_addr,
-#    output logic [N_PORTS-1:0]                           port2_cache_coherent,
-#    output logic [N_PORTS-1:0]                           port2_accept,
-#    output logic [N_PORTS-1:0]                           port2_drop,
-#    output logic [N_PORTS-1:0]                           port2_miss,
-#
-#    input  logic [N_PORTS-1:0]                           miss_l2_i,
-#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] miss_l2_addr_i,
-#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] miss_l2_id_i,
-#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] miss_l2_user_i,
-#
-#    output logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] wdata_l2_o,
-#    output logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] waddr_l2_o,
-#    output logic [N_PORTS-1:0]                           wren_l2_o
-#    );
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class rab_core(Elaboratable):
-    """Port-level nmigen skeleton of the SystemVerilog ``rab_core`` module.
-
-    Only the port list has been translated: ``elaborate`` returns an empty
-    ``Module`` and the original SystemVerilog body is kept verbatim in the
-    string literal that follows this class.
-
-    The constructor arguments mirror the SystemVerilog parameters (same
-    names, same defaults).  The generated code referenced them as bare
-    names that are never defined in this file (plus a ``FIXME``
-    placeholder for the ``s_axi_wstrb`` width), so constructing the class
-    raised ``NameError``.
-
-    Ports declared in SystemVerilog as packed two-dimensional vectors,
-    ``logic [N_PORTS-1:0][W-1:0]``, are flattened into one Signal of
-    ``N_PORTS * W`` bits; entry ``i`` occupies bits ``[i*W, (i+1)*W)``.
-    """
-
-    def __init__(self, *, N_PORTS=3, N_L2_SETS=32, N_L2_SET_ENTRIES=32,
-                 AXI_DATA_WIDTH=64, AXI_S_ADDR_WIDTH=32,
-                 AXI_M_ADDR_WIDTH=40, AXI_LITE_DATA_WIDTH=64,
-                 AXI_LITE_ADDR_WIDTH=32, AXI_ID_WIDTH=8, AXI_USER_WIDTH=6,
-                 MH_FIFO_DEPTH=16):
-        """Create the port Signals.
-
-        All arguments are keyword-only and correspond one-to-one to the
-        parameters of the SystemVerilog module:
-
-        * N_PORTS: number of RAB ports
-        * N_L2_SETS, N_L2_SET_ENTRIES: L2 TLB geometry
-        * AXI_DATA_WIDTH: data width of the AXI ports, in bits
-        * AXI_S_ADDR_WIDTH: width of the incoming (slave side) address
-        * AXI_M_ADDR_WIDTH: width of the outgoing (master side) address
-        * AXI_LITE_DATA_WIDTH, AXI_LITE_ADDR_WIDTH: AXI-Lite config port
-        * AXI_ID_WIDTH, AXI_USER_WIDTH: AXI id / user field widths
-        * MH_FIFO_DEPTH: depth of the miss-handling FIFO
-        """
-        # Keep every parameter: the port list only needs some of them, but
-        # the not-yet-translated module body refers to all of them.
-        self.N_PORTS = N_PORTS
-        self.N_L2_SETS = N_L2_SETS
-        self.N_L2_SET_ENTRIES = N_L2_SET_ENTRIES
-        self.AXI_DATA_WIDTH = AXI_DATA_WIDTH
-        self.AXI_S_ADDR_WIDTH = AXI_S_ADDR_WIDTH
-        self.AXI_M_ADDR_WIDTH = AXI_M_ADDR_WIDTH
-        self.AXI_LITE_DATA_WIDTH = AXI_LITE_DATA_WIDTH
-        self.AXI_LITE_ADDR_WIDTH = AXI_LITE_ADDR_WIDTH
-        self.AXI_ID_WIDTH = AXI_ID_WIDTH
-        self.AXI_USER_WIDTH = AXI_USER_WIDTH
-        self.MH_FIFO_DEPTH = MH_FIFO_DEPTH
-
-        # Flattened widths of the packed per-port vectors.
-        s_addr_bits = N_PORTS * AXI_S_ADDR_WIDTH
-        m_addr_bits = N_PORTS * AXI_M_ADDR_WIDTH
-        id_bits = N_PORTS * AXI_ID_WIDTH
-        user_bits = N_PORTS * AXI_USER_WIDTH
-        len_bits = N_PORTS * 8   # [7:0] per port
-        size_bits = N_PORTS * 3  # [2:0] per port
-
-        # AXI-Lite configuration interface
-        self.s_axi_awaddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
-        self.s_axi_awvalid = Signal()  # input
-        self.s_axi_awready = Signal()  # output
-        self.s_axi_wdata = Signal(AXI_LITE_DATA_WIDTH)  # input
-        # one strobe bit per data byte: [AXI_LITE_DATA_WIDTH/8-1:0]
-        self.s_axi_wstrb = Signal(AXI_LITE_DATA_WIDTH // 8)  # input
-        self.s_axi_wvalid = Signal()  # input
-        self.s_axi_wready = Signal()  # output
-        self.s_axi_araddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
-        self.s_axi_arvalid = Signal()  # input
-        self.s_axi_arready = Signal()  # output
-        self.s_axi_rready = Signal()  # input
-        self.s_axi_rdata = Signal(AXI_LITE_DATA_WIDTH)  # output
-        self.s_axi_rresp = Signal(2)  # output
-        self.s_axi_rvalid = Signal()  # output
-        self.s_axi_bresp = Signal(2)  # output
-        self.s_axi_bvalid = Signal()  # output
-        self.s_axi_bready = Signal()  # input
-
-        # Per-port status outputs (one bit per port)
-        self.int_miss = Signal(N_PORTS)  # output
-        self.int_prot = Signal(N_PORTS)  # output
-        self.int_multi = Signal(N_PORTS)  # output
-        self.int_prefetch = Signal(N_PORTS)  # output
-        self.int_mhf_full = Signal()  # output
-
-        # Metadata of the transaction selected on each port
-        self.int_axaddr_o = Signal(s_addr_bits)  # output
-        self.int_axid_o = Signal(id_bits)  # output
-        self.int_axlen_o = Signal(len_bits)  # output
-        self.int_axuser_o = Signal(user_bits)  # output
-
-        # Request channel 1
-        self.port1_addr = Signal(s_addr_bits)  # input
-        self.port1_id = Signal(id_bits)  # input
-        self.port1_len = Signal(len_bits)  # input
-        self.port1_size = Signal(size_bits)  # input
-        self.port1_addr_valid = Signal(N_PORTS)  # input
-        self.port1_type = Signal(N_PORTS)  # input
-        self.port1_user = Signal(user_bits)  # input
-        self.port1_sent = Signal(N_PORTS)  # input
-        self.port1_out_addr = Signal(m_addr_bits)  # output
-        self.port1_cache_coherent = Signal(N_PORTS)  # output
-        self.port1_accept = Signal(N_PORTS)  # output
-        self.port1_drop = Signal(N_PORTS)  # output
-        self.port1_miss = Signal(N_PORTS)  # output
-
-        # Request channel 2
-        self.port2_addr = Signal(s_addr_bits)  # input
-        self.port2_id = Signal(id_bits)  # input
-        self.port2_len = Signal(len_bits)  # input
-        self.port2_size = Signal(size_bits)  # input
-        self.port2_addr_valid = Signal(N_PORTS)  # input
-        self.port2_type = Signal(N_PORTS)  # input
-        self.port2_user = Signal(user_bits)  # input
-        self.port2_sent = Signal(N_PORTS)  # input
-        self.port2_out_addr = Signal(m_addr_bits)  # output
-        self.port2_cache_coherent = Signal(N_PORTS)  # output
-        self.port2_accept = Signal(N_PORTS)  # output
-        self.port2_drop = Signal(N_PORTS)  # output
-        self.port2_miss = Signal(N_PORTS)  # output
-
-        # L2 TLB miss inputs
-        self.miss_l2_i = Signal(N_PORTS)  # input
-        self.miss_l2_addr_i = Signal(s_addr_bits)  # input
-        self.miss_l2_id_i = Signal(id_bits)  # input
-        self.miss_l2_user_i = Signal(user_bits)  # input
-
-        # L2 TLB write outputs
-        self.wdata_l2_o = Signal(N_PORTS * AXI_LITE_DATA_WIDTH)  # output
-        self.waddr_l2_o = Signal(N_PORTS * AXI_LITE_ADDR_WIDTH)  # output
-        self.wren_l2_o = Signal(N_PORTS)  # output
-
-    def elaborate(self, platform=None):
-        """Return an empty Module; the module body is not translated yet."""
-        m = Module()
-        return m
-
-
-""" 
-
-
-    // ███████╗██╗ ██████╗ ███╗   ██╗ █████╗ ██╗     ███████╗
-    // ██╔════╝██║██╔════╝ ████╗  ██║██╔══██╗██║     ██╔════╝
-    // ███████╗██║██║  ███╗██╔██╗ ██║███████║██║     ███████╗
-    // ╚════██║██║██║   ██║██║╚██╗██║██╔══██║██║     ╚════██║
-    // ███████║██║╚██████╔╝██║ ╚████║██║  ██║███████╗███████║
-    // ╚══════╝╚═╝ ╚═════╝ ╚═╝  ╚═══╝╚═╝  ╚═╝╚══════╝╚══════╝
-    // signals
-
-  localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
-
-  localparam integer N_SLICES[N_PORTS-1:0]     = `N_SLICES_ARRAY;
-  localparam         N_SLICES_TOT              = `MY_ARRAY_SUM(N_SLICES,N_PORTS);
-  localparam         N_SLICES_MAX              = `N_SLICES_MAX;
-
-  localparam N_REGS                            = 4*N_SLICES_TOT + 4;
-  localparam AXI_SIZE_WIDTH                    = log2(AXI_DATA_WIDTH/8);
-
-  localparam PORT_ID_WIDTH                     = (N_PORTS < 2) ? 1 : log2(N_PORTS);
-  localparam MISS_META_WIDTH                   = PORT_ID_WIDTH + AXI_USER_WIDTH + AXI_ID_WIDTH;
-
-  logic [N_PORTS-1:0]                      [15:0] p1_burst_size;
-  logic [N_PORTS-1:0]                      [15:0] p2_burst_size;
-
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p1_align_addr;
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p2_align_addr;
-
-  logic [N_PORTS-1:0]        [AXI_SIZE_WIDTH-1:0] p1_mask;
-  logic [N_PORTS-1:0]        [AXI_SIZE_WIDTH-1:0] p2_mask;
-
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p1_max_addr;
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p2_max_addr;
-
-  logic [N_PORTS-1:0]                             p1_prefetch;
-  logic [N_PORTS-1:0]                             p2_prefetch;
-
-  logic [N_PORTS-1:0]                             int_rw;
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] int_addr_min;
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] int_addr_max;
-  logic [N_PORTS-1:0]          [AXI_ID_WIDTH-1:0] int_id;
-  logic [N_PORTS-1:0]                       [7:0] int_len;
-  logic [N_PORTS-1:0]        [AXI_USER_WIDTH-1:0] int_user;
-
-  logic [N_PORTS-1:0]                             hit;
-  logic [N_PORTS-1:0]                             prot;
-  logic [N_PORTS-1:0]                             prefetch;
-
-  logic [N_PORTS-1:0]                             no_hit;
-  logic [N_PORTS-1:0]                             no_prot;
-
-  logic [N_PORTS-1:0]          [N_SLICES_MAX-1:0] hit_slices;
-  logic [N_PORTS-1:0]          [N_SLICES_MAX-1:0] prot_slices;
-
-  logic [N_PORTS-1:0]      [AXI_M_ADDR_WIDTH-1:0] out_addr;
-  logic [N_PORTS-1:0]      [AXI_M_ADDR_WIDTH-1:0] out_addr_reg;
-
-  logic [N_PORTS-1:0]                             cache_coherent;
-  logic [N_PORTS-1:0]                             cache_coherent_reg;
-
-  logic [N_PORTS-1:0]                             select;
-  reg   [N_PORTS-1:0]                             curr_priority;
-
-  reg   [N_PORTS-1:0]                             multi_hit;
-
-  logic [N_PORTS-1:0]                             miss_valid_mhf;
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] miss_addr_mhf;
-  logic [N_PORTS-1:0]       [MISS_META_WIDTH-1:0] miss_meta_mhf;
-
-  logic [N_REGS-1:0]                       [63:0] int_cfg_regs;
-  logic [N_PORTS-1:0] [4*N_SLICES_MAX-1:0] [63:0] int_cfg_regs_slices;
-
-  logic                                           L1AllowMultiHit_S;
-
-  genvar z;
-
-  //  █████╗ ███████╗███████╗██╗ ██████╗ ███╗   ██╗███╗   ███╗███████╗███╗   ██╗████████╗███████╗
-  // ██╔══██╗██╔════╝██╔════╝██║██╔════╝ ████╗  ██║████╗ ████║██╔════╝████╗  ██║╚══██╔══╝██╔════╝
-  // ███████║███████╗███████╗██║██║  ███╗██╔██╗ ██║██╔████╔██║█████╗  ██╔██╗ ██║   ██║   ███████╗
-  // ██╔══██║╚════██║╚════██║██║██║   ██║██║╚██╗██║██║╚██╔╝██║██╔══╝  ██║╚██╗██║   ██║   ╚════██║
-  // ██║  ██║███████║███████║██║╚██████╔╝██║ ╚████║██║ ╚═╝ ██║███████╗██║ ╚████║   ██║   ███████║
-  // ╚═╝  ╚═╝╚══════╝╚══════╝╚═╝ ╚═════╝ ╚═╝  ╚═══╝╚═╝     ╚═╝╚══════╝╚═╝  ╚═══╝   ╚═╝   ╚══════╝
-  // assignments
-
-  always_comb
-    begin : PORT_SELECT
-      var integer idx;
-
-      for (idx=0; idx<N_PORTS; idx++) begin
-
-        // select = 1 -> port1 active
-        // select = 0 -> port2 active
-        select[idx] = (curr_priority[idx] & port1_addr_valid[idx]) | ~port2_addr_valid[idx];
-
-        p1_burst_size[idx] = (port1_len[idx] + 1) << port1_size[idx];
-        p2_burst_size[idx] = (port2_len[idx] + 1) << port2_size[idx];
-
-        // align min addr for max addr computation to allow for smart AXI bursts around the 4k boundary
-        if      (port1_size[idx] == 3'b001)
-          p1_mask[idx] = 3'b110;
-        else if (port1_size[idx] == 3'b010)
-          p1_mask[idx] = 3'b100;
-        else if (port1_size[idx] == 3'b011)
-          p1_mask[idx] = 3'b000;
-        else
-          p1_mask[idx] = 3'b111;
-
-        p1_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port1_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
-        p1_align_addr[idx][AXI_SIZE_WIDTH-1:0]                = port1_addr[idx][AXI_SIZE_WIDTH-1:0] & p1_mask[idx];
-
-        if      (port2_size[idx] == 3'b001)
-          p2_mask[idx] = 3'b110;
-        else if (port2_size[idx] == 3'b010)
-          p2_mask[idx] = 3'b100;
-        else if (port2_size[idx] == 3'b011)
-          p2_mask[idx] = 3'b000;
-        else
-          p2_mask[idx] = 3'b111;
-
-        if (port1_user[idx] == {AXI_USER_WIDTH{1'b1}})
-          p1_prefetch[idx] = 1'b1;
-        else
-          p1_prefetch[idx] = 1'b0;
-
-        if (port2_user[idx] == {AXI_USER_WIDTH{1'b1}})
-          p2_prefetch[idx] = 1'b1;
-        else
-          p2_prefetch[idx] = 1'b0;
-
-        p2_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port2_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
-        p2_align_addr[idx][AXI_SIZE_WIDTH-1:0]                = port2_addr[idx][AXI_SIZE_WIDTH-1:0] & p2_mask[idx];
-
-        p1_max_addr[idx]  = p1_align_addr[idx] + p1_burst_size[idx] - 1;
-        p2_max_addr[idx]  = p2_align_addr[idx] + p2_burst_size[idx] - 1;
-
-        int_addr_min[idx] = select[idx] ? port1_addr[idx]  : port2_addr[idx];
-        int_addr_max[idx] = select[idx] ? p1_max_addr[idx] : p2_max_addr[idx];
-        int_rw[idx]       = select[idx] ? port1_type[idx]  : port2_type[idx];
-        int_id[idx]       = select[idx] ? port1_id[idx]    : port2_id[idx];
-        int_len[idx]      = select[idx] ? port1_len[idx]   : port2_len[idx];
-        int_user[idx]     = select[idx] ? port1_user[idx]  : port2_user[idx];
-        prefetch[idx]     = select[idx] ? p1_prefetch[idx] : p2_prefetch[idx];
-
-        hit [idx]    = | hit_slices [idx];
-        prot[idx]    = | prot_slices[idx];
-
-        no_hit [idx] = ~hit [idx];
-        no_prot[idx] = ~prot[idx];
-
-        port1_out_addr[idx] = out_addr_reg[idx];
-        port2_out_addr[idx] = out_addr_reg[idx];
-
-        port1_cache_coherent[idx] = cache_coherent_reg[idx];
-        port2_cache_coherent[idx] = cache_coherent_reg[idx];
-      end
-    end
-
-  always_comb
-    begin
-      var integer idx_port, idx_slice;
-      var integer reg_num;
-      reg_num=0;
-      for ( idx_port = 0; idx_port < N_PORTS; idx_port++ ) begin
-        for ( idx_slice = 0; idx_slice < 4*N_SLICES[idx_port]; idx_slice++ ) begin
-          int_cfg_regs_slices[idx_port][idx_slice] = int_cfg_regs[4+reg_num];
-          reg_num++;
-        end
-        // int_cfg_regs_slices[idx_port][4*N_SLICES_MAX-1:4*N_SLICES[idx_port]] will be dangling
-        // Fix to zero. Synthesis will remove these signals.
-        // int_cfg_regs_slices[idx_port][4*N_SLICES_MAX-1:4*N_SLICES[idx_port]] = 0;
-      end
-  end
-
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin : PORT_PRIORITY
-      var integer idx;
-      if (Rst_RBI == 1'b0)
-        curr_priority = 'h0;
-      else begin
-        for (idx=0; idx<N_PORTS; idx++) begin
-          if (port1_accept[idx] || port1_drop[idx])
-            curr_priority[idx] = 1'b1;
-          else if (port2_accept[idx] || port2_drop[idx])
-            curr_priority[idx] = 1'b0;
-        end
-      end
-    end
-
-  // find port that misses
-  logic [PORT_ID_WIDTH-1:0] PortIdx_D; // index of the first missing port
-  var integer               idx_miss;
-  always_comb begin : MHF_PORT_SELECT
-    PortIdx_D = 'b0;
-    for (idx_miss = 0; idx_miss < N_PORTS; idx_miss++) begin
-      if (miss_valid_mhf[idx_miss] == 1'b1) begin
-        PortIdx_D = idx_miss;
-        break;
-      end
-    end
-  end // always_comb begin
-
-  //  █████╗ ██╗  ██╗██╗    ██████╗  █████╗ ██████╗      ██████╗███████╗ ██████╗
-  // ██╔══██╗╚██╗██╔╝██║    ██╔══██╗██╔══██╗██╔══██╗    ██╔════╝██╔════╝██╔════╝
-  // ███████║ ╚███╔╝ ██║    ██████╔╝███████║██████╔╝    ██║     █████╗  ██║  ███╗
-  // ██╔══██║ ██╔██╗ ██║    ██╔══██╗██╔══██║██╔══██╗    ██║     ██╔══╝  ██║   ██║
-  // ██║  ██║██╔╝ ██╗██║    ██║  ██║██║  ██║██████╔╝    ╚██████╗██║     ╚██████╔╝
-  // ╚═╝  ╚═╝╚═╝  ╚═╝╚═╝    ╚═╝  ╚═╝╚═╝  ╚═╝╚═════╝      ╚═════╝╚═╝      ╚═════╝
-  axi_rab_cfg
-    #(
-      .N_PORTS         ( N_PORTS             ),
-      .N_REGS          ( N_REGS              ),
-      .N_L2_SETS       ( N_L2_SETS           ),
-      .N_L2_SET_ENTRIES( N_L2_SET_ENTRIES    ),
-      .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH    ),
-      .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH    ),
-      .N_FLAGS         ( 4                   ),
-      .AXI_DATA_WIDTH  ( AXI_LITE_DATA_WIDTH ),
-      .AXI_ADDR_WIDTH  ( AXI_LITE_ADDR_WIDTH ),
-      .MISS_META_WIDTH ( MISS_META_WIDTH     ),
-      .MH_FIFO_DEPTH   ( MH_FIFO_DEPTH       )
-    )
-    u_axi_rab_cfg
-    (
-      .Clk_CI             ( Clk_CI                    ),
-      .Rst_RBI            ( Rst_RBI                   ),
-      .s_axi_awaddr       ( s_axi_awaddr              ),
-      .s_axi_awvalid      ( s_axi_awvalid             ),
-      .s_axi_wdata        ( s_axi_wdata               ),
-      .s_axi_wstrb        ( s_axi_wstrb               ),
-      .s_axi_wvalid       ( s_axi_wvalid              ),
-      .s_axi_bready       ( s_axi_bready              ),
-      .s_axi_araddr       ( s_axi_araddr              ),
-      .s_axi_arvalid      ( s_axi_arvalid             ),
-      .s_axi_rready       ( s_axi_rready              ),
-      .s_axi_arready      ( s_axi_arready             ),
-      .s_axi_rdata        ( s_axi_rdata               ),
-      .s_axi_rresp        ( s_axi_rresp               ),
-      .s_axi_rvalid       ( s_axi_rvalid              ),
-      .s_axi_wready       ( s_axi_wready              ),
-      .s_axi_bresp        ( s_axi_bresp               ),
-      .s_axi_bvalid       ( s_axi_bvalid              ),
-      .s_axi_awready      ( s_axi_awready             ),
-      .L1Cfg_DO           ( int_cfg_regs              ),
-      .L1AllowMultiHit_SO ( L1AllowMultiHit_S         ),
-      .MissAddr_DI        ( miss_addr_mhf[PortIdx_D]  ),
-      .MissMeta_DI        ( miss_meta_mhf[PortIdx_D]  ),
-      .Miss_SI            ( miss_valid_mhf[PortIdx_D] ),
-      .MhFifoFull_SO      ( int_mhf_full              ),
-      .wdata_l2           ( wdata_l2_o                ),
-      .waddr_l2           ( waddr_l2_o                ),
-      .wren_l2            ( wren_l2_o                 )
-    );
-
-  generate for (z = 0; z < N_PORTS; z++) begin : MHF_TLB_SELECT
-    if (ENABLE_L2TLB[z] == 1) begin // L2 TLB is enabled
-      assign miss_valid_mhf[z] = miss_l2_i[z];
-      assign miss_addr_mhf[z]  = miss_l2_addr_i[z];
-      assign miss_meta_mhf[z]  = {miss_l2_user_i[z], PortIdx_D, miss_l2_id_i[z]};
-    end else begin// L2 TLB is disabled
-      assign miss_valid_mhf[z] = int_miss[z];
-      assign miss_addr_mhf[z]  = int_addr_min[z];
-      assign miss_meta_mhf[z]  = {int_user[z], PortIdx_D, int_id[z]};
-    end
-  end
-  endgenerate
-
-  // ███████╗██╗     ██╗ ██████╗███████╗    ████████╗ ██████╗ ██████╗
-  // ██╔════╝██║     ██║██╔════╝██╔════╝    ╚══██╔══╝██╔═══██╗██╔══██╗
-  // ███████╗██║     ██║██║     █████╗         ██║   ██║   ██║██████╔╝
-  // ╚════██║██║     ██║██║     ██╔══╝         ██║   ██║   ██║██╔═══╝
-  // ███████║███████╗██║╚██████╗███████╗       ██║   ╚██████╔╝██║
-  // ╚══════╝╚══════╝╚═╝ ╚═════╝╚══════╝       ╚═╝    ╚═════╝ ╚═╝
-  generate for (z = 0; z < N_PORTS; z++) begin : SLICE_TOP_GEN
-    slice_top
-      #(
-        .N_SLICES        ( N_SLICES[z]      ),
-        .N_REGS          ( 4*N_SLICES[z]    ),
-        .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH ),
-        .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH )
-      )
-      u_slice_top
-      (
-        .int_cfg_regs    ( int_cfg_regs_slices[z][4*N_SLICES[z]-1:0] ),
-        .int_rw          ( int_rw[z]                                 ),
-        .int_addr_min    ( int_addr_min[z]                           ),
-        .int_addr_max    ( int_addr_max[z]                           ),
-        .multi_hit_allow ( L1AllowMultiHit_S                         ),
-        .multi_hit       ( multi_hit[z]                              ),
-        .prot            ( prot_slices[z][N_SLICES[z]-1:0]           ),
-        .hit             ( hit_slices [z][N_SLICES[z]-1:0]           ),
-        .cache_coherent  ( cache_coherent[z]                         ),
-        .out_addr        ( out_addr[z]                               )
-      );
-    // hit_slices [N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
-    // prot_slices[N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
-    // Fix to zero. Synthesis will remove these signals.
-    if ( N_SLICES[z] < N_SLICES_MAX ) begin
-      assign hit_slices [z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
-      assign prot_slices[z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
-    end
-  end // for (z = 0; z < N_PORTS; z++)
-  endgenerate
-
-  // ââââââââââââââââââââ   ââââ
-  // âââââââââââââââââââââ âââââ
-  // ââââââ  âââââââââââââââââââ
-  // ââââââ  âââââââââââââââââââ
-  // âââ     âââââââââââ âââ âââ
-  // âââ     âââââââââââ     âââ
-  //
-  generate for (z = 0; z < N_PORTS; z++) begin : FSM_GEN
-    fsm
-      #(
-        .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
-        .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
-        .AXI_ID_WIDTH     ( AXI_ID_WIDTH     ),
-        .AXI_USER_WIDTH   ( AXI_USER_WIDTH   )
-      )
-      u_fsm
-      (
-        .Clk_CI             ( Clk_CI                ),
-        .Rst_RBI            ( Rst_RBI               ),
-        .port1_addr_valid_i ( port1_addr_valid[z]   ),
-        .port2_addr_valid_i ( port2_addr_valid[z]   ),
-        .port1_sent_i       ( port1_sent[z]         ),
-        .port2_sent_i       ( port2_sent[z]         ),
-        .select_i           ( select[z]             ),
-        .no_hit_i           ( no_hit[z]             ),
-        .multi_hit_i        ( multi_hit[z]          ),
-        .no_prot_i          ( no_prot[z]            ),
-        .prefetch_i         ( prefetch[z]           ),
-        .out_addr_i         ( out_addr[z]           ),
-        .cache_coherent_i   ( cache_coherent[z]     ),
-        .port1_accept_o     ( port1_accept[z]       ),
-        .port1_drop_o       ( port1_drop[z]         ),
-        .port1_miss_o       ( port1_miss[z]         ),
-        .port2_accept_o     ( port2_accept[z]       ),
-        .port2_drop_o       ( port2_drop[z]         ),
-        .port2_miss_o       ( port2_miss[z]         ),
-        .out_addr_o         ( out_addr_reg[z]       ),
-        .cache_coherent_o   ( cache_coherent_reg[z] ),
-        .miss_o             ( int_miss[z]           ),
-        .multi_o            ( int_multi[z]          ),
-        .prot_o             ( int_prot[z]           ),
-        .prefetch_o         ( int_prefetch[z]       ),
-        .in_addr_i          ( int_addr_min[z]       ),
-        .in_id_i            ( int_id[z]             ),
-        .in_len_i           ( int_len[z]            ),
-        .in_user_i          ( int_user[z]           ),
-        .in_addr_o          ( int_axaddr_o[z]       ),
-        .in_id_o            ( int_axid_o[z]         ),
-        .in_len_o           ( int_axlen_o[z]        ),
-        .in_user_o          ( int_axuser_o[z]       )
-      );
-  end
-  endgenerate
-  
-"""
diff --git a/src/iommu/axi_rab/rab_slice.py b/src/iommu/axi_rab/rab_slice.py
deleted file mode 100644
index 59f84e3e..00000000
--- a/src/iommu/axi_rab/rab_slice.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module rab_slice
-# #(
-#    parameter ADDR_WIDTH_PHYS = 40,
-#    parameter ADDR_WIDTH_VIRT = 32
-#    )
-#   (
-#    input  logic [ADDR_WIDTH_VIRT-1:0] cfg_min,
-#    input  logic [ADDR_WIDTH_VIRT-1:0] cfg_max,
-#    input  logic [ADDR_WIDTH_PHYS-1:0] cfg_offset,
-#    input  logic                       cfg_wen,
-#    input  logic                       cfg_ren,
-#    input  logic                       cfg_en,
-#    input  logic                       in_trans_type,
-#    input  logic [ADDR_WIDTH_VIRT-1:0] in_addr_min,
-#    input  logic [ADDR_WIDTH_VIRT-1:0] in_addr_max,
-#    output logic                       out_hit,
-#    output logic                       out_prot,
-#    output logic [ADDR_WIDTH_PHYS-1:0] out_addr
-#  );
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class rab_slice(Elaboratable):
-
-    def __init__(self, params):  # pass config object
-        # TODO parameters
-        self.params = params
-        self.cfg_min = Signal(params.ADDR_WIDTH_VIRT)  # input
-        self.cfg_max = Signal(params.ADDR_WIDTH_VIRT)  # input
-        self.cfg_offset = Signal(params.ADDR_WIDTH_PHYS)  # input
-        self.cfg_wen = Signal()  # input
-        self.cfg_ren = Signal()  # input
-        self.cfg_en = Signal()  # input
-        self.in_trans_type = Signal()  # input
-        self.in_addr_min = Signal(params.ADDR_WIDTH_VIRT)  # input
-        self.in_addr_max = Signal(params.ADDR_WIDTH_VIRT)  # input
-        self.out_hit = Signal()  # output
-        self.out_prot = Signal()  # output
-        self.out_addr = Signal(params.ADDR_WIDTH_PHYS)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        min_above_min = Signal()
-        min_below_max = Signal()
-        max_below_max = Signal()
-
-        #  assign min_above_min = (in_addr_min >= cfg_min) ? 1'b1 : 1'b0;
-        #  assign min_below_max = (in_addr_min <= cfg_max) ? 1'b1 : 1'b0;
-        #  assign max_below_max = (in_addr_max <= cfg_max) ? 1'b1 : 1'b0;
-        #  assign out_hit  = cfg_en & min_above_min & min_below_max & max_below_max;
-        #  assign out_prot = out_hit & ((in_trans_type & ~cfg_wen) | (~in_trans_type & ~cfg_ren));
-        #  assign out_addr = in_addr_min - cfg_min + cfg_offset;
-        m.d.comb += [
-            min_above_min.eq(self.in_addr_min >= self.cfg_min),
-            min_below_max.eq(self.in_addr_min <= self.cfg_max),
-            max_below_max.eq(self.in_addr_max <= self.cfg_max),
-            self.out_hit.eq(self.cfg_en & min_above_min &
-                            min_below_max & max_below_max),
-            self.out_prot.eq(self.out_hit & (
-                (self.in_trans_type & ~self.cfg_wen) | (~self.in_trans_type & ~self.cfg_ren))),
-            self.out_addr.eq(self.in_addr_min - self.cfg_min + self.cfg_offset)
-        ]
-
-        return m
diff --git a/src/iommu/axi_rab/ram_tp_no_change.py b/src/iommu/axi_rab/ram_tp_no_change.py
deleted file mode 100644
index bdcd5550..00000000
--- a/src/iommu/axi_rab/ram_tp_no_change.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# /*
-# * ram_tp_no_change
-# *
-# * This code implements a parameterizable two-port memory. Port 0 can read and
-# * write while Port 1 can read only. The Xilinx tools will infer a BRAM with
-# * Port 0 in "no change" mode, i.e., during a write, it retains the last read
-# * value on the output. Port 1 (read-only) is in "write first" mode. Still, it
-# * outputs the old data during the write cycle. Note: Port 1 outputs invalid
-# * data in the cycle after the write when reading the same address.
-# *
-# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
-# */
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-from nmigen import Memory
-
-import math
-
-#
-# module ram_tp_no_change
-#  #(
-ADDR_WIDTH = 10
-DATA_WIDTH = 36
-#  )
-#  (
-#    input                   clk,
-#    input                   we,
-#    input  [ADDR_WIDTH-1:0] addr0,
-#    input  [ADDR_WIDTH-1:0] addr1,
-#    input  [DATA_WIDTH-1:0] d_i,
-#    output [DATA_WIDTH-1:0] d0_o,
-#    output [DATA_WIDTH-1:0] d1_o
-#  );
-
-
-class ram_tp_no_change(Elaboratable):
-
-    def __init__(self):
-        self.we = Signal()               # input
-        self.addr0 = Signal(ADDR_WIDTH)  # input
-        self.addr1 = Signal(ADDR_WIDTH)  # input
-        self.d_i = Signal(DATA_WIDTH)    # input
-        self.d0_o = Signal(DATA_WIDTH)   # output
-        self.d1_o = Signal(DATA_WIDTH)   # output
-
-        DEPTH = int(math.pow(2, ADDR_WIDTH))
-        self.ram = Memory(DATA_WIDTH, DEPTH)
-    #
-    #  localparam DEPTH = 2**ADDR_WIDTH;
-    #
-    #  (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
-    #                            reg [DATA_WIDTH-1:0] d0;
-    #                            reg [DATA_WIDTH-1:0] d1;
-    #
-    #  always_ff @(posedge clk) begin
-    #    if(we == 1'b1) begin
-    #      ram[addr0] <= d_i;
-    #    end else begin
-    # only change data if we==false
-    #      d0 <= ram[addr0];
-    #    end
-    #    d1   <= ram[addr1];
-    #  end
-    #
-    #  assign d0_o = d0;
-    #  assign d1_o = d1;
-    #
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.submodules.read_ram0 = read_ram0 = self.ram.read_port()
-        m.submodules.read_ram1 = read_ram1 = self.ram.read_port()
-        m.submodules.write_ram = write_ram = self.ram.write_port()
-
-        # write port
-        m.d.comb += write_ram.en.eq(self.we)
-        m.d.comb += write_ram.addr.eq(self.addr0)
-        m.d.comb += write_ram.data.eq(self.d_i)
-
-        # read ports
-        m.d.comb += read_ram0.addr.eq(self.addr0)
-        m.d.comb += read_ram1.addr.eq(self.addr1)
-        with m.If(self.we == 0):
-            m.d.sync += self.d0_o.eq(read_ram0.data)
-        m.d.sync += self.d1_o.eq(read_ram1.data)
-
-        return m
diff --git a/src/iommu/axi_rab/ram_tp_write_first.py b/src/iommu/axi_rab/ram_tp_write_first.py
deleted file mode 100644
index 7a21969c..00000000
--- a/src/iommu/axi_rab/ram_tp_write_first.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# /*
-# * ram_tp_write_first
-# *
-# * This code implements a parameterizable two-port memory. Port 0 can read and
-# * write while Port 1 can read only. Xilinx Vivado will infer a BRAM in
-# * "write first" mode, i.e., upon a read and write to the same address, the
-# * new value is read. Note: Port 1 outputs invalid data in the cycle after
-# * the write when reading the same address.
-# *
-# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
-# */
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-from nmigen import Memory
-
-import math
-#
-# module ram_tp_write_first
-#  #(
-ADDR_WIDTH = 10
-DATA_WIDTH = 36
-#  )
-#  (
-#    input                   clk,
-#    input                   we,
-#    input  [ADDR_WIDTH-1:0] addr0,
-#    input  [ADDR_WIDTH-1:0] addr1,
-#    input  [DATA_WIDTH-1:0] d_i,
-#    output [DATA_WIDTH-1:0] d0_o,
-#    output [DATA_WIDTH-1:0] d1_o
-#  );
-
-
-class ram_tp_write_first(Elaboratable):
-
-    def __init__(self):
-        self.we = Signal()               # input
-        self.addr0 = Signal(ADDR_WIDTH)  # input
-        self.addr1 = Signal(ADDR_WIDTH)  # input
-        self.d_i = Signal(DATA_WIDTH)    # input
-        self.d0_o = Signal(DATA_WIDTH)   # output
-        self.d1_o = Signal(DATA_WIDTH)   # output
-
-        DEPTH = int(math.pow(2, ADDR_WIDTH))
-        self.ram = Memory(DATA_WIDTH, DEPTH)
-
-    #
-    #  localparam DEPTH = 2**ADDR_WIDTH;
-    #
-    #  (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
-    #                            reg [ADDR_WIDTH-1:0] raddr0;
-    #                            reg [ADDR_WIDTH-1:0] raddr1;
-    #
-    #  always_ff @(posedge clk) begin
-    #    if(we == 1'b1) begin
-    #      ram[addr0] <= d_i;
-    #    end
-    #    raddr0 <= addr0;
-    #    raddr1 <= addr1;
-    #  end
-    #
-    #  assign d0_o = ram[raddr0];
-    #  assign d1_o = ram[raddr1];
-    #
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.submodules.read_ram0 = read_ram0 = self.ram.read_port()
-        m.submodules.read_ram1 = read_ram1 = self.ram.read_port()
-        m.submodules.write_ram = write_ram = self.ram.write_port()
-
-        # write port
-        m.d.comb += write_ram.en.eq(self.we)
-        m.d.comb += write_ram.addr.eq(self.addr0)
-        m.d.comb += write_ram.data.eq(self.d_i)
-
-        # read ports
-        m.d.comb += read_ram0.addr.eq(self.addr0)
-        m.d.comb += read_ram1.addr.eq(self.addr1)
-        m.d.sync += self.d0_o.eq(read_ram0.data)
-        m.d.sync += self.d1_o.eq(read_ram1.data)
-
-        return m
diff --git a/src/iommu/axi_rab/slice_top.py b/src/iommu/axi_rab/slice_top.py
deleted file mode 100644
index 6eedb1cd..00000000
--- a/src/iommu/axi_rab/slice_top.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-import rab_slice
-import coreconfig
-
-#
-# module slice_top
-# //#(
-#  //  parameter N_SLICES        = 16,
-#  //  parameter N_REGS          = 4*N_SLICES,
-#   // parameter ADDR_WIDTH_PHYS = 40,
-#   // parameter ADDR_WIDTH_VIRT = 32
-#  //  )
-#   (
-#    input   logic   [N_REGS-1:0] [63:0] int_cfg_regs,
-#    input   logic                       int_rw,
-#    input   logic [ADDR_WIDTH_VIRT-1:0] int_addr_min,
-#    input   logic [ADDR_WIDTH_VIRT-1:0] int_addr_max,
-#    input   logic                       multi_hit_allow,
-#    output  logic                       multi_hit,
-#    output  logic        [N_SLICES-1:0] prot,
-#    output  logic        [N_SLICES-1:0] hit,
-#    output  logic                       cache_coherent,
-#    output  logic [ADDR_WIDTH_PHYS-1:0] out_addr
-#  );
-#
-
-
-class slice_top(Elaboratable):
-
-    def __init__(self):
-        # FIXME self.int_cfg_regs = Signal()  # input
-        self.params = coreconfig.CoreConfig() # rename ?
-        self.int_rw = Signal()  # input
-        self.int_addr_min = Signal(self.params.ADDR_WIDTH_VIRT)  # input
-        self.int_addr_max = Signal(self.params.ADDR_WIDTH_VIRT)  # input
-        self.multi_hit_allow = Signal()  # input
-        self.multi_hit = Signal()  # output
-        self.prot = Signal(self.params.N_SLICES)  # output
-        self.hit = Signal(self.params.N_SLICES)  # output
-        self.cache_coherent = Signal()  # output
-        self.out_addr = Signal(self.params.ADDR_WIDTH_PHYS)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-
-        first_hit = Signal()
-
-        for i in range(self.params.N_SLICES):
-            # TODO pass params / core config here
-            u_slice = rab_slice.rab_slice(self.params)
-            setattr(m.submodules, "u_slice%d" % i, u_slice)
-            # TODO set param and connect ports
-
-        # In case of a multi hit, the lowest slice with a hit is selected.
-        # TODO always_comb begin : HIT_CHECK
-        m.d.comb += [
-            first_hit.eq(0),
-            self.multi_hit.eq(0),
-            self.out_addr.eq(0),
-            self.cache_coherent.eq(0)]
-
-        for j in range(self.params.N_SLICES):
-            with m.If(self.hit[j] == 1):
-                with m.If(first_hit == 1):
-                    with m.If(self.multi_hit_allow == 0):
-                        m.d.comb += [self.multi_hit.eq(1)]
-                with m.Elif(first_hit == 1):
-                    m.d.comb += [first_hit.eq(1)
-                                 # only output first slice that was hit
-                                 # SV self.out_addr.eq(slice_out_addr[ADDR_WIDTH_PHYS*j + : ADDR_WIDTH_PHYS]),
-                                 # SV self.cache_coherent.eq(int_cfg_regs[4*j+3][3]),
-                                 ]
-        return m
-
-  # TODO translate generate statement
-
-
-"""
-  logic [ADDR_WIDTH_PHYS*N_SLICES-1:0]  slice_out_addr;
-
-  generate
-    for ( i=0; i<N_SLICES; i++ )
-      begin
-        rab_slice
-          #(
-            .ADDR_WIDTH_PHYS ( ADDR_WIDTH_PHYS ),
-            .ADDR_WIDTH_VIRT ( ADDR_WIDTH_VIRT )
-            )
-          u_slice
-          (
-            .cfg_min       ( int_cfg_regs[4*i]  [ADDR_WIDTH_VIRT-1:0]                              ),
-            .cfg_max       ( int_cfg_regs[4*i+1][ADDR_WIDTH_VIRT-1:0]                              ),
-            .cfg_offset    ( int_cfg_regs[4*i+2][ADDR_WIDTH_PHYS-1:0]                              ),
-            .cfg_wen       ( int_cfg_regs[4*i+3][2]                                                ),
-            .cfg_ren       ( int_cfg_regs[4*i+3][1]                                                ),
-            .cfg_en        ( int_cfg_regs[4*i+3][0]                                                ),
-            .in_trans_type ( int_rw                                                                ),
-            .in_addr_min   ( int_addr_min                                                          ),
-            .in_addr_max   ( int_addr_max                                                          ),
-            .out_addr      ( slice_out_addr[ADDR_WIDTH_PHYS*i+ADDR_WIDTH_PHYS-1:ADDR_WIDTH_PHYS*i] ),
-            .out_prot      ( prot[i]                                                               ),
-            .out_hit       ( hit[i]                                                                )
-          );
-     end
-  endgenerate
-
-  // In case of a multi hit, the lowest slice with a hit is selected.
-  always_comb begin : HIT_CHECK
-    first_hit      =  0;
-    multi_hit      =  0;
-    out_addr       = '0;
-    cache_coherent =  0;
-    for (j = 0; j < N_SLICES; j++) begin
-      if (hit[j] == 1'b1) begin
-        if (first_hit == 1'b1) begin
-          if (multi_hit_allow == 1'b0) begin
-            multi_hit = 1'b1;
-          end
-        end else begin
-          first_hit       = 1'b1;
-          out_addr        = slice_out_addr[ADDR_WIDTH_PHYS*j +: ADDR_WIDTH_PHYS];
-          cache_coherent  = int_cfg_regs[4*j+3][3];
-        end
-      end
-    end
-  end
-"""
-
-# sv 2 migen: TODO add translate code for generate statements and for loops inside always_comb
diff --git a/src/iommu/axi_rab/test/test_ram_tp_no_change.py b/src/iommu/axi_rab/test/test_ram_tp_no_change.py
deleted file mode 100644
index 8d23ef05..00000000
--- a/src/iommu/axi_rab/test/test_ram_tp_no_change.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from ram_tp_write_first import ram_tp_write_first
-from nmigen.compat.sim import run_simulation
-import sys
-sys.path.append("../")
-
-
-def tbench(dut):
-    yield dut.we.eq(1)
-    for i in range(0, 255):
-        yield dut.addr0.eq(i)
-        yield dut.d_i.eq(i)
-        yield
-
-
-if __name__ == "__main__":
-    dut = ram_tp_write_first()
-    run_simulation(dut, tbench(dut), vcd_name="ram_tp_write_first.vcd")
-    print("ram_tp_write_first Unit Test Success")
diff --git a/src/iommu/axi_rab/test/test_slice_top.py b/src/iommu/axi_rab/test/test_slice_top.py
deleted file mode 100644
index c234b908..00000000
--- a/src/iommu/axi_rab/test/test_slice_top.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from nmigen.compat.sim import run_simulation
-import sys
-sys.path.append("../")
-# sys.path.append("../../../TestUtil")
-from slice_top import slice_top
-
-def tbench(dut):
-    yield
-
-
-if __name__ == "__main__":
-    dut = slice_top()
-    run_simulation(dut, tbench(dut), vcd_name="test_slice_top.vcd")
-    print("slice_top Unit Test Success")
diff --git a/src/regfile/regfile.py b/src/regfile/regfile.py
deleted file mode 100644
index b1d6f1c6..00000000
--- a/src/regfile/regfile.py
+++ /dev/null
@@ -1,290 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-
-from nmigen import Cat, Const, Array, Signal, Elaboratable, Module
-from nmutil.iocontrol import RecordObject
-
-from math import log
-from functools import reduce
-import operator
-
-
-class Register(Elaboratable):
-    def __init__(self, width, writethru=True):
-        self.width = width
-        self.writethru = writethru
-        self._rdports = []
-        self._wrports = []
-
-    def read_port(self, name=None):
-        port = RecordObject([("ren", 1),
-                             ("data_o", self.width)],
-                            name=name)
-        self._rdports.append(port)
-        return port
-
-    def write_port(self, name=None):
-        port = RecordObject([("wen", 1),
-                             ("data_i", self.width)],
-                            name=name)
-        self._wrports.append(port)
-        return port
-
-    def elaborate(self, platform):
-        m = Module()
-        self.reg = reg = Signal(self.width, name="reg")
-
-        # read ports. has write-through detection (returns data written)
-        for rp in self._rdports:
-            with m.If(rp.ren):
-                if self.writethru:
-                    wr_detect = Signal(reset_less=False)
-                    m.d.comb += wr_detect.eq(0)
-                    for wp in self._wrports:
-                        with m.If(wp.wen):
-                            m.d.comb += rp.data_o.eq(wp.data_i)
-                            m.d.comb += wr_detect.eq(1)
-                    with m.If(~wr_detect):
-                        m.d.comb += rp.data_o.eq(reg)
-                else:
-                    m.d.comb += rp.data_o.eq(reg)
-
-        # write ports, don't allow write to address 0 (ignore it)
-        for wp in self._wrports:
-            with m.If(wp.wen):
-                m.d.sync += reg.eq(wp.data_i)
-
-        return m
-
-    def __iter__(self):
-        for p in self._rdports:
-            yield from p
-        for p in self._wrports:
-            yield from p
-
-    def ports(self):
-        res = list(self)
-
-def treereduce(tree, attr="data_o"):
-    #print ("treereduce", tree)
-    if not isinstance(tree, list):
-        return tree
-    if len(tree) == 1:
-        return getattr(tree[0], attr)
-    if len(tree) == 2:
-        return getattr(tree[0], attr) | getattr(tree[1], attr)
-    split = len(tree) // 2
-    return treereduce(tree[:split], attr) | treereduce(tree[split:], attr)
-
-
-class RegFileArray(Elaboratable):
-    """ an array-based register file (register having write-through capability)
-        that has no "address" decoder, instead it has individual write-en
-        and read-en signals (per port).
-    """
-    def __init__(self, width, depth):
-        self.width = width
-        self.depth = depth
-        self.regs = Array(Register(width) for _ in range(self.depth))
-        self._rdports = []
-        self._wrports = []
-
-    def read_port(self, name=None):
-        regs = []
-        for i in range(self.depth):
-            port = self.regs[i].read_port(name)
-            regs.append(port)
-        regs = Array(regs)
-        port = RecordObject([("ren", self.depth),
-                             ("data_o", self.width)], name)
-        self._rdports.append((regs, port))
-        return port
-
-    def write_port(self, name=None):
-        regs = []
-        for i in range(self.depth):
-            port = self.regs[i].write_port(name)
-            regs.append(port)
-        regs = Array(regs)
-        port = RecordObject([("wen", self.depth),
-                             ("data_i", self.width)])
-        self._wrports.append((regs, port))
-        return port
-
-    def _get_en_sig(self, port, typ):
-        wen = []
-        for p in port:
-            wen.append(p[typ])
-        return Cat(*wen)
-
-    def elaborate(self, platform):
-        m = Module()
-        for i, reg in enumerate(self.regs):
-            setattr(m.submodules, "reg_%d" % i, reg)
-
-        for (regs, p) in self._rdports:
-            #print (p)
-            m.d.comb += self._get_en_sig(regs, 'ren').eq(p.ren)
-            ror = treereduce(list(regs))
-            m.d.comb += p.data_o.eq(ror)
-        for (regs, p) in self._wrports:
-            m.d.comb += self._get_en_sig(regs, 'wen').eq(p.wen)
-            for r in regs:
-                m.d.comb += r.data_i.eq(p.data_i)
-
-        return m
-
-    def __iter__(self):
-        for r in self.regs:
-            yield from r
-
-    def ports(self):
-        return list(self)
-
-
-class RegFile(Elaboratable):
-    def __init__(self, width, depth):
-        self.width = width
-        self.depth = depth
-        self._rdports = []
-        self._wrports = []
-
-    def read_port(self):
-        bsz = int(log(self.width) / log(2))
-        port = RecordObject([("raddr", bsz),
-                             ("ren", 1),
-                             ("data_o", self.width)])
-        self._rdports.append(port)
-        return port
-
-    def write_port(self):
-        bsz = int(log(self.width) / log(2))
-        port = RecordObject([("waddr", bsz),
-                             ("wen", 1),
-                             ("data_i", self.width)])
-        self._wrports.append(port)
-        return port
-
-    def elaborate(self, platform):
-        m = Module()
-        bsz = int(log(self.width) / log(2))
-        regs = Array(Signal(self.width, name="reg") for _ in range(self.depth))
-
-        # read ports. has write-through detection (returns data written)
-        for rp in self._rdports:
-            wr_detect = Signal(reset_less=False)
-            with m.If(rp.ren):
-                m.d.comb += wr_detect.eq(0)
-                for wp in self._wrports:
-                    addrmatch = Signal(reset_less=False)
-                    m.d.comb += addrmatch.eq(wp.waddr == rp.raddr)
-                    with m.If(wp.wen & addrmatch):
-                        m.d.comb += rp.data_o.eq(wp.data_i)
-                        m.d.comb += wr_detect.eq(1)
-                with m.If(~wr_detect):
-                    m.d.comb += rp.data_o.eq(regs[rp.raddr])
-
-        # write ports, don't allow write to address 0 (ignore it)
-        for wp in self._wrports:
-            with m.If(wp.wen & (wp.waddr != Const(0, bsz))):
-                m.d.sync += regs[wp.waddr].eq(wp.data_i)
-
-        return m
-
-    def __iter__(self):
-        yield from self._rdports
-        yield from self._wrports
-
-    def ports(self):
-        res = list(self)
-        for r in res:
-            if isinstance(r, RecordObject):
-                yield from r
-            else:
-                yield r
-
-def regfile_sim(dut, rp, wp):
-    yield wp.waddr.eq(1)
-    yield wp.data_i.eq(2)
-    yield wp.wen.eq(1)
-    yield
-    yield wp.wen.eq(0)
-    yield rp.ren.eq(1)
-    yield rp.raddr.eq(1)
-    yield
-    data = yield rp.data_o
-    print (data)
-    assert data == 2
-
-    yield wp.waddr.eq(5)
-    yield rp.raddr.eq(5)
-    yield rp.ren.eq(1)
-    yield wp.wen.eq(1)
-    yield wp.data_i.eq(6)
-    data = yield rp.data_o
-    print (data)
-    yield
-    yield wp.wen.eq(0)
-    yield rp.ren.eq(0)
-    data = yield rp.data_o
-    print (data)
-    assert data == 6
-    yield
-    data = yield rp.data_o
-    print (data)
-
-def regfile_array_sim(dut, rp1, rp2, wp):
-    yield wp.data_i.eq(2)
-    yield wp.wen.eq(1<<1)
-    yield
-    yield wp.wen.eq(0)
-    yield rp1.ren.eq(1<<1)
-    yield
-    data = yield rp1.data_o
-    print (data)
-    assert data == 2
-
-    yield rp1.ren.eq(1<<5)
-    yield rp2.ren.eq(1<<1)
-    yield wp.wen.eq(1<<5)
-    yield wp.data_i.eq(6)
-    data = yield rp1.data_o
-    print (data)
-    yield
-    yield wp.wen.eq(0)
-    yield rp1.ren.eq(0)
-    yield rp2.ren.eq(0)
-    data1 = yield rp1.data_o
-    print (data1)
-    data2 = yield rp2.data_o
-    print (data2)
-    assert data1 == 6
-    yield
-    data = yield rp1.data_o
-    print (data)
-
-def test_regfile():
-    dut = RegFile(32, 8)
-    rp = dut.read_port()
-    wp = dut.write_port()
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_regfile.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, regfile_sim(dut, rp, wp), vcd_name='test_regfile.vcd')
-
-    dut = RegFileArray(32, 8)
-    rp1 = dut.read_port("read1")
-    rp2 = dut.read_port("read2")
-    wp = dut.write_port("write")
-    ports=dut.ports()
-    print ("ports", ports)
-    vl = rtlil.convert(dut, ports=ports)
-    with open("test_regfile_array.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, regfile_array_sim(dut, rp1, rp2, wp),
-                   vcd_name='test_regfile_array.vcd')
-
-if __name__ == '__main__':
-    test_regfile()
diff --git a/src/scoreboard/__init__.py b/src/scoreboard/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/scoreboard/addr_match.py b/src/scoreboard/addr_match.py
deleted file mode 100644
index e42bbe52..00000000
--- a/src/scoreboard/addr_match.py
+++ /dev/null
@@ -1,130 +0,0 @@
-""" Load / Store partial address matcher
-
-Loads and Stores do not need a full match (CAM), they need "good enough"
-avoidance.  Around 11 bits on a 64-bit address is "good enough".
-
-The simplest way to use this module is to ignore not only the top bits,
-but also the bottom bits as well: in this case (this RV64 processor),
-enough to cover a DWORD (64-bit).  that means ignore the bottom 4 bits,
-due to the possibility of 64-bit LD/ST being misaligned.
-
-To reiterate: the use of this module is an *optimisation*.  All it has
-to do is cover the cases that are *definitely* matches (by checking 11
-bits or so), and if a few opportunities for parallel LD/STs are missed
-because the top (or bottom) bits weren't checked, so what: all that
-happens is: the mis-matched addresses are LD/STd on single-cycles. Big Deal.
-
-However, if we wanted to enhance this algorithm (without using a CAM and
-without using expensive comparators) probably the best way to do so would
-be to turn the last 16 bits into a byte-level bitmap.  LD/ST on a byte
-would have 1 of the 16 bits set.  LD/ST on a DWORD would have 8 of the 16
-bits set (offset if the LD/ST was misaligned).  TODO.
-
-Notes:
-
-> I have used bits <11:6> as they are not translated (4KB pages)
-> and larger than a cache line (64 bytes).
-> I have used bits <11:4> when the L1 cache was QuadW sized and
-> the L2 cache was Line sized.
-"""
-
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Const, Array, Cat, Elaboratable
-
-from nmutil.latch import latchregister, SRLatch
-
-
-class PartialAddrMatch(Elaboratable):
-    """A partial address matcher
-    """
-    def __init__(self, n_adr, bitwid):
-        self.n_adr = n_adr
-        self.bitwid = bitwid
-        # inputs
-        self.addrs_i = Array(Signal(bitwid, name="addr") for i in range(n_adr))
-        self.addr_we_i = Signal(n_adr) # write-enable for incoming address
-        self.addr_en_i = Signal(n_adr) # address latched in
-        self.addr_rs_i = Signal(n_adr) # address deactivated
-
-        # output
-        self.addr_nomatch_o = Signal(n_adr, name="nomatch_o")
-        self.addr_nomatch_a_o = Array(Signal(n_adr, name="nomatch_array_o") \
-                                  for i in range(n_adr))
-
-    def elaborate(self, platform):
-        m = Module()
-        return self._elaborate(m, platform)
-
-    def _elaborate(self, m, platform):
-        comb = m.d.comb
-        sync = m.d.sync
-
-        m.submodules.l = l = SRLatch(llen=self.n_adr, sync=False)
-        addrs_r = Array(Signal(self.bitwid, name="a_r") \
-                                for i in range(self.n_adr))
-
-        # latch set/reset
-        comb += l.s.eq(self.addr_en_i)
-        comb += l.r.eq(self.addr_rs_i)
-
-        # copy in addresses (and "enable" signals)
-        for i in range(self.n_adr):
-            latchregister(m, self.addrs_i[i], addrs_r[i], l.q[i])
-
-        # is there a clash, yes/no
-        matchgrp = []
-        for i in range(self.n_adr):
-            match = []
-            for j in range(self.n_adr):
-                if i == j:
-                    match.append(Const(0)) # don't match against self!
-                else:
-                    match.append(addrs_r[i] == addrs_r[j])
-            comb += self.addr_nomatch_a_o[i].eq(~Cat(*match) & l.q)
-            matchgrp.append(self.addr_nomatch_a_o[i] == l.q)
-        comb += self.addr_nomatch_o.eq(Cat(*matchgrp) & l.q)
-            
-        return m
-
-    def __iter__(self):
-        yield from self.addrs_i
-        yield self.addr_we_i
-        yield self.addr_en_i
-        yield from self.addr_nomatch_a_o
-        yield self.addr_nomatch_o
-
-    def ports(self):
-        return list(self)
-
-
-def part_addr_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_part_addr():
-    dut = PartialAddrMatch(3, 10)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_part_addr.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, part_addr_sim(dut), vcd_name='test_part_addr.vcd')
-
-if __name__ == '__main__':
-    test_part_addr()
diff --git a/src/scoreboard/dependence_cell.py b/src/scoreboard/dependence_cell.py
deleted file mode 100644
index 16108229..00000000
--- a/src/scoreboard/dependence_cell.py
+++ /dev/null
@@ -1,169 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
-from nmutil.latch import SRLatch
-from functools import reduce
-from operator import or_
-
-
-class DependencyRow(Elaboratable):
-    """ implements 11.4.7 mitch alsup dependence cell, p27
-        adjusted to be clock-sync'd on rising edge only.
-        mitch design (as does 6600) requires alternating rising/falling clock
-
-        * SET mode: issue_i HI, go_i LO, reg_i HI - register is captured
-                                                  - FWD is DISABLED (~issue_i)
-                                                  - RSEL DISABLED
-        * QRY mode: issue_i LO, go_i LO, haz_i HI - FWD is ASSERTED
-                                         reg_i HI - ignored
-        * GO mode : issue_i LO, go_i HI           - RSEL is ASSERTED
-                                         haz_i HI - FWD still can be ASSERTED
-
-        FWD assertion (hazard protection) therefore still occurs in both
-        Query and Go Modes, for this cycle, due to the cq register
-
-        GO mode works for one cycle, again due to the cq register capturing
-        the latch output.  Without the cq register, the SR Latch (which is
-        asynchronous) would be reset at the exact moment that GO was requested,
-        and the RSEL would be garbage.
-    """
-    def __init__(self, n_reg, n_src, cancel_mode=False):
-        self.cancel_mode = cancel_mode
-        self.n_reg = n_reg
-        self.n_src = n_src
-        # arrays
-        src = []
-        rsel = []
-        fwd = []
-        for i in range(n_src):
-            j = i + 1 # name numbering to match src1/src2
-            src.append(Signal(n_reg, name="src%d" % j, reset_less=True))
-            rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True))
-            fwd.append(Signal(n_reg, name="src%d_fwd_o" % j, reset_less=True))
-
-        # inputs
-        self.dest_i = Signal(n_reg, reset_less=True)     # Dest in (top)
-        self.src_i = Array(src)     # operands in (top)
-        self.issue_i = Signal(reset_less=True)    # Issue in (top)
-
-        self.rd_pend_i = Signal(n_reg, reset_less=True) # Read pend in (top)
-        self.wr_pend_i = Signal(n_reg, reset_less=True) # Write pend in (top)
-        self.v_rd_rsel_o = Signal(n_reg, reset_less=True) # Read pend out (bot)
-        self.v_wr_rsel_o = Signal(n_reg, reset_less=True) # Write pend out (bot)
-
-        self.go_wr_i = Signal(reset_less=True) # Go Write in (left)
-        self.go_rd_i = Signal(reset_less=True)  # Go Read in (left)
-        if self.cancel_mode:
-            self.go_die_i = Signal(n_reg, reset_less=True) # Go Die in (left)
-        else:
-            self.go_die_i = Signal(reset_less=True) # Go Die in (left)
-
-        # for Register File Select Lines (vertical)
-        self.dest_rsel_o = Signal(n_reg, reset_less=True)  # dest reg sel (bot)
-        self.src_rsel_o = Array(rsel)   # src reg sel (bot)
-        self.src2_rsel_o = Signal(n_reg, reset_less=True)  # src2 reg sel (bot)
-
-        # for Function Unit "forward progress" (horizontal)
-        self.dest_fwd_o = Signal(n_reg, reset_less=True)   # dest FU fw (right)
-        self.src_fwd_o = Array(fwd)    # src FU fw (right)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.dest_c = dest_c = SRLatch(sync=False, llen=self.n_reg)
-        src_c = []
-        for i in range(self.n_src):
-            src_l = SRLatch(sync=False, llen=self.n_reg)
-            setattr(m.submodules, "src%d_c" % (i+1), src_l)
-            src_c.append(src_l)
-
-        # connect go_rd / go_wr (dest->wr, src->rd)
-        wr_die = Signal(self.n_reg, reset_less=True)
-        rd_die = Signal(self.n_reg, reset_less=True)
-        if self.cancel_mode:
-            go_die = self.go_die_i
-        else:
-            go_die = Repl(self.go_die_i, self.n_reg)
-        m.d.comb += wr_die.eq(Repl(self.go_wr_i, self.n_reg) | go_die)
-        m.d.comb += rd_die.eq(Repl(self.go_rd_i, self.n_reg) | go_die)
-        m.d.comb += dest_c.r.eq(wr_die)
-        for i in range(self.n_src):
-            m.d.comb += src_c[i].r.eq(rd_die)
-
-        # connect input reg bit (unary)
-        i_ext = Repl(self.issue_i, self.n_reg)
-        m.d.comb += dest_c.s.eq(i_ext & self.dest_i)
-        for i in range(self.n_src):
-            m.d.comb += src_c[i].s.eq(i_ext & self.src_i[i])
-
-        # connect up hazard checks: read-after-write and write-after-read
-        m.d.comb += self.dest_fwd_o.eq(dest_c.q & self.rd_pend_i)
-        for i in range(self.n_src):
-            m.d.comb += self.src_fwd_o[i].eq(src_c[i].q & self.wr_pend_i)
-
-        # connect reg-sel outputs
-        rd_ext = Repl(self.go_rd_i, self.n_reg)
-        wr_ext = Repl(self.go_wr_i, self.n_reg)
-        m.d.comb += self.dest_rsel_o.eq(dest_c.qlq & wr_ext)
-        for i in range(self.n_src):
-            m.d.comb += self.src_rsel_o[i].eq(src_c[i].qlq & rd_ext)
-
-        # to be accumulated to indicate if register is in use (globally)
-        # after ORing, is fed back in to rd_pend_i / wr_pend_i
-        src_q = []
-        for i in range(self.n_src):
-            src_q.append(src_c[i].qlq)
-        m.d.comb += self.v_rd_rsel_o.eq(reduce(or_, src_q))
-        m.d.comb += self.v_wr_rsel_o.eq(dest_c.qlq)
-
-        return m
-
-    def __iter__(self):
-        yield self.dest_i
-        yield from self.src_i
-        yield self.rd_pend_i
-        yield self.wr_pend_i
-        yield self.issue_i
-        yield self.go_wr_i
-        yield self.go_rd_i
-        yield self.go_die_i
-        yield self.dest_rsel_o
-        yield from self.src_rsel_o
-        yield self.dest_fwd_o
-        yield from self.src_fwd_o
-
-    def ports(self):
-        return list(self)
-
-
-def dcell_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_dcell():
-    dut = DependencyRow(4, 2, True)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_drow.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, dcell_sim(dut), vcd_name='test_dcell.vcd')
-
-if __name__ == '__main__':
-    test_dcell()
diff --git a/src/scoreboard/fn_unit.py b/src/scoreboard/fn_unit.py
deleted file mode 100644
index 63beb70b..00000000
--- a/src/scoreboard/fn_unit.py
+++ /dev/null
@@ -1,321 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Cat, Array, Const, Elaboratable
-from nmigen.lib.coding import Decoder
-
-from nmutil.latch import SRLatch, latchregister
-
-from scoreboard.shadow import Shadow
-
-
-class FnUnit(Elaboratable):
-    """ implements 11.4.8 function unit, p31
-        also implements optional shadowing 11.5.1, p55
-
-        shadowing can be used for branches as well as exceptions (interrupts),
-        load/store hold (exceptions again), and vector-element predication
-        (once the predicate is known, which it may not be at instruction issue)
-
-        Inputs
-
-        * :wid:         register file width
-        * :shadow_wid:  number of shadow/fail/good/go_die sets
-        * :n_dests:     number of destination regfile(s) (index: rfile_sel_i)
-        * :wr_pend:     if true, writable observes the g_wr_pend_i vector
-                        otherwise observes g_rd_pend_i
-
-        notes:
-
-        * dest_i / src1_i / src2_i are in *binary*, whereas...
-        * ...g_rd_pend_i / g_wr_pend_i and rd_pend_o / wr_pend_o are UNARY
-        * req_rel_i (request release) is the direct equivalent of pipeline
-                    "output valid" (valid_o)
-        * recover is a local python variable (actually go_die_o)
-        * when shadow_wid = 0, recover and shadown are Consts (i.e. do nothing)
-        * wr_pend is set False for the majority of uses: however for
-          use in a STORE Function Unit it is set to True
-    """
-    def __init__(self, wid, shadow_wid=0, n_dests=1, wr_pend=False):
-        self.reg_width = wid
-        self.n_dests = n_dests
-        self.shadow_wid = shadow_wid
-        self.wr_pend = wr_pend
-
-        # inputs
-        if n_dests > 1:
-            self.rfile_sel_i = Signal(max=n_dests, reset_less=True)
-        else:
-            self.rfile_sel_i = Const(0) # no selection.  gets Array[0]
-        self.dest_i = Signal(max=wid, reset_less=True) # Dest R# in (top)
-        self.src1_i = Signal(max=wid, reset_less=True) # oper1 R# in (top)
-        self.src2_i = Signal(max=wid, reset_less=True) # oper2 R# in (top)
-        self.issue_i = Signal(reset_less=True)    # Issue in (top)
-
-        self.go_wr_i = Signal(reset_less=True) # Go Write in (left)
-        self.go_rd_i = Signal(reset_less=True)  # Go Read in (left)
-        self.req_rel_i = Signal(reset_less=True)  # request release (left)
-
-        self.g_xx_pend_i = Array(Signal(wid, reset_less=True, name="g_pend_i") \
-                               for i in range(n_dests)) # global rd (right)
-        self.g_wr_pend_i = Signal(wid, reset_less=True) # global wr (right)
-
-        if shadow_wid:
-            self.shadow_i = Signal(shadow_wid, reset_less=True)
-            self.s_fail_i  = Signal(shadow_wid, reset_less=True)
-            self.s_good_i  = Signal(shadow_wid, reset_less=True)
-            self.go_die_o  = Signal(reset_less=True)
-
-        # outputs
-        self.readable_o = Signal(reset_less=True) # Readable out (right)
-        self.writable_o = Array(Signal(reset_less=True, name="writable_o") \
-                               for i in range(n_dests)) # writable out (right)
-        self.busy_o = Signal(reset_less=True) # busy out (left)
-
-        self.src1_pend_o = Signal(wid, reset_less=True) # src1 pending
-        self.src2_pend_o = Signal(wid, reset_less=True) # src1 pending
-        self.rd_pend_o = Signal(wid, reset_less=True) # rd pending (right)
-        self.xx_pend_o = Array(Signal(wid, reset_less=True, name="pend_o") \
-                               for i in range(n_dests))# wr pending (right)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.rd_l = rd_l = SRLatch(sync=False)
-        m.submodules.wr_l = wr_l = SRLatch(sync=False)
-        m.submodules.dest_d = dest_d = Decoder(self.reg_width)
-        m.submodules.src1_d = src1_d = Decoder(self.reg_width)
-        m.submodules.src2_d = src2_d = Decoder(self.reg_width)
-
-        # shadow / recover (optional: shadow_wid > 0)
-        m.submodules.shadow = shadow = Shadow(self.shadow_wid)
-        if self.shadow_wid:
-            m.d.comb += shadow.issue_i.eq(self.issue_i)
-            m.d.comb += shadow.s_fail_i.eq(self.s_fail_i)
-            m.d.comb += shadow.s_good_i.eq(self.s_good_i)
-            m.d.comb += shadow.shadow_i.eq(self.shadow_i)
-        shadown = shadow.shadown_o
-        recover = shadow.go_die_o
-
-        # selector
-        xx_pend_o = self.xx_pend_o[self.rfile_sel_i]
-        writable_o = self.writable_o[self.rfile_sel_i]
-        g_pend_i = self.g_xx_pend_i[self.rfile_sel_i]
-
-        for i in range(self.n_dests):
-            m.d.comb += self.xx_pend_o[i].eq(0)  # initialise all array
-            m.d.comb += self.writable_o[i].eq(0) # to zero
-        m.d.comb += self.readable_o.eq(0) # to zero
-
-        # go_wr latch: reset on go_wr HI, set on issue
-        m.d.comb += wr_l.s.eq(self.issue_i)
-        m.d.comb += wr_l.r.eq(self.go_wr_i | recover)
-
-        # src1 latch: reset on go_rd HI, set on issue
-        m.d.comb += rd_l.s.eq(self.issue_i)
-        m.d.comb += rd_l.r.eq(self.go_rd_i | recover)
-
-        # latch/registers for dest / src1 / src2
-        dest_r = Signal(max=self.reg_width, reset_less=True)
-        src1_r = Signal(max=self.reg_width, reset_less=True)
-        src2_r = Signal(max=self.reg_width, reset_less=True)
-        # XXX latch based on *issue* rather than !latch (as in book)
-        latchregister(m, self.dest_i, dest_r, self.issue_i) #wr_l.qn)
-        latchregister(m, self.src1_i, src1_r, self.issue_i) #wr_l.qn)
-        latchregister(m, self.src2_i, src2_r, self.issue_i) #wr_l.qn)
-
-        # dest decoder (use dest reg as input): write-pending out
-        m.d.comb += dest_d.i.eq(dest_r)
-        m.d.comb += dest_d.n.eq(wr_l.qn) # decode is inverted
-        m.d.comb += self.busy_o.eq(wr_l.q) # busy if set
-        m.d.comb += xx_pend_o.eq(dest_d.o)
-
-        # src1/src2 decoder (use src1/2 regs as input): read-pending out
-        m.d.comb += src1_d.i.eq(src1_r)
-        m.d.comb += src1_d.n.eq(rd_l.qn) # decode is inverted
-        m.d.comb += src2_d.i.eq(src2_r)
-        m.d.comb += src2_d.n.eq(rd_l.qn) # decode is inverted
-        m.d.comb += self.src1_pend_o.eq(src1_d.o)
-        m.d.comb += self.src2_pend_o.eq(src2_d.o)
-        m.d.comb += self.rd_pend_o.eq(src1_d.o | src2_d.o)
-
-        # readable output signal
-        g_rd = Signal(self.reg_width, reset_less=True)
-        ro = Signal(reset_less=True)
-        m.d.comb += g_rd.eq(~self.g_wr_pend_i & self.rd_pend_o)
-        m.d.comb += ro.eq(~g_rd.bool())
-        m.d.comb += self.readable_o.eq(ro)
-
-        # writable output signal
-        g_wr_v = Signal(self.reg_width, reset_less=True)
-        g_wr = Signal(reset_less=True)
-        wo = Signal(reset_less=True)
-        m.d.comb += g_wr_v.eq(g_pend_i & xx_pend_o)
-        m.d.comb += g_wr.eq(~g_wr_v.bool())
-        m.d.comb += wo.eq(g_wr & rd_l.qn & self.req_rel_i & shadown)
-        m.d.comb += writable_o.eq(wo)
-
-        return m
-
-    def __iter__(self):
-        yield self.dest_i
-        yield self.src1_i
-        yield self.src2_i
-        yield self.issue_i
-        yield self.go_wr_i
-        yield self.go_rd_i
-        yield self.req_rel_i
-        yield from self.g_xx_pend_i
-        yield self.g_wr_pend_i
-        yield self.readable_o
-        yield from self.writable_o
-        yield self.rd_pend_o
-        yield from self.xx_pend_o
-
-    def ports(self):
-        return list(self)
-
-#############                                     ###############
-# ---                                                       --- #
-# --- renamed / redirected from base class                  --- #
-# ---                                                       --- #
-# --- below are convenience classes which match the names   --- #
-# --- of the various mitch alsup book chapter gate diagrams --- #
-# ---                                                       --- #
-#############                                     ###############
-
-
-class IntFnUnit(FnUnit):
-    def __init__(self, wid, shadow_wid=0):
-        FnUnit.__init__(self, wid, shadow_wid)
-        self.int_rd_pend_o = self.rd_pend_o
-        self.int_wr_pend_o = self.xx_pend_o[0]
-        self.g_int_wr_pend_i = self.g_wr_pend_i
-        self.g_int_rd_pend_i = self.g_xx_pend_i[0]
-        self.int_readable_o = self.readable_o
-        self.int_writable_o = self.writable_o[0]
-
-        self.int_rd_pend_o.name = "int_rd_pend_o"
-        self.int_wr_pend_o.name = "int_wr_pend_o"
-        self.g_int_rd_pend_i.name = "g_int_rd_pend_i"
-        self.g_int_wr_pend_i.name = "g_int_wr_pend_i"
-        self.int_readable_o.name = "int_readable_o"
-        self.int_writable_o.name = "int_writable_o"
-
-
-class FPFnUnit(FnUnit):
-    def __init__(self, wid, shadow_wid=0):
-        FnUnit.__init__(self, wid, shadow_wid)
-        self.fp_rd_pend_o = self.rd_pend_o
-        self.fp_wr_pend_o = self.xx_pend_o[0]
-        self.g_fp_wr_pend_i = self.g_wr_pend_i
-        self.g_fp_rd_pend_i = self.g_xx_pend_i[0]
-        self.fp_writable_o = self.writable_o[0]
-        self.fp_readable_o = self.readable_o
-
-        self.fp_rd_pend_o.name = "fp_rd_pend_o"
-        self.fp_wr_pend_o.name = "fp_wr_pend_o"
-        self.g_fp_rd_pend_i.name = "g_fp_rd_pend_i"
-        self.g_fp_wr_pend_i.name = "g_fp_wr_pend_i"
-        self.fp_writable_o.name = "fp_writable_o"
-        self.fp_readable_o.name = "fp_readable_o"
-
-
-class LDFnUnit(FnUnit):
-    """ number of dest selectors: 2. assumes len(int_regfile) == len(fp_regfile)
-        * when rfile_sel_i == 0, int_wr_pend_o is set
-        * when rfile_sel_i == 1, fp_wr_pend_o is set
-    """
-    def __init__(self, wid, shadow_wid=0):
-        FnUnit.__init__(self, wid, shadow_wid, n_dests=2)
-        self.int_rd_pend_o = self.rd_pend_o
-        self.int_wr_pend_o = self.xx_pend_o[0]
-        self.fp_wr_pend_o = self.xx_pend_o[1]
-        self.g_int_wr_pend_i = self.g_wr_pend_i
-        self.g_int_rd_pend_i = self.g_xx_pend_i[0]
-        self.g_fp_rd_pend_i = self.g_xx_pend_i[1]
-        self.int_readable_o = self.readable_o
-        self.int_writable_o = self.writable_o[0]
-        self.fp_writable_o = self.writable_o[1]
-
-        self.int_rd_pend_o.name = "int_rd_pend_o"
-        self.int_wr_pend_o.name = "int_wr_pend_o"
-        self.fp_wr_pend_o.name = "fp_wr_pend_o"
-        self.g_int_wr_pend_i.name = "g_int_wr_pend_i"
-        self.g_int_rd_pend_i.name = "g_int_rd_pend_i"
-        self.g_fp_rd_pend_i.name = "g_fp_rd_pend_i"
-        self.int_readable_o.name = "int_readable_o"
-        self.int_writable_o.name = "int_writable_o"
-        self.fp_writable_o.name = "fp_writable_o"
-
-
-class STFnUnit(FnUnit):
-    """ number of dest selectors: 2. assumes len(int_regfile) == len(fp_regfile)
-        * wr_pend=False indicates to observe global fp write pending
-        * when rfile_sel_i == 0, int_wr_pend_o is set
-        * when rfile_sel_i == 1, fp_wr_pend_o is set
-        *
-    """
-    def __init__(self, wid, shadow_wid=0):
-        FnUnit.__init__(self, wid, shadow_wid, n_dests=2, wr_pend=True)
-        self.int_rd_pend_o = self.rd_pend_o     # 1st int read-pending vector
-        self.int2_rd_pend_o = self.xx_pend_o[0] # 2nd int read-pending vector
-        self.fp_rd_pend_o = self.xx_pend_o[1]   # 1x FP read-pending vector
-        # yes overwrite FnUnit base class g_wr_pend_i vector
-        self.g_int_wr_pend_i = self.g_wr_pend_i = self.g_xx_pend_i[0]
-        self.g_fp_wr_pend_i = self.g_xx_pend_i[1]
-        self.int_readable_o = self.readable_o
-        self.int_writable_o = self.writable_o[0]
-        self.fp_writable_o = self.writable_o[1]
-
-        self.int_rd_pend_o.name = "int_rd_pend_o"
-        self.int2_rd_pend_o.name = "int2_rd_pend_o"
-        self.fp_rd_pend_o.name = "fp_rd_pend_o"
-        self.g_int_wr_pend_i.name = "g_int_wr_pend_i"
-        self.g_fp_wr_pend_i.name = "g_fp_wr_pend_i"
-        self.int_readable_o.name = "int_readable_o"
-        self.int_writable_o.name = "int_writable_o"
-        self.fp_writable_o.name = "fp_writable_o"
-
-
-
-def int_fn_unit_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_int_fn_unit():
-    dut = FnUnit(32, 2, 2)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_fn_unit.il", "w") as f:
-        f.write(vl)
-
-    dut = LDFnUnit(32, 2)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_ld_fn_unit.il", "w") as f:
-        f.write(vl)
-
-    dut = STFnUnit(32, 0)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_st_fn_unit.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, int_fn_unit_sim(dut), vcd_name='test_fn_unit.vcd')
-
-if __name__ == '__main__':
-    test_int_fn_unit()
diff --git a/src/scoreboard/fu_dep_cell.py b/src/scoreboard/fu_dep_cell.py
deleted file mode 100644
index 9946dcb5..00000000
--- a/src/scoreboard/fu_dep_cell.py
+++ /dev/null
@@ -1,92 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Const, Elaboratable
-from nmutil.latch import SRLatch
-
-
-class FUDependenceCell(Elaboratable):
-    """ implements 11.4.7 mitch alsup dependence cell, p27
-    """
-    def __init__(self, dummy, n_fu=1):
-        self.n_fu = n_fu
-        self.dummy = Const(~(1<<dummy), n_fu)
-        # inputs
-        self.rd_pend_i = Signal(n_fu, reset_less=True) # read pend in (left)
-        self.wr_pend_i = Signal(n_fu, reset_less=True) # write pend in (left)
-        self.issue_i = Signal(n_fu, reset_less=True)    # Issue in (top)
-
-        self.go_wr_i = Signal(n_fu, reset_less=True) # Go Write in (left)
-        self.go_rd_i = Signal(n_fu, reset_less=True)  # Go Read in (left)
-        self.go_die_i = Signal(n_fu, reset_less=True) # Go Die in (left)
-
-        # outputs (latched rd/wr wait)
-        self.rd_wait_o = Signal(n_fu, reset_less=True) # read wait out (right)
-        self.wr_wait_o = Signal(n_fu, reset_less=True) # write wait out (right)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.rd_c = rd_c = SRLatch(sync=False, llen=self.n_fu)
-        m.submodules.wr_c = wr_c = SRLatch(sync=False, llen=self.n_fu)
-
-        # reset on go HI, set on dest and issue
-        m.d.comb += rd_c.s.eq(self.issue_i & self.rd_pend_i)
-        m.d.comb += wr_c.s.eq(self.issue_i & self.wr_pend_i)
-
-        # connect go_rd / go_wr 
-        m.d.comb += wr_c.r.eq(self.go_wr_i | self.go_die_i)
-        m.d.comb += rd_c.r.eq(self.go_rd_i | self.go_die_i)
-
-        # connect pend_i
-        m.d.comb += rd_c.s.eq(self.issue_i & self.rd_pend_i & self.dummy)
-        m.d.comb += wr_c.s.eq(self.issue_i & self.wr_pend_i & self.dummy)
-
-        # connect output
-        m.d.comb += self.rd_wait_o.eq(rd_c.qlq & ~self.issue_i)
-        m.d.comb += self.wr_wait_o.eq(wr_c.qlq & ~self.issue_i)
-
-        return m
-
-    def __iter__(self):
-        yield self.rd_pend_i
-        yield self.wr_pend_i
-        yield self.issue_i
-        yield self.go_wr_i
-        yield self.go_rd_i
-        yield self.go_die_i
-        yield self.rd_wait_o
-        yield self.wr_wait_o
-                
-    def ports(self):
-        return list(self)
-
-
-def dcell_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_dcell():
-    dut = FUDependenceCell(dummy=0, n_fu=4)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_fu_dcell.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, dcell_sim(dut), vcd_name='test_fu_dcell.vcd')
-
-if __name__ == '__main__':
-    test_dcell()
diff --git a/src/scoreboard/fu_fu_matrix.py b/src/scoreboard/fu_fu_matrix.py
deleted file mode 100644
index cc2c1b96..00000000
--- a/src/scoreboard/fu_fu_matrix.py
+++ /dev/null
@@ -1,155 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
-
-from .fu_dep_cell import FUDependenceCell
-from .fu_picker_vec import FU_Pick_Vec
-
-"""
-
- 6600 Function Unit Dependency Table Matrix inputs / outputs
- -----------------------------------------------------------
-
-"""
-
-class FUFUDepMatrix(Elaboratable):
-    """ implements 11.4.7 mitch alsup FU-to-Reg Dependency Matrix, p26
-    """
-    def __init__(self, n_fu_row, n_fu_col):
-        self.n_fu_row = n_fu_row                  # Y (FU row#)   ^v
-        self.n_fu_col = n_fu_col                # X (FU col #)  <>
-        self.rd_pend_i = Signal(n_fu_row, reset_less=True) # Rd pending (left)
-        self.wr_pend_i = Signal(n_fu_row, reset_less=True) # Wr pending (left)
-        self.issue_i = Signal(n_fu_col, reset_less=True)    # Issue in (top)
-
-        self.go_wr_i = Signal(n_fu_row, reset_less=True) # Go Write in (left)
-        self.go_rd_i = Signal(n_fu_row, reset_less=True)  # Go Read in (left)
-        self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left)
-
-        # for Function Unit Readable/Writable (horizontal)
-        self.readable_o = Signal(n_fu_col, reset_less=True) # readable (bot)
-        self.writable_o = Signal(n_fu_col, reset_less=True) # writable (bot)
-
-    def elaborate(self, platform):
-        m = Module()
-
-        # ---
-        # matrix of dependency cells
-        # ---
-        dm = Array(FUDependenceCell(f, self.n_fu_col) \
-                                            for f in range(self.n_fu_row))
-        for y in range(self.n_fu_row):
-                setattr(m.submodules, "dm%d" % y, dm[y])
-
-        # ---
-        # array of Function Unit Readable/Writable: row-length, horizontal
-        # ---
-        fur = Array(FU_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
-        for x in range(self.n_fu_col):
-            setattr(m.submodules, "fur_x%d" % (x), fur[x])
-
-        # ---
-        # connect FU Readable/Writable vector
-        # ---
-        readable = []
-        writable = []
-        for y in range(self.n_fu_row):
-            fu = fur[y]
-            # accumulate Readable/Writable Vector outputs
-            readable.append(fu.readable_o)
-            writable.append(fu.writable_o)
-
-        # ... and output them from this module (horizontal, width=REGs)
-        m.d.comb += self.readable_o.eq(Cat(*readable))
-        m.d.comb += self.writable_o.eq(Cat(*writable))
-
-        # ---
-        # connect FU Pending
-        # ---
-        for y in range(self.n_fu_row):
-            dc = dm[y]
-            fu = fur[y]
-            # connect cell reg-select outputs to Reg Vector In
-            m.d.comb += [fu.rd_pend_i.eq(dc.rd_wait_o),
-                         fu.wr_pend_i.eq(dc.wr_wait_o),
-                        ]
-
-        # ---
-        # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i
-        # ---
-        for x in range(self.n_fu_col):
-            issue_i = []
-            for y in range(self.n_fu_row):
-                dc = dm[y]
-                # accumulate cell inputs issue
-                issue_i.append(dc.issue_i[x])
-            # wire up inputs from module to row cell inputs
-            m.d.comb += Cat(*issue_i).eq(self.issue_i)
-
-        # ---
-        # connect Matrix go_rd_i/go_wr_i to module readable/writable
-        # ---
-        for y in range(self.n_fu_row):
-            dc = dm[y]
-            # wire up inputs from module to row cell inputs
-            m.d.comb += [dc.go_rd_i.eq(self.go_rd_i),
-                         dc.go_wr_i.eq(self.go_wr_i),
-                         dc.go_die_i.eq(self.go_die_i),
-                        ]
-
-        # ---
-        # connect Matrix pending
-        # ---
-        for y in range(self.n_fu_row):
-            dc = dm[y]
-            # wire up inputs from module to row cell inputs
-            m.d.comb += [dc.rd_pend_i.eq(self.rd_pend_i),
-                         dc.wr_pend_i.eq(self.wr_pend_i),
-                        ]
-
-        return m
-
-    def __iter__(self):
-        yield self.rd_pend_i
-        yield self.wr_pend_i
-        yield self.issue_i
-        yield self.go_wr_i
-        yield self.go_rd_i
-        yield self.readable_o
-        yield self.writable_o
-                
-    def ports(self):
-        return list(self)
-
-def d_matrix_sim(dut):
-    """ XXX TODO
-    """
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_fu_fu_matrix():
-    dut = FUFUDepMatrix(n_fu_row=3, n_fu_col=4)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_fu_fu_matrix.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_fu_matrix.vcd')
-
-if __name__ == '__main__':
-    test_fu_fu_matrix()
diff --git a/src/scoreboard/fu_mem_matrix.py b/src/scoreboard/fu_mem_matrix.py
deleted file mode 100644
index baaa02be..00000000
--- a/src/scoreboard/fu_mem_matrix.py
+++ /dev/null
@@ -1,155 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
-
-from scoreboard.fumem_dep_cell import FUMemDependenceCell
-from scoreboard.fu_mem_picker_vec import FUMem_Pick_Vec
-
-"""
-
- 6600 Function Unit Dependency Table Matrix inputs / outputs
- -----------------------------------------------------------
-
-"""
-
-class FUMemDepMatrix(Elaboratable):
-    """ implements FU-to-FU Memory Dependency Matrix
-    """
-    def __init__(self, n_fu_row, n_fu_col):
-        self.n_fu_row = n_fu_row               # Y (FU row#)   ^v
-        self.n_fu_col = n_fu_col                # X (FU col #)  <>
-        self.st_pend_i = Signal(n_fu_row, reset_less=True) # Rd pending (left)
-        self.ld_pend_i = Signal(n_fu_row, reset_less=True) # Wr pending (left)
-        self.issue_i = Signal(n_fu_col, reset_less=True)    # Issue in (top)
-
-        self.go_ld_i = Signal(n_fu_row, reset_less=True) # Go Write in (left)
-        self.go_st_i = Signal(n_fu_row, reset_less=True)  # Go Read in (left)
-        self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left)
-
-        # for Function Unit Readable/Writable (horizontal)
-        self.storable_o = Signal(n_fu_col, reset_less=True) # storable (bot)
-        self.loadable_o = Signal(n_fu_col, reset_less=True) # loadable (bot)
-
-    def elaborate(self, platform):
-        m = Module()
-
-        # ---
-        # matrix of dependency cells
-        # ---
-        dm = Array(FUMemDependenceCell(f, self.n_fu_col) \
-                                            for f in range(self.n_fu_row))
-        for y in range(self.n_fu_row):
-                setattr(m.submodules, "dm%d" % y, dm[y])
-
-        # ---
-        # array of Function Unit Readable/Writable: row-length, horizontal
-        # ---
-        fur = Array(FUMem_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
-        for x in range(self.n_fu_col):
-            setattr(m.submodules, "fur_x%d" % (x), fur[x])
-
-        # ---
-        # connect FU Readable/Writable vector
-        # ---
-        storable = []
-        loadable = []
-        for y in range(self.n_fu_row):
-            fu = fur[y]
-            # accumulate Readable/Writable Vector outputs
-            storable.append(fu.storable_o)
-            loadable.append(fu.loadable_o)
-
-        # ... and output them from this module (horizontal, width=REGs)
-        m.d.comb += self.storable_o.eq(Cat(*storable))
-        m.d.comb += self.loadable_o.eq(Cat(*loadable))
-
-        # ---
-        # connect FU Pending
-        # ---
-        for y in range(self.n_fu_row):
-            dc = dm[y]
-            fu = fur[y]
-            # connect cell reg-select outputs to Reg Vector In
-            m.d.comb += [fu.st_pend_i.eq(dc.st_wait_o),
-                         fu.ld_pend_i.eq(dc.ld_wait_o),
-                        ]
-
-        # ---
-        # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i
-        # ---
-        for x in range(self.n_fu_col):
-            issue_i = []
-            for y in range(self.n_fu_row):
-                dc = dm[y]
-                # accumulate cell inputs issue
-                issue_i.append(dc.issue_i[x])
-            # wire up inputs from module to row cell inputs
-            m.d.comb += Cat(*issue_i).eq(self.issue_i)
-
-        # ---
-        # connect Matrix go_st_i/go_ld_i to module storable/loadable
-        # ---
-        for y in range(self.n_fu_row):
-            dc = dm[y]
-            # wire up inputs from module to row cell inputs
-            m.d.comb += [dc.go_st_i.eq(self.go_st_i),
-                         dc.go_ld_i.eq(self.go_ld_i),
-                         dc.go_die_i.eq(self.go_die_i),
-                        ]
-
-        # ---
-        # connect Matrix pending
-        # ---
-        for y in range(self.n_fu_row):
-            dc = dm[y]
-            # wire up inputs from module to row cell inputs
-            m.d.comb += [dc.st_pend_i.eq(self.st_pend_i),
-                         dc.ld_pend_i.eq(self.ld_pend_i),
-                        ]
-
-        return m
-
-    def __iter__(self):
-        yield self.st_pend_i
-        yield self.ld_pend_i
-        yield self.issue_i
-        yield self.go_ld_i
-        yield self.go_st_i
-        yield self.storable_o
-        yield self.loadable_o
-                
-    def ports(self):
-        return list(self)
-
-def d_matrix_sim(dut):
-    """ XXX TODO
-    """
-    yield dut.ld_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.st_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_st_i.eq(1)
-    yield
-    yield dut.go_st_i.eq(0)
-    yield
-    yield dut.go_ld_i.eq(1)
-    yield
-    yield dut.go_ld_i.eq(0)
-    yield
-
-def test_fu_fu_matrix():
-    dut = FUMemDepMatrix(n_fu_row=3, n_fu_col=3)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_fu_mem_matrix.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_mem_matrix.vcd')
-
-if __name__ == '__main__':
-    test_fu_fu_matrix()
diff --git a/src/scoreboard/fu_mem_picker_vec.py b/src/scoreboard/fu_mem_picker_vec.py
deleted file mode 100644
index dc40bd09..00000000
--- a/src/scoreboard/fu_mem_picker_vec.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from nmigen import Elaboratable, Module, Signal, Cat
-
-
-class FUMem_Pick_Vec(Elaboratable):
-    """ these are allocated per-FU (horizontally),
-        and are of length fu_row_n
-    """
-    def __init__(self, fu_row_n):
-        self.fu_row_n = fu_row_n
-        self.st_pend_i = Signal(fu_row_n, reset_less=True)
-        self.ld_pend_i = Signal(fu_row_n, reset_less=True)
-
-        self.storable_o = Signal(reset_less=True)
-        self.loadable_o = Signal(reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-
-        # Readable if there are no writes pending
-        m.d.comb += self.storable_o.eq(~self.ld_pend_i.bool())
-
-        # Writable if there are no reads pending
-        m.d.comb += self.loadable_o.eq(~self.st_pend_i.bool())
-
-        return m
-
diff --git a/src/scoreboard/fu_picker_vec.py b/src/scoreboard/fu_picker_vec.py
deleted file mode 100644
index d38bbfae..00000000
--- a/src/scoreboard/fu_picker_vec.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from nmigen import Elaboratable, Module, Signal, Cat
-
-
-class FU_Pick_Vec(Elaboratable):
-    """ these are allocated per-FU (horizontally),
-        and are of length fu_row_n
-    """
-    def __init__(self, fu_row_n):
-        self.fu_row_n = fu_row_n
-        self.rd_pend_i = Signal(fu_row_n, reset_less=True)
-        self.wr_pend_i = Signal(fu_row_n, reset_less=True)
-
-        self.readable_o = Signal(reset_less=True)
-        self.writable_o = Signal(reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-
-        # Readable if there are no writes pending
-        m.d.comb += self.readable_o.eq(~self.wr_pend_i.bool())
-
-        # Writable if there are no reads pending
-        m.d.comb += self.writable_o.eq(~self.rd_pend_i.bool())
-
-        return m
-
diff --git a/src/scoreboard/fu_reg_matrix.py b/src/scoreboard/fu_reg_matrix.py
deleted file mode 100644
index 8ca1494e..00000000
--- a/src/scoreboard/fu_reg_matrix.py
+++ /dev/null
@@ -1,304 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
-
-from scoreboard.dependence_cell import DependencyRow
-from scoreboard.fu_wr_pending import FU_RW_Pend
-from scoreboard.reg_select import Reg_Rsv
-from scoreboard.global_pending import GlobalPending
-
-"""
-
- 6600 Dependency Table Matrix inputs / outputs
- ---------------------------------------------
-
-                d s1 s2 i  d s1 s2 i  d s1 s2 i  d s1 s2 i
-                | |   | |  | |   | |  | |   | |  | |   | |
-                v v   v v  v v   v v  v v   v v  v v   v v
- go_rd/go_wr -> dm-r0-fu0  dm-r1-fu0  dm-r2-fu0  dm-r3-fu0 -> wr/rd-pend
- go_rd/go_wr -> dm-r0-fu1  dm-r1-fu1  dm-r2-fu1  dm-r3-fu1 -> wr/rd-pend
- go_rd/go_wr -> dm-r0-fu2  dm-r1-fu2  dm-r2-fu2  dm-r3-fu2 -> wr/rd-pend
-                 |  |  |    |  |  |    |  |  |    |  |  |
-                 v  v  v    v  v  v    v  v  v    v  v  v
-                 d  s1 s2   d  s1 s2   d  s1 s2   d  s1 s2
-                 reg sel    reg sel    reg sel    reg sel
-
-"""
-
-class FURegDepMatrix(Elaboratable):
-    """ implements 11.4.7 mitch alsup FU-to-Reg Dependency Matrix, p26
-    """
-    def __init__(self, n_fu_row, n_reg_col, n_src, cancel=None):
-        self.n_src = n_src
-        self.n_fu_row = nf = n_fu_row      # Y (FUs)   ^v
-        self.n_reg_col = n_reg = n_reg_col   # X (Regs)  <>
-
-        # arrays
-        src = []
-        rsel = []
-        for i in range(n_src):
-            j = i + 1 # name numbering to match src1/src2
-            src.append(Signal(n_reg, name="src%d" % j, reset_less=True))
-            rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True))
-        pend = []
-        for i in range(nf):
-            j = i + 1 # name numbering to match src1/src2
-            pend.append(Signal(nf, name="rd_src%d_pend_o" % j, reset_less=True))
-
-        self.dest_i = Signal(n_reg_col, reset_less=True)     # Dest in (top)
-        self.src_i = Array(src)                              # oper in (top)
-
-        # cancellation array (from Address Matching), ties in with go_die_i
-        self.cancel = cancel
-
-        # Register "Global" vectors for determining RaW and WaR hazards
-        self.wr_pend_i = Signal(n_reg_col, reset_less=True) # wr pending (top)
-        self.rd_pend_i = Signal(n_reg_col, reset_less=True) # rd pending (top)
-        self.v_wr_rsel_o = Signal(n_reg_col, reset_less=True) # wr pending (bot)
-        self.v_rd_rsel_o = Signal(n_reg_col, reset_less=True) # rd pending (bot)
-
-        self.issue_i = Signal(n_fu_row, reset_less=True)  # Issue in (top)
-        self.go_wr_i = Signal(n_fu_row, reset_less=True)  # Go Write in (left)
-        self.go_rd_i = Signal(n_fu_row, reset_less=True)  # Go Read in (left)
-        self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left)
-
-        # for Register File Select Lines (horizontal), per-reg
-        self.dest_rsel_o = Signal(n_reg_col, reset_less=True) # dest reg (bot)
-        self.src_rsel_o = Array(rsel)                         # src reg (bot)
-
-        # for Function Unit "forward progress" (vertical), per-FU
-        self.wr_pend_o = Signal(n_fu_row, reset_less=True) # wr pending (right)
-        self.rd_pend_o = Signal(n_fu_row, reset_less=True) # rd pending (right)
-        self.rd_src_pend_o = Array(pend) # src1 pending
-
-    def elaborate(self, platform):
-        m = Module()
-        return self._elaborate(m, platform)
-
-    def _elaborate(self, m, platform):
-
-        # ---
-        # matrix of dependency cells
-        # ---
-        cancel_mode = self.cancel is not None
-        dm = Array(DependencyRow(self.n_reg_col, self.n_src, cancel_mode) \
-                    for r in range(self.n_fu_row))
-        for fu in range(self.n_fu_row):
-            setattr(m.submodules, "dr_fu%d" % fu, dm[fu])
-
-        # ---
-        # array of Function Unit Pending vectors
-        # ---
-        fupend = Array(FU_RW_Pend(self.n_reg_col, self.n_src) \
-                        for f in range(self.n_fu_row))
-        for fu in range(self.n_fu_row):
-            setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu])
-
-        # ---
-        # array of Register Reservation vectors
-        # ---
-        regrsv = Array(Reg_Rsv(self.n_fu_row, self.n_src) \
-                        for r in range(self.n_reg_col))
-        for rn in range(self.n_reg_col):
-            setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn])
-
-        # ---
-        # connect Function Unit vector
-        # ---
-        wr_pend = []
-        rd_pend = []
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
-            fup = fupend[fu]
-            dest_fwd_o = []
-            for rn in range(self.n_reg_col):
-                # accumulate cell fwd outputs for dest/src1/src2
-                dest_fwd_o.append(dc.dest_fwd_o[rn])
-            # connect cell fwd outputs to FU Vector in [Cat is gooood]
-            m.d.comb += [fup.dest_fwd_i.eq(Cat(*dest_fwd_o)),
-                        ]
-            # accumulate FU Vector outputs
-            wr_pend.append(fup.reg_wr_pend_o)
-            rd_pend.append(fup.reg_rd_pend_o)
-
-        # ... and output them from this module (vertical, width=FUs)
-        m.d.comb += self.wr_pend_o.eq(Cat(*wr_pend))
-        m.d.comb += self.rd_pend_o.eq(Cat(*rd_pend))
-
-        # same for src
-        for i in range(self.n_src):
-            rd_src_pend = []
-            for fu in range(self.n_fu_row):
-                dc = dm[fu]
-                fup = fupend[fu]
-                src_fwd_o = []
-                for rn in range(self.n_reg_col):
-                    # accumulate cell fwd outputs for dest/src1/src2
-                    src_fwd_o.append(dc.src_fwd_o[i][rn])
-                # connect cell fwd outputs to FU Vector in [Cat is gooood]
-                m.d.comb += [fup.src_fwd_i[i].eq(Cat(*src_fwd_o)),
-                            ]
-                # accumulate FU Vector outputs
-                rd_src_pend.append(fup.reg_rd_src_pend_o[i])
-            # ... and output them from this module (vertical, width=FUs)
-            m.d.comb += self.rd_src_pend_o[i].eq(Cat(*rd_src_pend))
-
-        # ---
-        # connect Reg Selection vector
-        # ---
-        dest_rsel = []
-        for rn in range(self.n_reg_col):
-            rsv = regrsv[rn]
-            dest_rsel_o = []
-            for fu in range(self.n_fu_row):
-                dc = dm[fu]
-                # accumulate cell reg-select outputs dest/src1/src2
-                dest_rsel_o.append(dc.dest_rsel_o[rn])
-            # connect cell reg-select outputs to Reg Vector In
-            m.d.comb += rsv.dest_rsel_i.eq(Cat(*dest_rsel_o)),
-
-            # accumulate Reg-Sel Vector outputs
-            dest_rsel.append(rsv.dest_rsel_o)
-
-        # ... and output them from this module (horizontal, width=REGs)
-        m.d.comb += self.dest_rsel_o.eq(Cat(*dest_rsel))
-
-        # same for src
-        for i in range(self.n_src):
-            src_rsel = []
-            for rn in range(self.n_reg_col):
-                rsv = regrsv[rn]
-                src_rsel_o = []
-                for fu in range(self.n_fu_row):
-                    dc = dm[fu]
-                    # accumulate cell reg-select outputs dest/src1/src2
-                    src_rsel_o.append(dc.src_rsel_o[i][rn])
-                # connect cell reg-select outputs to Reg Vector In
-                m.d.comb += rsv.src_rsel_i[i].eq(Cat(*src_rsel_o)),
-                # accumulate Reg-Sel Vector outputs
-                src_rsel.append(rsv.src_rsel_o[i])
-
-            # ... and output them from this module (horizontal, width=REGs)
-            m.d.comb += self.src_rsel_o[i].eq(Cat(*src_rsel))
-
-        # ---
-        # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i
-        # ---
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
-            # wire up inputs from module to row cell inputs (Cat is gooood)
-            m.d.comb += [dc.dest_i.eq(self.dest_i),
-                         dc.rd_pend_i.eq(self.rd_pend_i),
-                         dc.wr_pend_i.eq(self.wr_pend_i),
-                        ]
-        # same for src
-        for i in range(self.n_src):
-            for fu in range(self.n_fu_row):
-                dc = dm[fu]
-                # wire up inputs from module to row cell inputs (Cat is gooood)
-                m.d.comb += dc.src_i[i].eq(self.src_i[i])
-
-        # accumulate rsel bits into read/write pending vectors.
-        rd_pend_v = []
-        wr_pend_v = []
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
-            rd_pend_v.append(dc.v_rd_rsel_o)
-            wr_pend_v.append(dc.v_wr_rsel_o)
-        rd_v = GlobalPending(self.n_reg_col, rd_pend_v)
-        wr_v = GlobalPending(self.n_reg_col, wr_pend_v)
-        m.submodules.rd_v = rd_v
-        m.submodules.wr_v = wr_v
-
-        m.d.comb += self.v_rd_rsel_o.eq(rd_v.g_pend_o)
-        m.d.comb += self.v_wr_rsel_o.eq(wr_v.g_pend_o)
-
-        # ---
-        # connect Dep issue_i/go_rd_i/go_wr_i to module issue_i/go_rd/go_wr
-        # ---
-        go_rd_i = []
-        go_wr_i = []
-        issue_i = []
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
-            # accumulate cell fwd outputs for dest/src1/src2
-            go_rd_i.append(dc.go_rd_i)
-            go_wr_i.append(dc.go_wr_i)
-            issue_i.append(dc.issue_i)
-        # wire up inputs from module to row cell inputs (Cat is gooood)
-        m.d.comb += [Cat(*go_rd_i).eq(self.go_rd_i),
-                     Cat(*go_wr_i).eq(self.go_wr_i),
-                     Cat(*issue_i).eq(self.issue_i),
-                    ]
-
-        # ---
-        # connect Dep go_die_i
-        # ---
-        if cancel_mode:
-            for fu in range(self.n_fu_row):
-                dc = dm[fu]
-                go_die = Repl(self.go_die_i[fu], self.n_fu_row)
-                go_die = go_die | self.cancel[fu]
-                m.d.comb += dc.go_die_i.eq(go_die)
-        else:
-            go_die_i = []
-            for fu in range(self.n_fu_row):
-                dc = dm[fu]
-                # accumulate cell fwd outputs for dest/src1/src2
-                go_die_i.append(dc.go_die_i)
-            # wire up inputs from module to row cell inputs (Cat is gooood)
-            m.d.comb += Cat(*go_die_i).eq(self.go_die_i)
-        return m
-
-    def __iter__(self):
-        yield self.dest_i
-        yield from self.src_i
-        yield self.issue_i
-        yield self.go_wr_i
-        yield self.go_rd_i
-        yield self.go_die_i
-        yield self.dest_rsel_o
-        yield from self.src_rsel_o
-        yield self.wr_pend_o
-        yield self.rd_pend_o
-        yield self.wr_pend_i
-        yield self.rd_pend_i
-        yield self.v_wr_rsel_o
-        yield self.v_rd_rsel_o
-        yield from self.rd_src_pend_o
-
-    def ports(self):
-        return list(self)
-
-def d_matrix_sim(dut):
-    """ XXX TODO
-    """
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_d_matrix():
-    dut = FURegDepMatrix(n_fu_row=3, n_reg_col=4, n_src=2)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_fu_reg_matrix.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_reg_matrix.vcd')
-
-if __name__ == '__main__':
-    test_d_matrix()
diff --git a/src/scoreboard/fu_wr_pending.py b/src/scoreboard/fu_wr_pending.py
deleted file mode 100644
index d0bcb954..00000000
--- a/src/scoreboard/fu_wr_pending.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from nmigen import Elaboratable, Module, Signal, Array
-
-
-class FU_RW_Pend(Elaboratable):
-    """ these are allocated per-FU (horizontally),
-        and are of length reg_count
-    """
-    def __init__(self, reg_count, n_src):
-        self.n_src = n_src
-        self.reg_count = reg_count
-        self.dest_fwd_i = Signal(reg_count, reset_less=True)
-        src = []
-        for i in range(n_src):
-            j = i + 1 # name numbering to match src1/src2
-            src.append(Signal(reg_count, name="src%d" % j, reset_less=True))
-        self.src_fwd_i = Array(src)
-
-        self.reg_wr_pend_o = Signal(reset_less=True)
-        self.reg_rd_pend_o = Signal(reset_less=True)
-        self.reg_rd_src_pend_o = Signal(n_src, reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.d.comb += self.reg_wr_pend_o.eq(self.dest_fwd_i.bool())
-        for i in range(self.n_src):
-            m.d.comb += self.reg_rd_src_pend_o[i].eq(self.src_fwd_i[i].bool())
-        m.d.comb += self.reg_rd_pend_o.eq(self.reg_rd_src_pend_o.bool())
-        return m
-
diff --git a/src/scoreboard/fumem_dep_cell.py b/src/scoreboard/fumem_dep_cell.py
deleted file mode 100644
index 982b55a3..00000000
--- a/src/scoreboard/fumem_dep_cell.py
+++ /dev/null
@@ -1,92 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Const, Elaboratable
-from nmutil.latch import SRLatch
-
-
-class FUMemDependenceCell(Elaboratable):
-    """ implements 11.4.7 mitch alsup dependence cell, p27
-    """
-    def __init__(self, dummy, n_fu=1):
-        self.n_fu = n_fu
-        self.dummy = Const(~(1<<dummy), n_fu)
-        # inputs
-        self.st_pend_i = Signal(n_fu, reset_less=True) # read pend in (left)
-        self.ld_pend_i = Signal(n_fu, reset_less=True) # write pend in (left)
-        self.issue_i = Signal(n_fu, reset_less=True)    # Issue in (top)
-
-        self.go_ld_i = Signal(n_fu, reset_less=True) # Go Write in (left)
-        self.go_st_i = Signal(n_fu, reset_less=True)  # Go Read in (left)
-        self.go_die_i = Signal(n_fu, reset_less=True) # Go Die in (left)
-
-        # outputs (latched rd/wr wait)
-        self.st_wait_o = Signal(n_fu, reset_less=True) # read wait out (right)
-        self.ld_wait_o = Signal(n_fu, reset_less=True) # write wait out (right)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.st_c = st_c = SRLatch(sync=False, llen=self.n_fu)
-        m.submodules.ld_c = ld_c = SRLatch(sync=False, llen=self.n_fu)
-
-        # reset on go HI, set on dest and issue
-        m.d.comb += st_c.s.eq(self.issue_i & self.st_pend_i)
-        m.d.comb += ld_c.s.eq(self.issue_i & self.ld_pend_i)
-
-        # connect go_rd / go_wr 
-        m.d.comb += ld_c.r.eq(self.go_ld_i | self.go_die_i)
-        m.d.comb += st_c.r.eq(self.go_st_i | self.go_die_i)
-
-        # connect pend_i
-        m.d.comb += st_c.s.eq(self.issue_i & self.st_pend_i & self.dummy)
-        m.d.comb += ld_c.s.eq(self.issue_i & self.ld_pend_i & self.dummy)
-
-        # connect output
-        m.d.comb += self.st_wait_o.eq(st_c.qlq & ~self.issue_i)
-        m.d.comb += self.ld_wait_o.eq(ld_c.qlq & ~self.issue_i)
-
-        return m
-
-    def __iter__(self):
-        yield self.st_pend_i
-        yield self.ld_pend_i
-        yield self.issue_i
-        yield self.go_ld_i
-        yield self.go_st_i
-        yield self.go_die_i
-        yield self.st_wait_o
-        yield self.ld_wait_o
-                
-    def ports(self):
-        return list(self)
-
-
-def dcell_sim(dut):
-    yield dut.ld_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.st_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_st_i.eq(1)
-    yield
-    yield dut.go_st_i.eq(0)
-    yield
-    yield dut.go_ld_i.eq(1)
-    yield
-    yield dut.go_ld_i.eq(0)
-    yield
-
-def test_dcell():
-    dut = FUMemDependenceCell(dummy=0, n_fu=4)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_fumem_dcell.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, dcell_sim(dut), vcd_name='test_fumem_dcell.vcd')
-
-if __name__ == '__main__':
-    test_dcell()
diff --git a/src/scoreboard/global_pending.py b/src/scoreboard/global_pending.py
deleted file mode 100644
index 540f4430..00000000
--- a/src/scoreboard/global_pending.py
+++ /dev/null
@@ -1,95 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Cat, Elaboratable
-
-
-class GlobalPending(Elaboratable):
-    """ implements Global Pending Vector, basically ORs all incoming Function
-        Unit vectors together.  Can be used for creating Read or Write Global
-        Pending.  Can be used for INT or FP Global Pending.
-
-        Inputs:
-        * :dep:       register file depth
-        * :fu_vecs:   a python list of function unit "pending" vectors, each
-                      vector being a Signal of width equal to the reg file.
-
-        Notes:
-
-        * the regfile may be Int or FP, this code doesn't care which.
-          obviously do not try to put in a mixture of regfiles into fu_vecs.
-        * this code also doesn't care if it's used for Read Pending or Write
-          pending, it can be used for both: again, obviously, do not try to
-          put in a mixture of read *and* write pending vectors in.
-        * if some Function Units happen not to be uniform (don't operate
-          on a particular register (extremely unusual), they must set a Const
-          zero bit in the vector.
-    """
-    def __init__(self, dep, fu_vecs, sync=False):
-        self.reg_dep = dep
-        # inputs
-        self.fu_vecs = fu_vecs
-        self.sync = sync
-        for v in fu_vecs:
-            assert len(v) == dep, "FU Vector must be same width as regfile"
-
-        self.g_pend_o = Signal(dep, reset_less=True)  # global pending vector
-
-    def elaborate(self, platform):
-        m = Module()
-
-        pend_l = []
-        for i in range(self.reg_dep): # per-register
-            vec_bit_l = []
-            for v in self.fu_vecs:
-                vec_bit_l.append(v[i])             # fu bit for same register
-            pend_l.append(Cat(*vec_bit_l).bool())  # OR all bits for same reg
-        if self.sync:
-            m.d.sync += self.g_pend_o.eq(Cat(*pend_l)) # merge all OR'd bits
-        else:
-            m.d.comb += self.g_pend_o.eq(Cat(*pend_l)) # merge all OR'd bits
-
-        return m
-
-    def __iter__(self):
-        yield from self.fu_vecs
-        yield self.g_pend_o
-
-    def ports(self):
-        return list(self)
-
-
-def g_vec_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_g_vec():
-    vecs = []
-    for i in range(3):
-        vecs.append(Signal(32, name="fu%d" % i))
-    dut = GlobalPending(32, vecs)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_global_pending.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, g_vec_sim(dut), vcd_name='test_global_pending.vcd')
-
-if __name__ == '__main__':
-    test_g_vec()
diff --git a/src/scoreboard/group_picker.py b/src/scoreboard/group_picker.py
deleted file mode 100644
index a59fdd28..00000000
--- a/src/scoreboard/group_picker.py
+++ /dev/null
@@ -1,124 +0,0 @@
-""" Group Picker: to select an instruction that is permitted to read (or write)
-    based on the Function Unit expressing a *desire* to read (or write).
-
-    The job of the Group Picker is extremely simple yet extremely important.
-    It sits in front of a register file port (read or write) and stops it from
-    being corrupted.  It's a "port contention selector", basically.
-
-    The way it works is:
-
-    * Function Units need to read from (or write to) the register file,
-      in order to get (or store) their operands, so they each have a signal,
-      readable (or writable), which "expresses" this need.  This is an
-      *unary* encoding.
-
-    * The Function Units also have a signal which indicates that they
-      are requesting "release" of the register file port (this because
-      in the scoreboard, readable/writable can be permanently HI even
-      if the FU is idle, whereas the "release" signal is very specifically
-      only HI if the read (or write) latch is still active)
-
-    * The Group Picker takes this unary encoding of the desire to read
-      (or write) and, on a priority basis, activates one *and only* one
-      of those signals, again as an unary output.
-
-    * Due to the way that the Computation Unit works, that signal (Go_Read
-      or Go_Write) will fire for one (and only one) cycle, and can be used
-      to enable the register file port read (or write) lines.  The Go_Read/Wr
-      signal basically loops back to the Computation Unit and resets the
-      "desire-to-read/write-expressing" latch.
-
-    In theory (and in practice!) the following is possible:
-
-    * Separate src1 and src2 Group Pickers.  This would allow instructions
-      with only one operand to read to not block up other instructions,
-      and it would also allow 3-operand instructions to be interleaved
-      with 1 and 2 operand instructions.
-
-    * *Multiple* Group Pickers (multi-issue).  This would require
-      a corresponding increase in the number of register file ports,
-      either 4R2W (or more) or by "striping" the register file into
-      split banks (a strategy best deployed on Vector Processors)
-
-"""
-
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable
-
-from nmutil.picker import PriorityPicker
-
-
-class GroupPicker(Elaboratable):
-    """ implements 10.5 mitch alsup group picker, p27
-    """
-    def __init__(self, wid):
-        self.gp_wid = wid
-        # inputs
-        self.readable_i = Signal(wid, reset_less=True) # readable in (top)
-        self.writable_i = Signal(wid, reset_less=True) # writable in (top)
-        self.rd_rel_i = Signal(wid, reset_less=True)   # go read in (top)
-        self.req_rel_i = Signal(wid, reset_less=True) # release request in (top)
-
-        # outputs
-        self.go_rd_o = Signal(wid, reset_less=True)  # go read (bottom)
-        self.go_wr_o = Signal(wid, reset_less=True)  # go write (bottom)
-
-    def elaborate(self, platform):
-        m = Module()
-
-        m.submodules.rpick = rpick = PriorityPicker(self.gp_wid)
-        m.submodules.wpick = wpick = PriorityPicker(self.gp_wid)
-
-        # combine release (output ready signal) with writeable
-        m.d.comb += wpick.i.eq(self.writable_i & self.req_rel_i)
-        m.d.comb += self.go_wr_o.eq(wpick.o)
-
-        m.d.comb += rpick.i.eq(self.readable_i & self.rd_rel_i)
-        m.d.comb += self.go_rd_o.eq(rpick.o)
-
-        return m
-
-    def __iter__(self):
-        yield self.readable_i
-        yield self.writable_i
-        yield self.req_rel_i
-        yield self.go_rd_o
-        yield self.go_wr_o
-
-    def ports(self):
-        return list(self)
-
-
-def grp_pick_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.rd_rel_i.eq(1)
-    yield
-    yield dut.rd_rel_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_grp_pick():
-    dut = GroupPicker(4)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_grp_pick.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, grp_pick_sim(dut), vcd_name='test_grp_pick.vcd')
-
-if __name__ == '__main__':
-    test_grp_pick()
diff --git a/src/scoreboard/instruction_q.py b/src/scoreboard/instruction_q.py
deleted file mode 100644
index 65496a6a..00000000
--- a/src/scoreboard/instruction_q.py
+++ /dev/null
@@ -1,179 +0,0 @@
-from math import log
-
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Cat, Array, Const, Repl, Elaboratable
-from nmutil.iocontrol import RecordObject
-from nmutil.nmoperator import eq, shape, cat
-
-
-class Instruction(RecordObject):
-    def __init__(self, name, wid, opwid):
-        RecordObject.__init__(self, name=name)
-        self.oper_i = Signal(opwid, reset_less=True)
-        self.opim_i = Signal(1, reset_less=True) # src2 is an immediate
-        self.imm_i = Signal(wid, reset_less=True)
-        self.dest_i = Signal(wid, reset_less=True)
-        self.src1_i = Signal(wid, reset_less=True)
-        self.src2_i = Signal(wid, reset_less=True)
-
-    @staticmethod
-    def nq(n_insns, name, wid, opwid):
-        q = []
-        for i in range(n_insns):
-            q.append(Instruction("%s%d" % (name, i), wid, opwid))
-        return Array(q)
-
-
-class InstructionQ(Elaboratable):
-    """ contains a queue of (part-decoded) instructions.
-
-        output is copied combinatorially from the front of the queue,
-        for easy access on the clock cycle.  only "n_in" instructions
-        are made available this way
-
-        input and shifting occurs on sync.
-    """
-    def __init__(self, wid, opwid, iqlen, n_in, n_out):
-        """ constructor
-
-            Inputs
-
-            * :wid:         register file width
-            * :opwid:       operand width
-            * :iqlen:       instruction queue length
-            * :n_in:        max number of instructions allowed "in"
-        """
-        self.iqlen = iqlen
-        self.reg_width = wid
-        self.opwid = opwid
-        self.n_in = n_in
-        self.n_out = n_out
-        mqbits = (int(log(iqlen) / log(2))+2, False)
-
-        self.p_add_i = Signal(mqbits) # instructions to add (from data_i)
-        self.p_ready_o = Signal() # instructions were added
-        self.data_i = Instruction.nq(n_in, "data_i", wid, opwid)
-        
-        self.data_o = Instruction.nq(n_out, "data_o", wid, opwid)
-        self.n_sub_i = Signal(mqbits) # number of instructions to remove
-        self.n_sub_o = Signal(mqbits) # number of instructions removed
-
-        self.qsz = shape(self.data_o[0])[0]
-        q = []
-        for i in range(iqlen):
-            q.append(Signal(self.qsz, name="q%d" % i))
-        self.q = Array(q)
-        self.qlen_o = Signal(mqbits)
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        sync = m.d.sync
-
-        iqlen = self.iqlen
-        mqbits = int(log(iqlen) / log(2))
-
-        left = Signal((mqbits+2, False))
-        spare = Signal((mqbits+2, False))
-        qmaxed = Signal()
-
-        start_q = Signal(mqbits)
-        end_q = Signal(mqbits)
-        mqlen = Const(iqlen, (len(left), False))
-        print ("mqlen", mqlen)
-
-        # work out how many can be subtracted from the queue
-        with m.If(self.n_sub_i):
-            qinmax = Signal()
-            comb += qinmax.eq(self.n_sub_i > self.qlen_o)
-            with m.If(qinmax):
-                comb += self.n_sub_o.eq(self.qlen_o)
-            with m.Else():
-                comb += self.n_sub_o.eq(self.n_sub_i)
-
-        # work out how many new items are going to be in the queue
-        comb += left.eq(self.qlen_o )#- self.n_sub_o)
-        comb += spare.eq(mqlen - self.p_add_i)
-        comb += qmaxed.eq(left <= spare)
-        comb += self.p_ready_o.eq(qmaxed & (self.p_add_i != 0))
-
-        # put q (flattened) into output
-        for i in range(self.n_out):
-            opos = Signal(mqbits)
-            comb += opos.eq(end_q + i)
-            comb += cat(self.data_o[i]).eq(self.q[opos])
-
-        with m.If(self.n_sub_o):
-            # ok now the end's moved
-            sync += end_q.eq(end_q + self.n_sub_o)
-
-        with m.If(self.p_ready_o):
-            # copy in the input... insanely gate-costly... *sigh*...
-            for i in range(self.n_in):
-                with m.If(self.p_add_i > Const(i, len(self.p_add_i))):
-                    ipos = Signal(mqbits)
-                    comb += ipos.eq(start_q + i) # should roll round
-                    sync += self.q[ipos].eq(cat(self.data_i[i]))
-            sync += start_q.eq(start_q + self.p_add_i)
-
-        with m.If(self.p_ready_o):
-            # update the queue length
-            add2 = Signal(mqbits+1)
-            comb += add2.eq(self.qlen_o + self.p_add_i)
-            sync += self.qlen_o.eq(add2 - self.n_sub_o)
-        with m.Else():
-            sync += self.qlen_o.eq(self.qlen_o - self.n_sub_o)
-
-        return m
-
-    def __iter__(self):
-        yield from self.q
-
-        yield self.p_ready_o
-        for o in self.data_i:
-            yield from list(o)
-        yield self.p_add_i
-        
-        for o in self.data_o:
-            yield from list(o)
-        yield self.n_sub_i
-        yield self.n_sub_o
-
-    def ports(self):
-        return list(self)
-
-
-def instruction_q_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_instruction_q():
-    dut = InstructionQ(16, 4, 4, n_in=2, n_out=2)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_instruction_q.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, instruction_q_sim(dut),
-                   vcd_name='test_instruction_q.vcd')
-
-if __name__ == '__main__':
-    test_instruction_q()
diff --git a/src/scoreboard/issue_unit.py b/src/scoreboard/issue_unit.py
deleted file mode 100644
index 3ec2a31c..00000000
--- a/src/scoreboard/issue_unit.py
+++ /dev/null
@@ -1,278 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Cat, Array, Const, Repl, Elaboratable
-from nmigen.lib.coding import Decoder
-
-from scoreboard.group_picker import PriorityPicker
-
-
-class RegDecode(Elaboratable):
-    """ decodes registers into unary
-
-        Inputs
-
-        * :wid:         register file width
-    """
-    def __init__(self, wid):
-        self.reg_width = wid
-
-        # inputs
-        self.enable_i = Signal(reset_less=True) # enable decoders
-        self.dest_i = Signal(range(wid), reset_less=True) # Dest R# in
-        self.src1_i = Signal(range(wid), reset_less=True) # oper1 R# in
-        self.src2_i = Signal(range(wid), reset_less=True) # oper2 R# in
-
-        # outputs
-        self.dest_o = Signal(wid, reset_less=True) # Dest unary out
-        self.src1_o = Signal(wid, reset_less=True) # oper1 unary out
-        self.src2_o = Signal(wid, reset_less=True) # oper2 unary out
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.dest_d = dest_d = Decoder(self.reg_width)
-        m.submodules.src1_d = src1_d = Decoder(self.reg_width)
-        m.submodules.src2_d = src2_d = Decoder(self.reg_width)
-
-        # dest decoder: write-pending
-        for d, i, o in [(dest_d, self.dest_i, self.dest_o),
-                     (src1_d, self.src1_i, self.src1_o),
-                     (src2_d, self.src2_i, self.src2_o)]:
-            m.d.comb += d.i.eq(i)
-            m.d.comb += d.n.eq(~self.enable_i)
-            m.d.comb += o.eq(d.o)
-
-        return m
-
-    def __iter__(self):
-        yield self.enable_i
-        yield self.dest_i
-        yield self.src1_i
-        yield self.src2_i
-        yield self.dest_o
-        yield self.src1_o
-        yield self.src2_o
-
-    def ports(self):
-        return list(self)
-
-
-class IssueUnitGroup(Elaboratable):
-    """ Manages a batch of Computation Units all of which can do the same task
-
-        A priority picker will allocate one instruction in this cycle based
-        on whether the others are busy.
-
-        insn_i indicates to this module that there is an instruction to be
-        issued which this group can handle
-
-        busy_i is a vector of signals that indicate, in this cycle, which
-        of the units are currently busy.
-
-        busy_o indicates whether it is "safe to proceed" i.e. whether
-        there is a unit here that can *be* issued an instruction
-
-        fn_issue_o indicates, out of the available (non-busy) units,
-        which one may be selected
-    """
-    def __init__(self, n_insns):
-        """ Set up inputs and outputs for the Group
-
-            Input Parameters
-
-            * :n_insns:     number of instructions in this issue unit.
-        """
-        self.n_insns = n_insns
-
-        # inputs
-        self.insn_i = Signal(reset_less=True, name="insn_i")
-        self.busy_i = Signal(n_insns, reset_less=True, name="busy_i")
-
-        # outputs
-        self.fn_issue_o = Signal(n_insns, reset_less=True, name="fn_issue_o")
-        self.busy_o = Signal(reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-
-        if self.n_insns == 0:
-            return m
-
-        m.submodules.pick = pick = PriorityPicker(self.n_insns)
-
-        # temporaries
-        allissue = Signal(self.n_insns, reset_less=True)
-
-        m.d.comb += allissue.eq(Repl(self.insn_i, self.n_insns))
-        # Pick one (and only one) of the units to proceed in this cycle
-        m.d.comb += pick.i.eq(~self.busy_i & allissue)
-
-        # "Safe to issue" condition is basically when all units are not busy
-        m.d.comb += self.busy_o.eq(~((~self.busy_i).bool()))
-
-        # Picker only raises one signal, therefore it's also the fn_issue
-        m.d.comb += self.fn_issue_o.eq(pick.o & Repl(~self.busy_o, self.n_insns))
-
-        return m
-
-    def __iter__(self):
-        yield self.insn_i
-        yield self.busy_i
-        yield self.fn_issue_o
-        yield self.g_issue_o
-
-    def ports(self):
-        return list(self)
-
-
-class IssueUnitArray(Elaboratable):
-    """ Convenience module that amalgamates the issue and busy signals
-
-        unit issue_i is to be set externally, at the same time as the
-        ALU group oper_i
-    """
-    def __init__(self, units):
-        self.units = units
-        self.issue_o = Signal(reset_less=True)
-        n_insns = 0
-        for u in self.units:
-            n_insns += len(u.fn_issue_o)
-        self.busy_i = Signal(n_insns, reset_less=True)
-        self.fn_issue_o = Signal(n_insns, reset_less=True)
-        self.n_insns = n_insns
-
-    def elaborate(self, platform):
-        m = Module()
-        for i, u in enumerate(self.units):
-            setattr(m.submodules, "issue%d" % i, u)
-
-        g_issue_o = []
-        busy_i = []
-        fn_issue_o = []
-        for u in self.units:
-            busy_i.append(u.busy_i)
-            g_issue_o.append(u.busy_o)
-            fn_issue_o.append(u.fn_issue_o)
-        m.d.comb += self.issue_o.eq(~(Cat(*g_issue_o).bool()))
-        m.d.comb += self.fn_issue_o.eq(Cat(*fn_issue_o))
-        m.d.comb += Cat(*busy_i).eq(self.busy_i)
-
-        return m
-
-    def ports(self):
-        yield self.busy_i
-        yield self.issue_o
-        yield self.fn_issue_o
-        yield from self.units
-
-
-
-class IssueUnit(Elaboratable):
-    """ implements 11.4.14 issue unit, p50
-
-        Inputs
-
-        * :n_insns:     number of instructions in this issue unit.
-    """
-    def __init__(self, n_insns):
-        self.n_insns = n_insns
-
-        # inputs
-        self.insn_i = Signal(n_insns, reset_less=True, name="insn_i")
-        self.busy_i = Signal(n_insns, reset_less=True, name="busy_i")
-
-        # outputs
-        self.fn_issue_o = Signal(n_insns, reset_less=True, name="fn_issue_o")
-        self.g_issue_o = Signal(reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-
-        if self.n_insns == 0:
-            return m
-
-        # temporaries
-        fu_stall = Signal(reset_less=True)
-
-        ib_l = []
-        for i in range(self.n_insns):
-            ib_l.append(self.insn_i[i] & self.busy_i[i])
-        m.d.comb += fu_stall.eq(Cat(*ib_l).bool())
-        m.d.comb += self.g_issue_o.eq(~(fu_stall))
-        for i in range(self.n_insns):
-            m.d.comb += self.fn_issue_o[i].eq(self.g_issue_o & self.insn_i[i])
-
-        return m
-
-    def __iter__(self):
-        yield self.insn_i
-        yield self.busy_i
-        yield self.fn_issue_o
-        yield self.g_issue_o
-
-    def ports(self):
-        return list(self)
-
-
-class IntFPIssueUnit(Elaboratable):
-    def __init__(self, n_int_insns, n_fp_insns):
-        self.i = IssueUnit(n_int_insns)
-        self.f = IssueUnit(n_fp_insns)
-        self.issue_o = Signal(reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.intissue = self.i
-        m.submodules.fpissue = self.f
-
-        m.d.comb += self.issue_o.eq(self.i.g_issue_o | self.f.g_issue_o)
-
-        return m
-
-    def ports(self):
-        yield self.issue_o
-        yield from self.i
-        yield from self.f
-
-
-def issue_unit_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_issue_unit():
-    dut = IssueUnitGroup(3)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_issue_unit_group.il", "w") as f:
-        f.write(vl)
-
-    dut = IssueUnit(32, 3)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_issue_unit.il", "w") as f:
-        f.write(vl)
-
-    dut = IntFPIssueUnit(32, 3, 3)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_intfp_issue_unit.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, issue_unit_sim(dut), vcd_name='test_issue_unit.vcd')
-
-if __name__ == '__main__':
-    test_issue_unit()
diff --git a/src/scoreboard/ldst_dep_cell.py b/src/scoreboard/ldst_dep_cell.py
deleted file mode 100644
index 70f4b9ba..00000000
--- a/src/scoreboard/ldst_dep_cell.py
+++ /dev/null
@@ -1,116 +0,0 @@
-""" Mitch Alsup 6600-style LD/ST scoreboard Dependency Cell
-
-Relevant bugreports:
-
-* http://bugs.libre-riscv.org/show_bug.cgi?id=81
-
-"""
-
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Repl, Elaboratable
-from nmutil.latch import SRLatch
-
-
-class LDSTDepCell(Elaboratable):
-    """ implements 11.4.12 mitch alsup load/store dependence cell, p45
-    """
-    def __init__(self, n_ls=1):
-        self.n_ls = n_ls
-        # inputs
-        self.load_h_i = Signal(reset_less=True)     # load in (left)
-        self.stor_h_i = Signal(reset_less=True)     # store in (left)
-        self.load_v_i = Signal(n_ls, reset_less=True)     # load in (top)
-        self.stor_v_i = Signal(n_ls, reset_less=True)     # store in (top)
-        self.issue_i = Signal(reset_less=True)    # Issue in (left)
-        self.go_die_i = Signal(reset_less=True)    # Issue in (left)
-
-        # load / store hit - basically connect these to go_wr from LD/STCompUnit
-        # LD.go_wr -> load_hit_i, ST.go_wr -> stwd_hit_i.
-        self.load_hit_i = Signal(n_ls, reset_less=True) # ld hit in (right)
-        self.stwd_hit_i = Signal(n_ls, reset_less=True) # st w/ hit in (right)
-
-        # outputs (latched rd/wr pend)
-        self.ld_hold_st_o = Signal(reset_less=True) # ld holds st out (l)
-        self.st_hold_ld_o = Signal(reset_less=True) # st holds ld out (l)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.war_l = war_l = SRLatch(sync=False, llen=self.n_ls) # WaR
-        m.submodules.raw_l = raw_l = SRLatch(sync=False, llen=self.n_ls) # RaW
-
-        # temporaries (repeat-extend)
-        issue = Repl(self.issue_i, self.n_ls)
-        die = Repl(self.go_die_i, self.n_ls)
-
-        # issue & store & load - used for WAR Setting.  LD is left, ST is top
-        i_s = Signal(reset_less=True)
-        i_s_l = Signal(self.n_ls, reset_less=True)
-        m.d.comb += i_s.eq(issue & self.stor_h_i) # horizontal single-signal
-        m.d.comb += i_s_l.eq(Repl(i_s, self.n_ls) & self.load_v_i) # multi, vert
-
-        # issue & load & store - used for RAW Setting.  ST is left, LD is top
-        i_l = Signal(reset_less=True)
-        i_l_s = Signal(self.n_ls, reset_less=True)
-        m.d.comb += i_l.eq(issue & self.load_h_i) # horizontal single-signal
-        m.d.comb += i_l_s.eq(Repl(i_l, self.n_ls) & self.stor_v_i) # multi, vert
-
-        # write after read latch: loads block stores
-        m.d.comb += war_l.s.eq(i_s_l)
-        m.d.comb += war_l.r.eq(die | ~self.load_v_i) # reset on LD
-
-        # read after write latch: stores block loads
-        m.d.comb += raw_l.s.eq(i_s_l)
-        m.d.comb += raw_l.r.eq(die | ~self.stor_v_i) # reset on ST
-
-        # Hold results (read out horizontally, accumulate in OR fashion)
-        m.d.comb += self.ld_hold_st_o.eq((war_l.qn & self.load_hit_i).bool())
-        m.d.comb += self.st_hold_ld_o.eq((raw_l.qn & self.stwd_hit_i).bool())
-
-        return m
-
-    def __iter__(self):
-        yield self.load_h_i
-        yield self.load_v_i
-        yield self.stor_h_i
-        yield self.stor_h_i
-        yield self.issue_i
-        yield self.load_hit_i
-        yield self.stwd_hit_i
-        yield self.ld_hold_st_o
-        yield self.st_hold_ld_o
-
-    def ports(self):
-        return list(self)
-
-
-def dcell_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_dcell():
-    dut = LDSTDepCell()
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_ldst_dcell.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, dcell_sim(dut), vcd_name='test_ldst_dcell.vcd')
-
-if __name__ == '__main__':
-    test_dcell()
diff --git a/src/scoreboard/ldst_matrix.py b/src/scoreboard/ldst_matrix.py
deleted file mode 100644
index 1bb75b03..00000000
--- a/src/scoreboard/ldst_matrix.py
+++ /dev/null
@@ -1,163 +0,0 @@
-""" Mitch Alsup 6600-style LD/ST Memory Scoreboard Matrix (sparse vector)
-
-6600 LD/ST Dependency Table Matrix inputs / outputs
----------------------------------------------------
-
-Relevant comments (p45-46):
-
-* If there are no WAR dependencies on a Load instruction with a computed
-  address it can assert Bank_Addressable and Translate_Addressable.
-
-* If there are no RAW dependencies on a Store instruction with both a
-  write permission and store data present it can assert Bank_Addressable
-
-Relevant bugreports:
-
-* http://bugs.libre-riscv.org/show_bug.cgi?id=81
-
-Notes:
-
-* Load Hit (or Store Hit with Data) are asserted by the LD/ST Computation
-  Unit when it has data and address ready
-
-* Asserting the ld_hit_i (or stwd_hit_i) *requires* that the output be
-  captured or at least taken into consideration for the next LD/STs
-  *right then*.  Failure to observe the xx_hold_xx_o *will* result in
-  data corruption, as they are *only* asserted if xx_hit_i is asserted
-
-* The hold signals still have to go through "maybe address clashes"
-  detection, they cannot just be used as-is to stop a LD/ST.
-
-"""
-
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
-
-from ldst_dep_cell import LDSTDepCell
-
-
-class LDSTDepMatrix(Elaboratable):
-    """ implements 11.4.12 mitch alsup LD/ST Dependency Matrix, p46
-        actually a sparse matrix along the diagonal.
-
-        load-hold-store and store-hold-load accumulate in a priority-picking
-        fashion, ORing together.  the OR gate from the dependency cell is
-        here.
-    """
-    def __init__(self, n_ldst):
-        self.n_ldst = n_ldst                  # X and Y (FUs)
-        self.ld_pend_i = Signal(n_ldst, reset_less=True)  # load pending in
-        self.st_pend_i = Signal(n_ldst, reset_less=True)  # store pending in
-        self.issue_i = Signal(n_ldst, reset_less=True) # Issue in
-        self.go_die_i = Signal(n_ldst, reset_less=True) # Die/Reset in
-
-        self.load_hit_i = Signal(n_ldst, reset_less=True) # load hit in
-        self.stwd_hit_i = Signal(n_ldst, reset_less=True) # store w/data hit in
-
-        # outputs
-        self.ld_hold_st_o = Signal(n_ldst, reset_less=True) # load holds st out
-        self.st_hold_ld_o = Signal(n_ldst, reset_less=True) # st holds load out
-
-    def elaborate(self, platform):
-        m = Module()
-
-        # ---
-        # matrix of dependency cells.  actually, LDSTDepCell is a row, now
-        # ---
-        dm = Array(LDSTDepCell(self.n_ldst) for f in range(self.n_ldst))
-        for fu in range(self.n_ldst):
-            setattr(m.submodules, "dm_fu%d" % (fu), dm[fu])
-
-        # ---
-        # connect Function Unit vector, all horizontal
-        # ---
-        lhs_l = []
-        shl_l = []
-        issue_l = []
-        go_die_l = []
-        lh_l = []
-        sh_l = []
-        for fu in range(self.n_ldst):
-            dc = dm[fu]
-            # accumulate load-hold-store / store-hold-load bits (horizontal)
-            lhs_l.append(dc.ld_hold_st_o)
-            shl_l.append(dc.st_hold_ld_o)
-            # accumulate inputs (for Cat'ing later) - TODO: must be a better way
-            issue_l.append(dc.issue_i)
-            go_die_l.append(dc.go_die_i)
-
-            # load-hit and store-with-data-hit go in vertically (top)
-            m.d.comb += [dc.load_hit_i.eq(self.load_hit_i),
-                         dc.stwd_hit_i.eq(self.stwd_hit_i),
-                         dc.load_v_i.eq(self.ld_pend_i),
-                         dc.stor_v_i.eq(self.st_pend_i),
-                        ]
-
-        # connect cell inputs using Cat(*list_of_stuff)
-        m.d.comb += [Cat(*issue_l).eq(self.issue_i),
-                     Cat(*go_die_l).eq(self.go_die_i),
-                    ]
-        # connect the load-hold-store / store-hold-load OR-accumulated outputs
-        m.d.comb += self.ld_hold_st_o.eq(Cat(*lhs_l))
-        m.d.comb += self.st_hold_ld_o.eq(Cat(*shl_l))
-
-        # the load/store input also needs to be connected to "top" (vertically)
-        for fu in range(self.n_ldst):
-            load_h_l = []
-            stor_h_l = []
-            for fux in range(self.n_ldst):
-                dc = dm[fux]
-                load_h_l.append(dc.load_h_i)
-                stor_h_l.append(dc.stor_h_i)
-            m.d.comb += [Cat(*load_h_l).eq(self.ld_pend_i),
-                         Cat(*stor_h_l).eq(self.st_pend_i),
-                        ]
-
-        return m
-
-    def __iter__(self):
-        yield self.ld_pend_i
-        yield self.st_pend_i
-        yield self.issue_i
-        yield self.go_die_i
-        yield self.load_hit_i
-        yield self.stwd_hit_i
-        yield self.ld_hold_st_o
-        yield self.st_hold_ld_o
-
-    def ports(self):
-        return list(self)
-
-def d_matrix_sim(dut):
-    """ XXX TODO
-    """
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_d_matrix():
-    dut = LDSTDepMatrix(n_ldst=4)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_ld_st_matrix.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, d_matrix_sim(dut), vcd_name='test_ld_st_matrix.vcd')
-
-if __name__ == '__main__':
-    test_d_matrix()
diff --git a/src/scoreboard/mdm.py b/src/scoreboard/mdm.py
deleted file mode 100644
index 184931ef..00000000
--- a/src/scoreboard/mdm.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module
-
-from scoreboard.fu_reg_matrix import FURegDepMatrix
-from scoreboard.addr_match import PartialAddrMatch
-
-class FUMemMatchMatrix(FURegDepMatrix, PartialAddrMatch):
-    """ implement a FU-Regs overload with memory-address matching
-    """
-    def __init__(self, n_fu, addrbitwid):
-        PartialAddrMatch.__init__(self, n_fu, addrbitwid)
-        FURegDepMatrix.__init__(self, n_fu, n_fu, 1, self.addr_nomatch_o)
-
-    def elaborate(self, platform):
-        m = Module()
-        PartialAddrMatch._elaborate(self, m, platform)
-        FURegDepMatrix._elaborate(self, m, platform)
-
-        return m
-
-
diff --git a/src/scoreboard/mem_dependence_cell.py b/src/scoreboard/mem_dependence_cell.py
deleted file mode 100644
index 2958d864..00000000
--- a/src/scoreboard/mem_dependence_cell.py
+++ /dev/null
@@ -1,120 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
-from nmutil.latch import SRLatch
-
-
-class MemDepRow(Elaboratable):
-    """ implements 1st phase Memory Depencency cell
-    """
-    def __init__(self, n_reg):
-        self.n_reg = n_reg
-        # inputs
-        self.ld_i = Signal(n_reg, reset_less=True)     # Dest in (top)
-        self.st_i = Signal(n_reg, reset_less=True)     # oper1 in (top)
-        self.issue_i = Signal(reset_less=True)    # Issue in (top)
-
-        self.st_pend_i = Signal(n_reg, reset_less=True) # Read pend in (top)
-        self.ld_pend_i = Signal(n_reg, reset_less=True) # Write pend in (top)
-        self.v_st_rsel_o = Signal(n_reg, reset_less=True) # Read pend out (bot)
-        self.v_ld_rsel_o = Signal(n_reg, reset_less=True) # Write pend out (bot)
-
-        self.go_ld_i = Signal(reset_less=True) # Go Write in (left)
-        self.go_st_i = Signal(reset_less=True)  # Go Read in (left)
-        self.go_die_i = Signal(reset_less=True) # Go Die in (left)
-
-        # for Register File Select Lines (vertical)
-        self.ld_rsel_o = Signal(n_reg, reset_less=True)  # dest reg sel (bot)
-        self.st_rsel_o = Signal(n_reg, reset_less=True)  # src1 reg sel (bot)
-
-        # for Function Unit "forward progress" (horizontal)
-        self.ld_fwd_o = Signal(n_reg, reset_less=True)   # dest FU fw (right)
-        self.st_fwd_o = Signal(n_reg, reset_less=True)   # src1 FU fw (right)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.ld_c = ld_c = SRLatch(sync=False, llen=self.n_reg)
-        m.submodules.st_c = st_c = SRLatch(sync=False, llen=self.n_reg)
-
-        # connect go_rd / go_wr (dest->wr, src->rd)
-        ld_die = Signal(reset_less=True)
-        st_die = Signal(reset_less=True)
-        m.d.comb += ld_die.eq(self.go_ld_i | self.go_die_i)
-        m.d.comb += st_die.eq(self.go_st_i | self.go_die_i)
-        m.d.comb += ld_c.r.eq(Repl(ld_die, self.n_reg))
-        m.d.comb += st_c.r.eq(Repl(st_die, self.n_reg))
-
-        # connect input reg bit (unary)
-        i_ext = Repl(self.issue_i, self.n_reg)
-        m.d.comb += ld_c.s.eq(i_ext & self.ld_i)
-        m.d.comb += st_c.s.eq(i_ext & self.st_i)
-
-        # connect up hazard checks: read-after-write and write-after-read
-        m.d.comb += self.ld_fwd_o.eq(ld_c.q & self.st_pend_i)
-        m.d.comb += self.st_fwd_o.eq(st_c.q & self.ld_pend_i)
-
-        # connect reg-sel outputs
-        st_ext = Repl(self.go_st_i, self.n_reg)
-        ld_ext = Repl(self.go_ld_i, self.n_reg)
-        m.d.comb += self.ld_rsel_o.eq(ld_c.qlq & ld_ext)
-        m.d.comb += self.st_rsel_o.eq(st_c.qlq & st_ext)
-
-        # to be accumulated to indicate if register is in use (globally)
-        # after ORing, is fed back in to st_pend_i / ld_pend_i
-        m.d.comb += self.v_st_rsel_o.eq(st_c.qlq)
-        m.d.comb += self.v_ld_rsel_o.eq(ld_c.qlq)
-
-        return m
-
-    def __iter__(self):
-        yield self.ld_i
-        yield self.st_i
-        yield self.st_pend_i
-        yield self.ld_pend_i
-        yield self.issue_i
-        yield self.go_ld_i
-        yield self.go_st_i
-        yield self.go_die_i
-        yield self.v_ld_rsel_o
-        yield self.v_st_rsel_o
-        yield self.ld_rsel_o
-        yield self.st_rsel_o
-        yield self.ld_fwd_o
-        yield self.st_fwd_o
-
-    def ports(self):
-        return list(self)
-
-
-def dcell_sim(dut):
-    yield dut.ld_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.st_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_st_i.eq(1)
-    yield
-    yield dut.go_st_i.eq(0)
-    yield
-    yield dut.go_ld_i.eq(1)
-    yield
-    yield dut.go_ld_i.eq(0)
-    yield
-
-def test_dcell():
-    dut = MemDepRow(4)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_mem_drow.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, dcell_sim(dut), vcd_name='test_mem_dcell.vcd')
-
-if __name__ == '__main__':
-    test_dcell()
diff --git a/src/scoreboard/mem_fu_matrix.py b/src/scoreboard/mem_fu_matrix.py
deleted file mode 100644
index 98595996..00000000
--- a/src/scoreboard/mem_fu_matrix.py
+++ /dev/null
@@ -1,218 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat
-
-from scoreboard.mem_dependence_cell import MemDepRow
-from scoreboard.mem_fu_pending import MemFU_Pend
-from scoreboard.mem_select import Mem_Rsv
-from scoreboard.global_pending import GlobalPending
-
-"""
-
-"""
-
-class MemFUDepMatrix(Elaboratable):
-    """ implements 1st phase Memory-to-FU Dependency Matrix
-    """
-    def __init__(self, n_fu_row, n_reg_col):
-        self.n_fu_row = n_fu_row                  # Y (FUs)   ^v
-        self.n_reg_col = n_reg_col                # X (Regs)  <>
-        self.ld_i = Signal(n_reg_col, reset_less=True)     # LD in (top)
-        self.st_i = Signal(n_reg_col, reset_less=True)     # ST in (top)
-
-        # Register "Global" vectors for determining RaW and WaR hazards
-        self.ld_pend_i = Signal(n_reg_col, reset_less=True) # ld pending (top)
-        self.st_pend_i = Signal(n_reg_col, reset_less=True) # st pending (top)
-        self.v_ld_rsel_o = Signal(n_reg_col, reset_less=True) # ld pending (bot)
-        self.v_st_rsel_o = Signal(n_reg_col, reset_less=True) # st pending (bot)
-
-        self.issue_i = Signal(n_fu_row, reset_less=True)  # Issue in (top)
-        self.go_ld_i = Signal(n_fu_row, reset_less=True)  # Go LOAD in (left)
-        self.go_st_i = Signal(n_fu_row, reset_less=True)  # Go STOR in (left)
-        self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left)
-
-        # for Register File Select Lines (horizontal), per-reg
-        self.ld_rsel_o = Signal(n_reg_col, reset_less=True) # ld reg (bot)
-        self.st_rsel_o = Signal(n_reg_col, reset_less=True) # st reg (bot)
-
-        # for Function Unit "forward progress" (vertical), per-FU
-        self.ld_pend_o = Signal(n_fu_row, reset_less=True) # ld pending (right)
-        self.st_pend_o = Signal(n_fu_row, reset_less=True) # st pending (right)
-
-    def elaborate(self, platform):
-        m = Module()
-
-        # ---
-        # matrix of dependency cells
-        # ---
-        dm = Array(MemDepRow(self.n_reg_col) for r in range(self.n_fu_row))
-        for fu in range(self.n_fu_row):
-            setattr(m.submodules, "dr_fu%d" % fu, dm[fu])
-
-        # ---
-        # array of Function Unit Pending vectors
-        # ---
-        fupend = Array(MemFU_Pend(self.n_reg_col) for f in range(self.n_fu_row))
-        for fu in range(self.n_fu_row):
-            setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu])
-
-        # ---
-        # array of Register Reservation vectors
-        # ---
-        regrsv = Array(Mem_Rsv(self.n_fu_row) for r in range(self.n_reg_col))
-        for rn in range(self.n_reg_col):
-            setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn])
-
-        # ---
-        # connect Function Unit vector
-        # ---
-        ld_pend = []
-        st_pend = []
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
-            fup = fupend[fu]
-            ld_fwd_o = []
-            st_fwd_o = []
-            for rn in range(self.n_reg_col):
-                # accumulate cell fwd outputs for dest/src1
-                ld_fwd_o.append(dc.ld_fwd_o[rn])
-                st_fwd_o.append(dc.st_fwd_o[rn])
-            # connect cell fwd outputs to FU Vector in [Cat is gooood]
-            m.d.comb += [fup.ld_fwd_i.eq(Cat(*ld_fwd_o)),
-                         fup.st_fwd_i.eq(Cat(*st_fwd_o)),
-                        ]
-            # accumulate FU Vector outputs
-            ld_pend.append(fup.reg_ld_pend_o)
-            st_pend.append(fup.reg_st_pend_o)
-
-        # ... and output them from this module (vertical, width=FUs)
-        m.d.comb += self.ld_pend_o.eq(Cat(*ld_pend))
-        m.d.comb += self.st_pend_o.eq(Cat(*st_pend))
-
-        # ---
-        # connect Reg Selection vector
-        # ---
-        ld_rsel = []
-        st_rsel = []
-        for rn in range(self.n_reg_col):
-            rsv = regrsv[rn]
-            ld_rsel_o = []
-            st_rsel_o = []
-            for fu in range(self.n_fu_row):
-                dc = dm[fu]
-                # accumulate cell reg-select outputs dest/src1
-                ld_rsel_o.append(dc.ld_rsel_o[rn])
-                st_rsel_o.append(dc.st_rsel_o[rn])
-            # connect cell reg-select outputs to Reg Vector In
-            m.d.comb += [rsv.ld_rsel_i.eq(Cat(*ld_rsel_o)),
-                         rsv.st_rsel_i.eq(Cat(*st_rsel_o)),
-                        ]
-            # accumulate Reg-Sel Vector outputs
-            ld_rsel.append(rsv.ld_rsel_o)
-            st_rsel.append(rsv.st_rsel_o)
-
-        # ... and output them from this module (horizontal, width=REGs)
-        m.d.comb += self.ld_rsel_o.eq(Cat(*ld_rsel))
-        m.d.comb += self.st_rsel_o.eq(Cat(*st_rsel))
-
-        # ---
-        # connect Dependency Matrix dest/src1/issue to module d/s/s/i
-        # ---
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
-            # wire up inputs from module to row cell inputs (Cat is gooood)
-            m.d.comb += [dc.ld_i.eq(self.ld_i),
-                         dc.st_i.eq(self.st_i),
-                         dc.st_pend_i.eq(self.st_pend_i),
-                         dc.ld_pend_i.eq(self.ld_pend_i),
-                        ]
-
-        # accumulate rsel bits into read/write pending vectors.
-        st_pend_v = []
-        ld_pend_v = []
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
-            st_pend_v.append(dc.v_st_rsel_o)
-            ld_pend_v.append(dc.v_ld_rsel_o)
-        st_v = GlobalPending(self.n_reg_col, st_pend_v)
-        ld_v = GlobalPending(self.n_reg_col, ld_pend_v)
-        m.submodules.st_v = st_v
-        m.submodules.ld_v = ld_v
-
-        m.d.comb += self.v_st_rsel_o.eq(st_v.g_pend_o)
-        m.d.comb += self.v_ld_rsel_o.eq(ld_v.g_pend_o)
-
-        # ---
-        # connect Dep issue_i/go_st_i/go_ld_i to module issue_i/go_rd/go_wr
-        # ---
-        go_st_i = []
-        go_ld_i = []
-        go_die_i = []
-        issue_i = []
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
-            # accumulate cell fwd outputs for dest/src1
-            go_st_i.append(dc.go_st_i)
-            go_ld_i.append(dc.go_ld_i)
-            go_die_i.append(dc.go_die_i)
-            issue_i.append(dc.issue_i)
-        # wire up inputs from module to row cell inputs (Cat is gooood)
-        m.d.comb += [Cat(*go_st_i).eq(self.go_st_i),
-                     Cat(*go_ld_i).eq(self.go_ld_i),
-                     Cat(*go_die_i).eq(self.go_die_i),
-                     Cat(*issue_i).eq(self.issue_i),
-                    ]
-
-        return m
-
-    def __iter__(self):
-        yield self.ld_i
-        yield self.st_i
-        yield self.issue_i
-        yield self.go_ld_i
-        yield self.go_st_i
-        yield self.go_die_i
-        yield self.ld_rsel_o
-        yield self.st_rsel_o
-        yield self.ld_pend_o
-        yield self.st_pend_o
-        yield self.ld_pend_i
-        yield self.st_pend_i
-        yield self.ld_rsel_o
-        yield self.st_rsel_o
-
-    def ports(self):
-        return list(self)
-
-def d_matrix_sim(dut):
-    """ XXX TODO
-    """
-    yield dut.ld_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.st_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_st_i.eq(1)
-    yield
-    yield dut.go_st_i.eq(0)
-    yield
-    yield dut.go_ld_i.eq(1)
-    yield
-    yield dut.go_ld_i.eq(0)
-    yield
-
-def test_d_matrix():
-    dut = MemFUDepMatrix(n_fu_row=3, n_reg_col=3)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_fu_mem_matrix.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_mem_matrix.vcd')
-
-if __name__ == '__main__':
-    test_d_matrix()
diff --git a/src/scoreboard/mem_fu_pending.py b/src/scoreboard/mem_fu_pending.py
deleted file mode 100644
index 951f7ac1..00000000
--- a/src/scoreboard/mem_fu_pending.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from nmigen import Elaboratable, Module, Signal, Cat
-
-
-class MemFU_Pend(Elaboratable):
-    """ these are allocated per-FU (horizontally),
-        and are of length reg_count
-    """
-    def __init__(self, reg_count):
-        self.reg_count = reg_count
-        self.ld_fwd_i = Signal(reg_count, reset_less=True)
-        self.st_fwd_i = Signal(reg_count, reset_less=True)
-
-        self.reg_ld_pend_o = Signal(reset_less=True)
-        self.reg_st_pend_o = Signal(reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.d.comb += self.reg_ld_pend_o.eq(self.ld_fwd_i.bool())
-        m.d.comb += self.reg_st_pend_o.eq(self.st_fwd_i.bool())
-
-        return m
-
diff --git a/src/scoreboard/mem_select.py b/src/scoreboard/mem_select.py
deleted file mode 100644
index 627d7d10..00000000
--- a/src/scoreboard/mem_select.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from nmigen import Elaboratable, Module, Signal
-
-
-class Mem_Rsv(Elaboratable):
-    """ these are allocated per-Register (vertically),
-        and are each of length fu_count
-    """
-    def __init__(self, fu_count):
-        self.fu_count = fu_count
-        self.ld_rsel_i = Signal(fu_count, reset_less=True)
-        self.st_rsel_i = Signal(fu_count, reset_less=True)
-        self.ld_rsel_o = Signal(reset_less=True)
-        self.st_rsel_o = Signal(reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.d.comb += self.ld_rsel_o.eq(self.ld_rsel_i.bool())
-        m.d.comb += self.st_rsel_o.eq(self.st_rsel_i.bool())
-        return m
-
diff --git a/src/scoreboard/memfu.py b/src/scoreboard/memfu.py
deleted file mode 100644
index 857d96c9..00000000
--- a/src/scoreboard/memfu.py
+++ /dev/null
@@ -1,120 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Array, Elaboratable
-
-from scoreboard.fu_fu_matrix import FUFUDepMatrix
-from scoreboard.mdm import FUMemMatchMatrix
-
-
-class MemFunctionUnits(Elaboratable):
-
-    def __init__(self, n_ldsts, addrbitwid):
-        self.n_ldsts = n_ldsts
-        self.bitwid = addrbitwid
-
-        self.st_i = Signal(n_ldsts, reset_less=True) # Dest R# in
-        self.ld_i = Signal(n_ldsts, reset_less=True) # oper1 R# in
-
-        self.g_int_ld_pend_o = Signal(n_ldsts, reset_less=True)
-        self.g_int_st_pend_o = Signal(n_ldsts, reset_less=True)
-
-        self.st_rsel_o = Signal(n_ldsts, reset_less=True) # dest reg (bot)
-        self.ld_rsel_o = Signal(n_ldsts, reset_less=True) # src1 reg (bot)
-
-        self.loadable_o = Signal(n_ldsts, reset_less=True)
-        self.storable_o = Signal(n_ldsts, reset_less=True)
-        self.addr_nomatch_o = Signal(n_ldsts, reset_less=True)
-
-        self.go_ld_i = Signal(n_ldsts, reset_less=True)
-        self.go_st_i = Signal(n_ldsts, reset_less=True)
-        self.go_die_i = Signal(n_ldsts, reset_less=True)
-        self.fn_issue_i = Signal(n_ldsts, reset_less=True)
-
-        # address matching
-        self.addrs_i = Array(Signal(self.bitwid, name="addrs_i%d" % i) \
-                             for i in range(n_ldsts))
-        self.addr_we_i = Signal(n_ldsts) # write-enable for incoming address
-        self.addr_en_i = Signal(n_ldsts) # address latched in
-        self.addr_rs_i = Signal(n_ldsts) # address deactivated
-
-        # Note: FURegs st_pend_o is also outputted from here, for use in WaWGrid
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        sync = m.d.sync
-
-        n_fus = self.n_ldsts
-
-        # Integer FU-FU Dep Matrix
-        intfudeps = FUFUDepMatrix(n_fus, n_fus)
-        m.submodules.intfudeps = intfudeps
-        # Integer FU-Reg Dep Matrix
-        intregdeps = FUMemMatchMatrix(n_fus, self.bitwid)
-        m.submodules.intregdeps = intregdeps
-
-        # ok, because we do not know in advance what the AGEN (address gen)
-        # is, we have to make a transitive dependency set.  i.e. the LD
-        # (or ST) being requested now must depend on ALL prior LDs *AND* STs.
-        # these get dropped very rapidly once AGEN is carried out.
-        # XXX TODO
-
-        # connect fureg matrix as a mem system
-        comb += self.g_int_ld_pend_o.eq(intregdeps.v_rd_rsel_o)
-        comb += self.g_int_st_pend_o.eq(intregdeps.v_wr_rsel_o)
-
-        comb += intregdeps.rd_pend_i.eq(intregdeps.v_rd_rsel_o)
-        comb += intregdeps.wr_pend_i.eq(intregdeps.v_wr_rsel_o)
-
-        comb += intfudeps.rd_pend_i.eq(intregdeps.rd_pend_o)
-        comb += intfudeps.wr_pend_i.eq(intregdeps.wr_pend_o)
-        self.st_pend_o = intregdeps.wr_pend_o # also output for use in WaWGrid
-
-        comb += intfudeps.issue_i.eq(self.fn_issue_i)
-        comb += intfudeps.go_rd_i.eq(self.go_ld_i)
-        comb += intfudeps.go_wr_i.eq(self.go_st_i)
-        comb += intfudeps.go_die_i.eq(self.go_die_i)
-        comb += self.loadable_o.eq(intfudeps.readable_o)
-        comb += self.storable_o.eq(intfudeps.writable_o)
-        comb += self.addr_nomatch_o.eq(intregdeps.addr_nomatch_o)
-
-        # Connect function issue / arrays, and dest/src1/src2
-        comb += intregdeps.dest_i.eq(self.st_i)
-        comb += intregdeps.src_i[0].eq(self.ld_i)
-
-        comb += intregdeps.go_rd_i.eq(self.go_ld_i)
-        comb += intregdeps.go_wr_i.eq(self.go_st_i)
-        comb += intregdeps.go_die_i.eq(self.go_die_i)
-        comb += intregdeps.issue_i.eq(self.fn_issue_i)
-
-        comb += self.st_rsel_o.eq(intregdeps.dest_rsel_o)
-        comb += self.ld_rsel_o.eq(intregdeps.src_rsel_o[0])
-
-        # connect address matching: these get connected to the Addr CUs
-        for i in range(self.n_ldsts):
-            comb += intregdeps.addrs_i[i].eq(self.addrs_i[i])
-        comb += intregdeps.addr_we_i.eq(self.addr_we_i)
-        comb += intregdeps.addr_en_i.eq(self.addr_en_i)
-        comb += intregdeps.addr_rs_i.eq(self.addr_rs_i)
-
-        return m
-
-    def __iter__(self):
-        yield self.ld_i
-        yield self.st_i
-        yield self.g_int_st_pend_o
-        yield self.g_int_ld_pend_o
-        yield self.ld_rsel_o
-        yield self.st_rsel_o
-        yield self.loadable_o
-        yield self.storable_o
-        yield self.go_st_i
-        yield self.go_ld_i
-        yield self.go_die_i
-        yield self.fn_issue_i
-        yield from self.addrs_i
-        yield self.addr_we_i
-        yield self.addr_en_i
-
-    def ports(self):
-        return list(self)
diff --git a/src/scoreboard/reg_select.py b/src/scoreboard/reg_select.py
deleted file mode 100644
index 3919cce3..00000000
--- a/src/scoreboard/reg_select.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from nmigen import Elaboratable, Module, Signal, Array
-
-
-class Reg_Rsv(Elaboratable):
-    """ these are allocated per-Register (vertically),
-        and are each of length fu_count
-    """
-    def __init__(self, fu_count, n_src):
-        self.n_src = n_src
-        self.fu_count = fu_count
-        self.dest_rsel_i = Signal(fu_count, reset_less=True)
-        self.src_rsel_i = Array(Signal(fu_count, name="src_rsel_i",
-                                       reset_less=True) \
-                                for i in range(n_src))
-        self.dest_rsel_o = Signal(reset_less=True)
-        self.src_rsel_o = Signal(n_src, reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.d.comb += self.dest_rsel_o.eq(self.dest_rsel_i.bool())
-        for i in range(self.n_src):
-            m.d.comb += self.src_rsel_o[i].eq(self.src_rsel_i[i].bool())
-        return m
-
diff --git a/src/scoreboard/shadow.py b/src/scoreboard/shadow.py
deleted file mode 100644
index 12f20893..00000000
--- a/src/scoreboard/shadow.py
+++ /dev/null
@@ -1,226 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Cat, Array, Const, Elaboratable, Repl
-from nmigen.lib.coding import Decoder
-
-from scoreboard.shadow_fn import ShadowFn
-
-
-class ShadowMatrix(Elaboratable):
-    """ Matrix of Shadow Functions.  One per FU.
-
-        Inputs
-        * :n_fus:       register file width
-        * :shadow_wid:  number of shadow/fail/good/go_die sets
-
-        Notes:
-
-        * Shadow enable/fail/good are all connected to all Shadow Functions
-          (incoming at the top)
-
-        * Output is an array of "shadow active" (schroedinger wires: neither
-          alive nor dead) and an array of "go die" signals, one per FU.
-
-        * the shadown must be connected to the Computation Unit's
-          write release request, preventing it (ANDing) from firing
-          (and thus preventing Writable.  this by the way being the
-           whole point of having the Shadow Matrix...)
-
-        * go_die_o must be connected to *both* the Computation Unit's
-          src-operand and result-operand latch resets, causing both
-          of them to reset.
-
-        * go_die_o also needs to be wired into the Dependency and Function
-          Unit Matrices by way of over-enabling (ORing) into Go_Read and
-          Go_Write, resetting every cell that is required to "die"
-    """
-    def __init__(self, n_fus, shadow_wid=0, syncreset=False):
-        self.syncreset = syncreset
-        self.n_fus = n_fus
-        self.shadow_wid = shadow_wid
-
-        # inputs
-        self.issue_i = Signal(n_fus, reset_less=True)
-        self.reset_i = Signal(n_fus, reset_less=True)
-        self.shadow_i = Array(Signal(shadow_wid, name="sh_i", reset_less=True) \
-                            for f in range(n_fus))
-        self.s_fail_i = Array(Signal(shadow_wid, name="fl_i", reset_less=True) \
-                            for f in range(n_fus))
-        self.s_good_i = Array(Signal(shadow_wid, name="gd_i", reset_less=True) \
-                            for f in range(n_fus))
-        # outputs
-        self.go_die_o = Signal(n_fus, reset_less=True)
-        self.shadown_o = Signal(n_fus, reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-        shadows = []
-        for i in range(self.n_fus):
-            sh = ShadowFn(self.shadow_wid, self.syncreset)
-            setattr(m.submodules, "sh%d" % i, sh)
-            shadows.append(sh)
-            # connect shadow/fail/good to all shadows
-            m.d.comb += sh.s_fail_i.eq(self.s_fail_i[i])
-            m.d.comb += sh.s_good_i.eq(self.s_good_i[i])
-            # this one is the matrix (shadow enables)
-            m.d.comb += sh.shadow_i.eq(self.shadow_i[i])
-
-        # connect all shadow outputs and issue input
-        issue_l = []
-        reset_l = []
-        sho_l = []
-        rec_l = []
-        for l in shadows:
-            issue_l.append(l.issue_i)
-            reset_l.append(l.reset_i)
-            sho_l.append(l.shadown_o)
-            rec_l.append(l.go_die_o)
-        m.d.comb += Cat(*issue_l).eq(self.issue_i)
-        m.d.comb += Cat(*reset_l).eq(self.reset_i)
-        m.d.comb += self.shadown_o.eq(Cat(*sho_l))
-        m.d.comb += self.go_die_o.eq(Cat(*rec_l))
-
-        return m
-
-    def __iter__(self):
-        yield self.issue_i
-        yield self.reset_i
-        yield from self.shadow_i
-        yield from self.s_fail_i
-        yield from self.s_good_i
-        yield self.go_die_o
-        yield self.shadown_o
-
-    def ports(self):
-        return list(self)
-
-
-class BranchSpeculationRecord(Elaboratable):
-    """ A record of which function units will be cancelled and which
-        allowed to proceed, on a branch.
-
-        Whilst the input is a pair that says whether the instruction is
-        under the "success" branch shadow (good_i) or the "fail" shadow
-        (fail_i path), when the branch result is known, the "good" path
-        must be cancelled if "fail" occurred, and the "fail" path cancelled
-        if "good" occurred.
-
-        therefore, use "good|~fail" and "fail|~good" respectively as
-        output.
-    """
-
-    def __init__(self, n_fus):
-        self.n_fus = n_fus
-
-        # inputs: record *expected* status
-        self.active_i = Signal(reset_less=True)
-        self.good_i = Signal(n_fus, reset_less=True)
-        self.fail_i = Signal(n_fus, reset_less=True)
-
-        # inputs: status of branch (when result was known)
-        self.br_i = Signal(reset_less=True)
-        self.br_ok_i = Signal(reset_less=True)
-
-        # outputs: true if the *expected* outcome matched the *actual* outcome
-        self.match_f_o = Signal(n_fus, reset_less=True)
-        self.match_g_o = Signal(n_fus, reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-
-        # registers to record *expected* status
-        good_r = Signal(self.n_fus)
-        fail_r = Signal(self.n_fus)
-
-        for i in range(self.n_fus):
-            with m.If(self.active_i):
-                m.d.sync += good_r[i].eq(good_r[i] | self.good_i[i])
-                m.d.sync += fail_r[i].eq(fail_r[i] | self.fail_i[i])
-            with m.If(self.br_i):
-                with m.If(good_r[i]):
-                    # we expected good, return OK that good was EXPECTED
-                    m.d.comb += self.match_g_o[i].eq(self.br_ok_i)
-                    m.d.comb += self.match_f_o[i].eq(~self.br_ok_i)
-                with m.If(fail_r[i]):
-                    # we expected fail, return OK that fail was EXPECTED
-                    m.d.comb += self.match_g_o[i].eq(~self.br_ok_i)
-                    m.d.comb += self.match_f_o[i].eq(self.br_ok_i)
-                m.d.sync += good_r[i].eq(0) # might be set if issue set as well
-                m.d.sync += fail_r[i].eq(0) # might be set if issue set as well
-
-        return m
-
-    def __iter__(self):
-        yield self.active_i
-        yield self.good_i
-        yield self.fail_i
-        yield self.br_i
-        yield self.br_good_i
-        yield self.br_fail_i
-        yield self.good_o
-        yield self.fail_o
-
-    def ports(self):
-        return list(self)
-
-
-
-class WaWGrid(Elaboratable):
-    """ An NxM grid-selector which raises a 2D bit selected by N and M
-    """
-
-    def __init__(self, n_fus, shadow_wid):
-        self.n_fus = n_fus
-        self.shadow_wid = shadow_wid
-
-        self.shadow_i = Signal(shadow_wid, reset_less=True)
-        self.fu_i = Signal(n_fus, reset_less=True)
-
-        self.waw_o = Array(Signal(shadow_wid, name="waw_o", reset_less=True) \
-                            for f in range(n_fus))
-
-    def elaborate(self, platform):
-        m = Module()
-        for i in range(self.n_fus):
-            v = Repl(self.fu_i[i], self.shadow_wid)
-            m.d.comb += self.waw_o[i].eq(v & self.shadow_i)
-        return m
-
-
-def shadow_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-def test_shadow():
-    dut = ShadowMatrix(4, 2)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_shadow.il", "w") as f:
-        f.write(vl)
-
-    dut = BranchSpeculationRecord(4)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_branchspecrecord.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, shadow_sim(dut), vcd_name='test_shadow.vcd')
-
-if __name__ == '__main__':
-    test_shadow()
diff --git a/src/scoreboard/shadow_fn.py b/src/scoreboard/shadow_fn.py
deleted file mode 100644
index 69a56a5c..00000000
--- a/src/scoreboard/shadow_fn.py
+++ /dev/null
@@ -1,111 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Cat, Repl, Const, Elaboratable
-from nmutil.latch import SRLatch
-
-
-class ShadowFn(Elaboratable):
-    """ implements shadowing 11.5.1, p55, just the individual shadow function
-
-        shadowing can be used for branches as well as exceptions (interrupts),
-        load/store hold (exceptions again), and vector-element predication
-        (once the predicate is known, which it may not be at instruction issue)
-
-        Inputs
-        * :shadow_wid:  number of shadow/fail/good/go_die sets
-
-        notes:
-        * when shadow_wid = 0, recover and shadown are Consts (i.e. do nothing)
-    """
-    def __init__(self, slen, syncreset=False):
-
-        self.slen = slen
-        self.syncreset = syncreset
-
-        if self.slen:
-            # inputs
-            self.issue_i = Signal(reset_less=True)
-            self.shadow_i  = Signal(slen, reset_less=True)
-            self.reset_i  = Signal(reset_less=True)
-            self.s_fail_i  = Signal(slen, reset_less=True)
-            self.s_good_i  = Signal(slen, reset_less=True)
-
-            # outputs
-            self.shadown_o = Signal(reset_less=True)
-            self.go_die_o = Signal(reset_less=True)
-        else:
-            # outputs when no shadowing needed
-            self.shadown_o = Const(1)
-            self.go_die_o = Const(0)
-
-    def elaborate(self, platform):
-        m = Module()
-        if self.slen == 0:
-            return
-
-        m.submodules.sl = sl = SRLatch(sync=False, llen=self.slen)
-
-        r_ext = Repl(self.reset_i, self.slen)
-        reset_r = Signal(self.slen)
-        if self.syncreset:
-            m.d.comb += reset_r.eq(self.s_good_i | self.s_fail_i | r_ext)
-        else:
-            m.d.comb += reset_r.eq(self.s_good_i | self.s_fail_i | r_ext)
-
-        i_ext = Repl(self.issue_i, self.slen)
-        m.d.comb += sl.s.eq(self.shadow_i & i_ext & \
-                            ~self.s_good_i & ~reset_r)
-        m.d.comb += sl.r.eq(r_ext | reset_r | self.s_good_i | \
-                            (i_ext & ~self.shadow_i))
-        m.d.comb += self.go_die_o.eq((sl.qlq & self.s_fail_i).bool())
-        m.d.comb += self.shadown_o.eq(~sl.qlq.bool())
-
-        return m
-
-    def __iter__(self):
-        yield self.issue_i
-        yield self.reset_i
-        yield self.shadow_i
-        yield self.s_fail_i
-        yield self.s_good_i
-        yield self.shadown_o
-        yield self.go_die_o
-
-    def ports(self):
-        return list(self)
-
-
-def shadow_fn_unit_sim(dut):
-    yield dut.dest_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.src1_i.eq(1)
-    yield dut.issue_i.eq(1)
-    yield
-    yield
-    yield
-    yield dut.issue_i.eq(0)
-    yield
-    yield dut.go_rd_i.eq(1)
-    yield
-    yield dut.go_rd_i.eq(0)
-    yield
-    yield dut.go_wr_i.eq(1)
-    yield
-    yield dut.go_wr_i.eq(0)
-    yield
-
-
-def test_shadow_fn_unit():
-    dut = ShadowFn(4)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_shadow_fn_unit.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, shadow_fn_unit_sim(dut),
-                   vcd_name='test_shadow_fn_unit.vcd')
-
-if __name__ == '__main__':
-    test_shadow_fn_unit()
diff --git a/src/scoreboard/test_iq.py b/src/scoreboard/test_iq.py
deleted file mode 100644
index 94ceac7e..00000000
--- a/src/scoreboard/test_iq.py
+++ /dev/null
@@ -1,126 +0,0 @@
-""" testing of InstructionQ
-"""
-
-from copy import deepcopy
-from random import randint
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-
-from scoreboard.instruction_q import InstructionQ
-from nmutil.nmoperator import eq
-
-
-class IQSim:
-    def __init__(self, dut, iq, n_in, n_out):
-        self.dut = dut
-        self.iq = iq
-        self.oq = []
-        self.n_in = n_in
-        self.n_out = n_out
-
-    def send(self):
-        i = 0
-        while i < len(self.iq):
-            sendlen = randint(1, self.n_in)
-            sendlen = 1
-            sendlen = min(len(self.iq) - i, sendlen)
-            print ("sendlen", len(self.iq)-i, sendlen)
-            for idx in range(sendlen):
-                instr = self.iq[i+idx]
-                yield from eq(self.dut.data_i[idx], instr)
-                di = yield self.dut.data_i[idx]#.src1_i
-                print ("senddata %d %x" % ((i+idx), di))
-                self.oq.append(di)
-            yield self.dut.p_add_i.eq(sendlen)
-            yield
-            o_p_ready = yield self.dut.p_ready_o
-            while not o_p_ready:
-                yield
-                o_p_ready = yield self.dut.p_ready_o
-
-            yield self.dut.p_add_i.eq(0)
-
-            print ("send", len(self.iq), i, sendlen)
-
-            # wait random period of time before queueing another value
-            for j in range(randint(0, 3)):
-                yield
-
-            i += sendlen
-
-        yield self.dut.p_add_i.eq(0)
-        yield
-
-        print ("send ended")
-
-        ## wait random period of time before queueing another value
-        #for i in range(randint(0, 3)):
-        #    yield
-
-        #send_range = randint(0, 3)
-        #if send_range == 0:
-        #    send = True
-        #else:
-        #    send = randint(0, send_range) != 0
-
-    def rcv(self):
-        i = 0
-        yield
-        yield
-        yield
-        while i < len(self.iq):
-            rcvlen = randint(1, self.n_out)
-            #print ("outreq", rcvlen)
-            yield self.dut.n_sub_i.eq(rcvlen)
-            n_sub_o = yield self.dut.n_sub_o
-            print ("recv", n_sub_o)
-            for j in range(n_sub_o):
-                r = yield self.dut.data_o[j]#.src1_i
-                print ("recvdata %x %s" % (r, repr(self.iq[i+j])))
-                assert r == self.oq[i+j]
-            yield
-            if n_sub_o == 0:
-                continue
-            yield self.dut.n_sub_i.eq(0)
-
-            i += n_sub_o
-
-        print ("recv ended")
-
-
-def mk_insns(n_insns, wid, opwid):
-    res = []
-    for i in range(n_insns):
-        op1 = randint(0, (1<<wid)-1)
-        opi = randint(0, 1)
-        op2 = randint(0, (1<<wid)-1)
-        dst = randint(0, (1<<wid)-1)
-        oper = randint(0, (1<<opwid)-1)
-        imm = randint(0, (1<<wid)-1)
-        res.append({'oper_i': oper, 'opim_i': opi, 
-                    'imm_i': imm, 'dest_i': dst,
-                    'src1_i': op1, 'src2_i': op2})
-    return res
-
-
-def test_iq():
-    wid = 8
-    opwid = 4
-    qlen = 2
-    n_in = 1
-    n_out = 1
-    dut = InstructionQ(wid, opwid, qlen, n_in, n_out)
-    insns = mk_insns(1000, wid, opwid)
-
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_iq.il", "w") as f:
-        f.write(vl)
-
-    test = IQSim(dut, insns, n_in, n_out)
-    print (insns)
-    run_simulation(dut, [test.rcv(), test.send()
-                        ],
-                   vcd_name="test_iq.vcd")
-
-if __name__ == '__main__':
-    test_iq()
diff --git a/src/scoreboard/test_mem2_fu_matrix.py b/src/scoreboard/test_mem2_fu_matrix.py
deleted file mode 100644
index 0b0150ea..00000000
--- a/src/scoreboard/test_mem2_fu_matrix.py
+++ /dev/null
@@ -1,586 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
-
-from regfile.regfile import RegFileArray, treereduce
-from scoreboard.global_pending import GlobalPending
-from scoreboard.group_picker import GroupPicker
-from scoreboard.issue_unit import IssueUnitGroup, IssueUnitArray, RegDecode
-from scoreboard.shadow import ShadowMatrix, BranchSpeculationRecord
-from scoreboard.memfu import MemFunctionUnits
-from nmutil.latch import SRLatch
-from nmutil.nmoperator import eq
-
-from random import randint, seed
-from copy import deepcopy
-from math import log
-
-
-class Memory(Elaboratable):
-    def __init__(self, regwid, addrw):
-        self.ddepth = regwid/8
-        depth = (1<<addrw) / self.ddepth
-        self.adr   = Signal(addrw)
-        self.dat_r = Signal(regwid)
-        self.dat_w = Signal(regwid)
-        self.we    = Signal()
-        self.mem   = Memory(width=regwid, depth=depth, init=range(0, depth))
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.rdport = rdport = self.mem.read_port()
-        m.submodules.wrport = wrport = self.mem.write_port()
-        m.d.comb += [
-            rdport.addr.eq(self.adr[self.ddepth:]), # ignore low bits
-            self.dat_r.eq(rdport.data),
-            wrport.addr.eq(self.adr),
-            wrport.data.eq(self.dat_w),
-            wrport.en.eq(self.we),
-        ]
-        return m
-
-
-class MemSim:
-    def __init__(self, regwid, addrw):
-        self.regwid = regwid
-        self.ddepth = regwid//8
-        depth = (1<<addrw) // self.ddepth
-        self.mem = list(range(0, depth))
-
-    def ld(self, addr):
-        return self.mem[addr>>self.ddepth]
-
-    def st(self, addr, data):
-        self.mem[addr>>self.ddepth] = data & ((1<<self.regwid)-1)
-
-
-class Scoreboard(Elaboratable):
-    def __init__(self, rwid, n_regs):
-        """ Inputs:
-
-            * :rwid:   bit width of register file(s) - both FP and INT
-            * :n_regs: depth of register file(s) - number of FP and INT regs
-        """
-        self.rwid = rwid
-        self.n_regs = n_regs
-
-        # Register Files
-        self.intregs = RegFileArray(rwid, n_regs)
-        self.fpregs = RegFileArray(rwid, n_regs)
-
-        # issue q needs to get at these
-        self.aluissue = IssueUnitGroup(4)
-        self.brissue = IssueUnitGroup(1)
-        # and these
-        self.alu_oper_i = Signal(4, reset_less=True)
-        self.alu_imm_i = Signal(rwid, reset_less=True)
-        self.br_oper_i = Signal(4, reset_less=True)
-        self.br_imm_i = Signal(rwid, reset_less=True)
-
-        # inputs
-        self.int_dest_i = Signal(max=n_regs, reset_less=True) # Dest R# in
-        self.int_src1_i = Signal(max=n_regs, reset_less=True) # oper1 R# in
-        self.int_src2_i = Signal(max=n_regs, reset_less=True) # oper2 R# in
-        self.reg_enable_i = Signal(reset_less=True) # enable reg decode
-
-        # outputs
-        self.issue_o = Signal(reset_less=True) # instruction was accepted
-        self.busy_o = Signal(reset_less=True) # at least one CU is busy
-
-        # for branch speculation experiment.  branch_direction = 0 if
-        # the branch hasn't been met yet.  1 indicates "success", 2 is "fail"
-        # branch_succ and branch_fail are requests to have the current
-        # instruction be dependent on the branch unit "shadow" capability.
-        self.branch_succ_i = Signal(reset_less=True)
-        self.branch_fail_i = Signal(reset_less=True)
-        self.branch_direction_o = Signal(2, reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        sync = m.d.sync
-
-        m.submodules.intregs = self.intregs
-        m.submodules.fpregs = self.fpregs
-
-        # register ports
-        int_dest = self.intregs.write_port("dest")
-        int_src1 = self.intregs.read_port("src1")
-        int_src2 = self.intregs.read_port("src2")
-
-        fp_dest = self.fpregs.write_port("dest")
-        fp_src1 = self.fpregs.read_port("src1")
-        fp_src2 = self.fpregs.read_port("src2")
-
-        # Int ALUs and Comp Units
-        n_int_alus = 5
-        cua = CompUnitALUs(self.rwid, 3)
-        cub = CompUnitBR(self.rwid, 3)
-        m.submodules.cu = cu = CompUnitsBase(self.rwid, [cua, cub])
-        bgt = cub.bgt # get at the branch computation unit
-        br1 = cub.br1
-
-        # Int FUs
-        m.submodules.intfus = intfus = FunctionUnits(self.n_regs, n_int_alus)
-
-        # Count of number of FUs
-        n_intfus = n_int_alus
-        n_fp_fus = 0 # for now
-
-        # Integer Priority Picker 1: Adder + Subtractor
-        intpick1 = GroupPicker(n_intfus) # picks between add, sub, mul and shf
-        m.submodules.intpick1 = intpick1
-
-        # INT/FP Issue Unit
-        regdecode = RegDecode(self.n_regs)
-        m.submodules.regdecode = regdecode
-        issueunit = IssueUnitArray([self.aluissue, self.brissue])
-        m.submodules.issueunit = issueunit
-
-        # Shadow Matrix.  currently n_intfus shadows, to be used for
-        # write-after-write hazards.  NOTE: there is one extra for branches,
-        # so the shadow width is increased by 1
-        m.submodules.shadows = shadows = ShadowMatrix(n_intfus, n_intfus, True)
-        m.submodules.bshadow = bshadow = ShadowMatrix(n_intfus, 1, False)
-
-        # record previous instruction to cast shadow on current instruction
-        prev_shadow = Signal(n_intfus)
-
-        # Branch Speculation recorder.  tracks the success/fail state as
-        # each instruction is issued, so that when the branch occurs the
-        # allow/cancel can be issued as appropriate.
-        m.submodules.specrec = bspec = BranchSpeculationRecord(n_intfus)
-
-        #---------
-        # ok start wiring things together...
-        # "now hear de word of de looord... dem bones dem bones dem dryy bones"
-        # https://www.youtube.com/watch?v=pYb8Wm6-QfA
-        #---------
-
-        #---------
-        # Issue Unit is where it starts.  set up some in/outs for this module
-        #---------
-        comb += [    regdecode.dest_i.eq(self.int_dest_i),
-                     regdecode.src1_i.eq(self.int_src1_i),
-                     regdecode.src2_i.eq(self.int_src2_i),
-                     regdecode.enable_i.eq(self.reg_enable_i),
-                     self.issue_o.eq(issueunit.issue_o)
-                    ]
-
-        # take these to outside (issue needs them)
-        comb += cua.oper_i.eq(self.alu_oper_i)
-        comb += cua.imm_i.eq(self.alu_imm_i)
-        comb += cub.oper_i.eq(self.br_oper_i)
-        comb += cub.imm_i.eq(self.br_imm_i)
-
-        # TODO: issueunit.f (FP)
-
-        # and int function issue / busy arrays, and dest/src1/src2
-        comb += intfus.dest_i.eq(regdecode.dest_o)
-        comb += intfus.src1_i.eq(regdecode.src1_o)
-        comb += intfus.src2_i.eq(regdecode.src2_o)
-
-        fn_issue_o = issueunit.fn_issue_o
-
-        comb += intfus.fn_issue_i.eq(fn_issue_o)
-        comb += issueunit.busy_i.eq(cu.busy_o)
-        comb += self.busy_o.eq(cu.busy_o.bool())
-
-        #---------
-        # merge shadow matrices outputs
-        #---------
-
-        # these are explained in ShadowMatrix docstring, and are to be
-        # connected to the FUReg and FUFU Matrices, to get them to reset
-        anydie = Signal(n_intfus, reset_less=True)
-        allshadown = Signal(n_intfus, reset_less=True)
-        shreset = Signal(n_intfus, reset_less=True)
-        comb += allshadown.eq(shadows.shadown_o & bshadow.shadown_o)
-        comb += anydie.eq(shadows.go_die_o | bshadow.go_die_o)
-        comb += shreset.eq(bspec.match_g_o | bspec.match_f_o)
-
-        #---------
-        # connect fu-fu matrix
-        #---------
-
-        # Group Picker... done manually for now.
-        go_rd_o = intpick1.go_rd_o
-        go_wr_o = intpick1.go_wr_o
-        go_rd_i = intfus.go_rd_i
-        go_wr_i = intfus.go_wr_i
-        go_die_i = intfus.go_die_i
-        # NOTE: connect to the shadowed versions so that they can "die" (reset)
-        comb += go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus]) # rd
-        comb += go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus]) # wr
-        comb += go_die_i[0:n_intfus].eq(anydie[0:n_intfus]) # die
-
-        # Connect Picker
-        #---------
-        comb += intpick1.rd_rel_i[0:n_intfus].eq(cu.rd_rel_o[0:n_intfus])
-        comb += intpick1.req_rel_i[0:n_intfus].eq(cu.req_rel_o[0:n_intfus])
-        int_rd_o = intfus.readable_o
-        int_wr_o = intfus.writable_o
-        comb += intpick1.readable_i[0:n_intfus].eq(int_rd_o[0:n_intfus])
-        comb += intpick1.writable_i[0:n_intfus].eq(int_wr_o[0:n_intfus])
-
-        #---------
-        # Shadow Matrix
-        #---------
-
-        comb += shadows.issue_i.eq(fn_issue_o)
-        #comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
-        comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
-        #---------
-        # NOTE; this setup is for the instruction order preservation...
-
-        # connect shadows / go_dies to Computation Units
-        comb += cu.shadown_i[0:n_intfus].eq(allshadown)
-        comb += cu.go_die_i[0:n_intfus].eq(anydie)
-
-        # ok connect first n_int_fu shadows to busy lines, to create an
-        # instruction-order linked-list-like arrangement, using a bit-matrix
-        # (instead of e.g. a ring buffer).
-        # XXX TODO
-
-        # when written, the shadow can be cancelled (and was good)
-        for i in range(n_intfus):
-            comb += shadows.s_good_i[i][0:n_intfus].eq(go_wr_o[0:n_intfus])
-
-        # *previous* instruction shadows *current* instruction, and, obviously,
-        # if the previous is completed (!busy) don't cast the shadow!
-        comb += prev_shadow.eq(~fn_issue_o & cu.busy_o)
-        for i in range(n_intfus):
-            comb += shadows.shadow_i[i][0:n_intfus].eq(prev_shadow)
-
-        #---------
-        # ... and this is for branch speculation.  it uses the extra bit
-        # tacked onto the ShadowMatrix (hence shadow_wid=n_intfus+1)
-        # only needs to set shadow_i, s_fail_i and s_good_i
-
-        # issue captures shadow_i (if enabled)
-        comb += bshadow.reset_i[0:n_intfus].eq(shreset[0:n_intfus])
-
-        bactive = Signal(reset_less=True)
-        comb += bactive.eq((bspec.active_i | br1.issue_i) & ~br1.go_wr_i)
-
-        # instruction being issued (fn_issue_o) has a shadow cast by the branch
-        with m.If(bactive & (self.branch_succ_i | self.branch_fail_i)):
-            comb += bshadow.issue_i.eq(fn_issue_o)
-            for i in range(n_intfus):
-                with m.If(fn_issue_o & (Const(1<<i))):
-                    comb += bshadow.shadow_i[i][0].eq(1)
-
-        # finally, we need an indicator to the test infrastructure as to
-        # whether the branch succeeded or failed, plus, link up to the
-        # "recorder" of whether the instruction was under shadow or not
-
-        with m.If(br1.issue_i):
-            sync += bspec.active_i.eq(1)
-        with m.If(self.branch_succ_i):
-            comb += bspec.good_i.eq(fn_issue_o & 0x1f)
-        with m.If(self.branch_fail_i):
-            comb += bspec.fail_i.eq(fn_issue_o & 0x1f)
-
-        # branch is active (TODO: a better signal: this is over-using the
-        # go_write signal - actually the branch should not be "writing")
-        with m.If(br1.go_wr_i):
-            sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
-            sync += bspec.active_i.eq(0)
-            comb += bspec.br_i.eq(1)
-            # branch occurs if data == 1, failed if data == 0
-            comb += bspec.br_ok_i.eq(br1.data_o == 1)
-            for i in range(n_intfus):
-                # *expected* direction of the branch matched against *actual*
-                comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
-                # ... or it didn't
-                comb += bshadow.s_fail_i[i][0].eq(bspec.match_f_o[i])
-
-        #---------
-        # Connect Register File(s)
-        #---------
-        comb += int_dest.wen.eq(intfus.dest_rsel_o)
-        comb += int_src1.ren.eq(intfus.src1_rsel_o)
-        comb += int_src2.ren.eq(intfus.src2_rsel_o)
-
-        # connect ALUs to regfule
-        comb += int_dest.data_i.eq(cu.data_o)
-        comb += cu.src1_i.eq(int_src1.data_o)
-        comb += cu.src2_i.eq(int_src2.data_o)
-
-        # connect ALU Computation Units
-        comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
-        comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus])
-        comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
-
-        return m
-
-    def __iter__(self):
-        yield from self.intregs
-        yield from self.fpregs
-        yield self.int_dest_i
-        yield self.int_src1_i
-        yield self.int_src2_i
-        yield self.issue_o
-        yield self.branch_succ_i
-        yield self.branch_fail_i
-        yield self.branch_direction_o
-
-    def ports(self):
-        return list(self)
-
-
-
-
-def int_instr(dut, op, imm, src1, src2, dest, branch_success, branch_fail):
-    yield from disable_issue(dut)
-    yield dut.int_dest_i.eq(dest)
-    yield dut.int_src1_i.eq(src1)
-    yield dut.int_src2_i.eq(src2)
-    if (op & (0x3<<2)) != 0: # branch
-        yield dut.brissue.insn_i.eq(1)
-        yield dut.br_oper_i.eq(Const(op & 0x3, 2))
-        yield dut.br_imm_i.eq(imm)
-        dut_issue = dut.brissue
-    else:
-        yield dut.aluissue.insn_i.eq(1)
-        yield dut.alu_oper_i.eq(Const(op & 0x3, 2))
-        yield dut.alu_imm_i.eq(imm)
-        dut_issue = dut.aluissue
-    yield dut.reg_enable_i.eq(1)
-
-    # these indicate that the instruction is to be made shadow-dependent on
-    # (either) branch success or branch fail
-    yield dut.branch_fail_i.eq(branch_fail)
-    yield dut.branch_succ_i.eq(branch_success)
-
-    yield
-    yield from wait_for_issue(dut, dut_issue)
-
-
-def print_reg(dut, rnums):
-    rs = []
-    for rnum in rnums:
-        reg = yield dut.intregs.regs[rnum].reg
-        rs.append("%x" % reg)
-    rnums = map(str, rnums)
-    print ("reg %s: %s" % (','.join(rnums), ','.join(rs)))
-
-
-def create_random_ops(dut, n_ops, shadowing=False, max_opnums=3):
-    insts = []
-    for i in range(n_ops):
-        src1 = randint(1, dut.n_regs-1)
-        src2 = randint(1, dut.n_regs-1)
-        imm = randint(1, (1<<dut.rwid)-1)
-        dest = randint(1, dut.n_regs-1)
-        op = randint(0, max_opnums)
-        opi = 0 if randint(0, 2) else 1 # set true if random is nonzero
-
-        if shadowing:
-            insts.append((src1, src2, dest, op, opi, imm, (0, 0)))
-        else:
-            insts.append((src1, src2, dest, op, opi, imm))
-    return insts
-
-
-
-def scoreboard_sim(dut, alusim):
-
-    seed(0)
-
-    for i in range(50):
-
-        # set random values in the registers
-        for i in range(1, dut.n_regs):
-            val = randint(0, (1<<alusim.rwidth)-1)
-            #val = 31+i*3
-            #val = i
-            yield dut.intregs.regs[i].reg.eq(val)
-            alusim.setval(i, val)
-
-        # create some instructions (some random, some regression tests)
-        instrs = []
-        if True:
-            instrs = create_random_ops(dut, 15, True, 4)
-
-        if False:
-            instrs.append( (1, 2, 2, 1, 1, 20, (0, 0)) )
-
-        if False:
-            instrs.append( (7, 3, 2, 4, (0, 0)) )
-            instrs.append( (7, 6, 6, 2, (0, 0)) )
-            instrs.append( (1, 7, 2, 2, (0, 0)) )
-
-        if False:
-            instrs.append((2, 3, 3, 0, 0, 0, (0, 0)))
-            instrs.append((5, 3, 3, 1, 0, 0, (0, 0)))
-            instrs.append((3, 5, 5, 2, 0, 0, (0, 0)))
-            instrs.append((5, 3, 3, 3, 0, 0, (0, 0)))
-            instrs.append((3, 5, 5, 0, 0, 0, (0, 0)))
-
-        if False:
-            instrs.append( (3, 3, 4, 0, 0, 13979, (0, 0)))
-            instrs.append( (6, 4, 1, 2, 0, 40976, (0, 0)))
-            instrs.append( (1, 4, 7, 4, 1, 23652, (0, 0)))
-
-        if False:
-            instrs.append((5, 6, 2, 1))
-            instrs.append((2, 2, 4, 0))
-            #instrs.append((2, 2, 3, 1))
-
-        if False:
-            instrs.append((2, 1, 2, 3))
-
-        if False:
-            instrs.append((2, 6, 2, 1))
-            instrs.append((2, 1, 2, 0))
-
-        if False:
-            instrs.append((1, 2, 7, 2))
-            instrs.append((7, 1, 5, 0))
-            instrs.append((4, 4, 1, 1))
-
-        if False:
-            instrs.append((5, 6, 2, 2))
-            instrs.append((1, 1, 4, 1))
-            instrs.append((6, 5, 3, 0))
-
-        if False:
-            # Write-after-Write Hazard
-            instrs.append( (3, 6, 7, 2) )
-            instrs.append( (4, 4, 7, 1) )
-
-        if False:
-            # self-read/write-after-write followed by Read-after-Write
-            instrs.append((1, 1, 1, 1))
-            instrs.append((1, 5, 3, 0))
-
-        if False:
-            # Read-after-Write followed by self-read-after-write
-            instrs.append((5, 6, 1, 2))
-            instrs.append((1, 1, 1, 1))
-
-        if False:
-            # self-read-write sandwich
-            instrs.append((5, 6, 1, 2))
-            instrs.append((1, 1, 1, 1))
-            instrs.append((1, 5, 3, 0))
-
-        if False:
-            # very weird failure
-            instrs.append( (5, 2, 5, 2) )
-            instrs.append( (2, 6, 3, 0) )
-            instrs.append( (4, 2, 2, 1) )
-
-        if False:
-            v1 = 4
-            yield dut.intregs.regs[5].reg.eq(v1)
-            alusim.setval(5, v1)
-            yield dut.intregs.regs[3].reg.eq(5)
-            alusim.setval(3, 5)
-            instrs.append((5, 3, 3, 4, (0, 0)))
-            instrs.append((4, 2, 1, 2, (0, 1)))
-
-        if False:
-            v1 = 6
-            yield dut.intregs.regs[5].reg.eq(v1)
-            alusim.setval(5, v1)
-            yield dut.intregs.regs[3].reg.eq(5)
-            alusim.setval(3, 5)
-            instrs.append((5, 3, 3, 4, (0, 0)))
-            instrs.append((4, 2, 1, 2, (1, 0)))
-
-        if False:
-            instrs.append( (4, 3, 5, 1, 0, (0, 0)) )
-            instrs.append( (5, 2, 3, 1, 0, (0, 0)) )
-            instrs.append( (7, 1, 5, 2, 0, (0, 0)) )
-            instrs.append( (5, 6, 6, 4, 0, (0, 0)) )
-            instrs.append( (7, 5, 2, 2, 0, (1, 0)) )
-            instrs.append( (1, 7, 5, 0, 0, (0, 1)) )
-            instrs.append( (1, 6, 1, 2, 0, (1, 0)) )
-            instrs.append( (1, 6, 7, 3, 0, (0, 0)) )
-            instrs.append( (6, 7, 7, 0, 0, (0, 0)) )
-
-        # issue instruction(s), wait for issue to be free before proceeding
-        for i, instr in enumerate(instrs):
-            src1, src2, dest, op, opi, imm, (br_ok, br_fail) = instr
-
-            print ("instr %d: (%d, %d, %d, %d, %d, %d)" % \
-                    (i, src1, src2, dest, op, opi, imm))
-            alusim.op(op, opi, imm, src1, src2, dest)
-            yield from instr_q(dut, op, opi, imm, src1, src2, dest,
-                               br_ok, br_fail)
-
-        # wait for all instructions to stop before checking
-        while True:
-            iqlen = yield dut.qlen_o
-            if iqlen == 0:
-                break
-            yield
-        yield
-        yield
-        yield
-        yield
-        yield from wait_for_busy_clear(dut)
-
-        # check status
-        yield from alusim.check(dut)
-        yield from alusim.dump(dut)
-
-
-def test_scoreboard():
-    dut = IssueToScoreboard(2, 1, 1, 16, 8, 8)
-    alusim = RegSim(16, 8)
-    memsim = MemSim(16, 16)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_scoreboard6600.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, scoreboard_sim(dut, alusim),
-                        vcd_name='test_scoreboard6600.vcd')
-
-    #run_simulation(dut, scoreboard_branch_sim(dut, alusim),
-    #                    vcd_name='test_scoreboard6600.vcd')
-
-
-def mem_sim(dut):
-    yield dut.ld_i.eq(0x1)
-    yield dut.fn_issue_i.eq(0x1)
-    yield
-    yield dut.ld_i.eq(0x0)
-    yield dut.st_i.eq(0x3)
-    yield dut.fn_issue_i.eq(0x2)
-    yield
-    yield dut.st_i.eq(0x0)
-    yield dut.fn_issue_i.eq(0x0)
-    yield
-
-    yield dut.addrs_i[0].eq(0x012)
-    yield dut.addrs_i[1].eq(0x012)
-    yield dut.addrs_i[2].eq(0x010)
-    yield dut.addr_en_i.eq(0x3)
-    yield
-    yield dut.addr_we_i.eq(0x3)
-    yield
-    yield dut.go_ld_i.eq(0x1)
-    yield
-    yield dut.go_ld_i.eq(0x0)
-    yield
-    yield dut.go_st_i.eq(0x2)
-    yield
-    yield dut.go_st_i.eq(0x0)
-    yield
-
-
-def test_mem_fus():
-    dut = MemFunctionUnits(3, 11)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_mem_fus.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, mem_sim(dut),
-                        vcd_name='test_mem_fus.vcd')
-
-
-if __name__ == '__main__':
-    test_mem_fus()
diff --git a/src/scoreboard/test_mem_fu_matrix.py b/src/scoreboard/test_mem_fu_matrix.py
deleted file mode 100644
index 9d2a7c6b..00000000
--- a/src/scoreboard/test_mem_fu_matrix.py
+++ /dev/null
@@ -1,679 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
-
-from regfile.regfile import RegFileArray, treereduce
-from scoreboard.ldst_matrix import LDSTDepMatrix
-from scoreboard.fu_mem_matrix import FUMemDepMatrix
-from scoreboard.global_pending import GlobalPending
-from scoreboard.group_picker import GroupPicker
-from scoreboard.issue_unit import IssueUnitGroup, IssueUnitArray, RegDecode
-from scoreboard.shadow import ShadowMatrix, BranchSpeculationRecord
-
-from nmutil.latch import SRLatch
-from nmutil.nmoperator import eq
-
-from random import randint, seed
-from copy import deepcopy
-from math import log
-
-
-class Memory(Elaboratable):
-    def __init__(self, regwid, addrw):
-        self.ddepth = regwid/8
-        depth = (1<<addrw) / self.ddepth
-        self.adr   = Signal(addrw)
-        self.dat_r = Signal(regwid)
-        self.dat_w = Signal(regwid)
-        self.we    = Signal()
-        self.mem   = Memory(width=regwid, depth=depth, init=range(0, depth))
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.rdport = rdport = self.mem.read_port()
-        m.submodules.wrport = wrport = self.mem.write_port()
-        m.d.comb += [
-            rdport.addr.eq(self.adr[self.ddepth:]), # ignore low bits
-            self.dat_r.eq(rdport.data),
-            wrport.addr.eq(self.adr),
-            wrport.data.eq(self.dat_w),
-            wrport.en.eq(self.we),
-        ]
-        return m
-
-
-class MemSim:
-    def __init__(self, regwid, addrw):
-        self.regwid = regwid
-        self.ddepth = regwid//8
-        depth = (1<<addrw) // self.ddepth
-        self.mem = list(range(0, depth))
-
-    def ld(self, addr):
-        return self.mem[addr>>self.ddepth]
-
-    def st(self, addr, data):
-        self.mem[addr>>self.ddepth] = data & ((1<<self.regwid)-1)
-
-
-class MemFunctionUnits(Elaboratable):
-
-    def __init__(self, n_int_alus):
-        self.n_int_alus = n_int_alus
-
-        self.ld_i = Signal(n_int_alus, reset_less=True) # Dest R# in
-        self.st_i = Signal(n_int_alus, reset_less=True) # oper1 R# in
-
-        self.load_hit_i = Signal(n_int_alus, reset_less=True) # Load Hit
-        self.stwd_hit_i = Signal(n_int_alus, reset_less=True) # Store Hit
-
-        #self.g_int_st_pend_o = Signal(n_int_alus, reset_less=True)
-        #self.g_int_ld_pend_o = Signal(n_int_alus, reset_less=True)
-
-        #self.ld_rsel_o = Signal(n_int_alus, reset_less=True) # dest reg (bot)
-        #self.st_rsel_o = Signal(n_int_alus, reset_less=True) # src1 reg (bot)
-
-        self.req_rel_i = Signal(n_int_alus, reset_less = True)
-        self.loadable_o = Signal(n_int_alus, reset_less=True)
-        self.storable_o = Signal(n_int_alus, reset_less=True)
-
-        self.go_st_i = Signal(n_int_alus, reset_less=True)
-        self.go_ld_i = Signal(n_int_alus, reset_less=True)
-        self.go_die_i = Signal(n_int_alus, reset_less=True)
-        self.req_rel_o = Signal(n_int_alus, reset_less=True)
-        self.fn_issue_i = Signal(n_int_alus, reset_less=True)
-
-        # Note: FURegs ld_pend_o is also outputted from here, for use in WaWGrid
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        sync = m.d.sync
-
-        n_intfus = self.n_int_alus
-
-        # Integer LD/ST Dep Matrix
-        ldstdeps = LDSTDepMatrix(n_intfus)
-        m.submodules.ldstdeps = ldstdeps
-        # Integer FU-Mem Dep Matrix
-        fumemdeps = FUMemDepMatrix(n_intfus, n_intfus)
-        m.submodules.fumemdeps = fumemdeps
-
-        #comb += self.g_int_st_pend_o.eq(fumemdeps.v_st_rsel_o)
-        #comb += self.g_int_ld_pend_o.eq(fumemdeps.v_ld_rsel_o)
-
-        #comb += fumemdeps.st_pend_i.eq(fumemdeps.v_st_rsel_o)
-        #comb += fumemdeps.ld_pend_i.eq(fumemdeps.v_ld_rsel_o)
-
-        #comb += ldstdeps.st_pend_i.eq(fumemdeps.st_pend_o)
-        #comb += ldstdeps.ld_pend_i.eq(fumemdeps.ld_pend_o)
-        #self.ld_pend_o = fumemdeps.ld_pend_o # also output for use in WaWGrid
-
-        comb += ldstdeps.ld_pend_i.eq(self.ld_i)
-        comb += ldstdeps.st_pend_i.eq(self.st_i)
-        comb += ldstdeps.issue_i.eq(self.fn_issue_i)
-        comb += ldstdeps.load_hit_i.eq(self.load_hit_i)
-        comb += ldstdeps.stwd_hit_i.eq(self.stwd_hit_i)
-        comb += ldstdeps.go_die_i.eq(self.go_die_i)
-        comb += self.storable_o.eq(fumemdeps.storable_o)
-        comb += self.loadable_o.eq(fumemdeps.loadable_o)
-        comb += fumemdeps.ld_pend_i.eq(ldstdeps.ld_hold_st_o)
-        comb += fumemdeps.st_pend_i.eq(ldstdeps.st_hold_ld_o)
-
-        # Connect function issue / arrays, and dest/src1/src2
-
-        comb += fumemdeps.go_st_i.eq(self.stwd_hit_i)
-        comb += fumemdeps.go_ld_i.eq(self.load_hit_i)
-        comb += fumemdeps.go_die_i.eq(self.go_die_i)
-        comb += fumemdeps.issue_i.eq(self.fn_issue_i)
-
-        #comb += self.ld_rsel_o.eq(fumemdeps.ld_rsel_o)
-        #comb += self.st_rsel_o.eq(fumemdeps.st_rsel_o)
-
-        return m
-
-    def __iter__(self):
-        yield self.ld_i
-        yield self.st_i
-        #yield self.g_int_st_pend_o
-        #yield self.g_int_ld_pend_o
-        #yield self.ld_rsel_o
-        #yield self.st_rsel_o
-        yield self.req_rel_i
-        yield self.loadable_o
-        yield self.storable_o
-        yield self.load_hit_i
-        yield self.stwd_hit_i
-        yield self.go_st_i
-        yield self.go_ld_i
-        yield self.go_die_i
-        yield self.req_rel_o
-        yield self.fn_issue_i
-
-    def ports(self):
-        return list(self)
-
-
-class Scoreboard(Elaboratable):
-    def __init__(self, rwid, n_regs):
-        """ Inputs:
-
-            * :rwid:   bit width of register file(s) - both FP and INT
-            * :n_regs: depth of register file(s) - number of FP and INT regs
-        """
-        self.rwid = rwid
-        self.n_regs = n_regs
-
-        # Register Files
-        self.intregs = RegFileArray(rwid, n_regs)
-        self.fpregs = RegFileArray(rwid, n_regs)
-
-        # issue q needs to get at these
-        self.aluissue = IssueUnitGroup(4)
-        self.brissue = IssueUnitGroup(1)
-        # and these
-        self.alu_oper_i = Signal(4, reset_less=True)
-        self.alu_imm_i = Signal(rwid, reset_less=True)
-        self.br_oper_i = Signal(4, reset_less=True)
-        self.br_imm_i = Signal(rwid, reset_less=True)
-
-        # inputs
-        self.int_dest_i = Signal(max=n_regs, reset_less=True) # Dest R# in
-        self.int_src1_i = Signal(max=n_regs, reset_less=True) # oper1 R# in
-        self.int_src2_i = Signal(max=n_regs, reset_less=True) # oper2 R# in
-        self.reg_enable_i = Signal(reset_less=True) # enable reg decode
-
-        # outputs
-        self.issue_o = Signal(reset_less=True) # instruction was accepted
-        self.busy_o = Signal(reset_less=True) # at least one CU is busy
-
-        # for branch speculation experiment.  branch_direction = 0 if
-        # the branch hasn't been met yet.  1 indicates "success", 2 is "fail"
-        # branch_succ and branch_fail are requests to have the current
-        # instruction be dependent on the branch unit "shadow" capability.
-        self.branch_succ_i = Signal(reset_less=True)
-        self.branch_fail_i = Signal(reset_less=True)
-        self.branch_direction_o = Signal(2, reset_less=True)
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        sync = m.d.sync
-
-        m.submodules.intregs = self.intregs
-        m.submodules.fpregs = self.fpregs
-
-        # register ports
-        int_dest = self.intregs.write_port("dest")
-        int_src1 = self.intregs.read_port("src1")
-        int_src2 = self.intregs.read_port("src2")
-
-        fp_dest = self.fpregs.write_port("dest")
-        fp_src1 = self.fpregs.read_port("src1")
-        fp_src2 = self.fpregs.read_port("src2")
-
-        # Int ALUs and Comp Units
-        n_int_alus = 5
-        cua = CompUnitALUs(self.rwid, 3)
-        cub = CompUnitBR(self.rwid, 3)
-        m.submodules.cu = cu = CompUnitsBase(self.rwid, [cua, cub])
-        bgt = cub.bgt # get at the branch computation unit
-        br1 = cub.br1
-
-        # Int FUs
-        m.submodules.intfus = intfus = FunctionUnits(self.n_regs, n_int_alus)
-
-        # Count of number of FUs
-        n_intfus = n_int_alus
-        n_fp_fus = 0 # for now
-
-        # Integer Priority Picker 1: Adder + Subtractor
-        intpick1 = GroupPicker(n_intfus) # picks between add, sub, mul and shf
-        m.submodules.intpick1 = intpick1
-
-        # INT/FP Issue Unit
-        regdecode = RegDecode(self.n_regs)
-        m.submodules.regdecode = regdecode
-        issueunit = IssueUnitArray([self.aluissue, self.brissue])
-        m.submodules.issueunit = issueunit
-
-        # Shadow Matrix.  currently n_intfus shadows, to be used for
-        # write-after-write hazards.  NOTE: there is one extra for branches,
-        # so the shadow width is increased by 1
-        m.submodules.shadows = shadows = ShadowMatrix(n_intfus, n_intfus, True)
-        m.submodules.bshadow = bshadow = ShadowMatrix(n_intfus, 1, False)
-
-        # record previous instruction to cast shadow on current instruction
-        prev_shadow = Signal(n_intfus)
-
-        # Branch Speculation recorder.  tracks the success/fail state as
-        # each instruction is issued, so that when the branch occurs the
-        # allow/cancel can be issued as appropriate.
-        m.submodules.specrec = bspec = BranchSpeculationRecord(n_intfus)
-
-        #---------
-        # ok start wiring things together...
-        # "now hear de word of de looord... dem bones dem bones dem dryy bones"
-        # https://www.youtube.com/watch?v=pYb8Wm6-QfA
-        #---------
-
-        #---------
-        # Issue Unit is where it starts.  set up some in/outs for this module
-        #---------
-        comb += [    regdecode.dest_i.eq(self.int_dest_i),
-                     regdecode.src1_i.eq(self.int_src1_i),
-                     regdecode.src2_i.eq(self.int_src2_i),
-                     regdecode.enable_i.eq(self.reg_enable_i),
-                     self.issue_o.eq(issueunit.issue_o)
-                    ]
-
-        # take these to outside (issue needs them)
-        comb += cua.oper_i.eq(self.alu_oper_i)
-        comb += cua.imm_i.eq(self.alu_imm_i)
-        comb += cub.oper_i.eq(self.br_oper_i)
-        comb += cub.imm_i.eq(self.br_imm_i)
-
-        # TODO: issueunit.f (FP)
-
-        # and int function issue / busy arrays, and dest/src1/src2
-        comb += intfus.dest_i.eq(regdecode.dest_o)
-        comb += intfus.src1_i.eq(regdecode.src1_o)
-        comb += intfus.src2_i.eq(regdecode.src2_o)
-
-        fn_issue_o = issueunit.fn_issue_o
-
-        comb += intfus.fn_issue_i.eq(fn_issue_o)
-        comb += issueunit.busy_i.eq(cu.busy_o)
-        comb += self.busy_o.eq(cu.busy_o.bool())
-
-        #---------
-        # merge shadow matrices outputs
-        #---------
-
-        # these are explained in ShadowMatrix docstring, and are to be
-        # connected to the FUReg and FUFU Matrices, to get them to reset
-        anydie = Signal(n_intfus, reset_less=True)
-        allshadown = Signal(n_intfus, reset_less=True)
-        shreset = Signal(n_intfus, reset_less=True)
-        comb += allshadown.eq(shadows.shadown_o & bshadow.shadown_o)
-        comb += anydie.eq(shadows.go_die_o | bshadow.go_die_o)
-        comb += shreset.eq(bspec.match_g_o | bspec.match_f_o)
-
-        #---------
-        # connect fu-fu matrix
-        #---------
-
-        # Group Picker... done manually for now.
-        go_rd_o = intpick1.go_rd_o
-        go_wr_o = intpick1.go_wr_o
-        go_rd_i = intfus.go_rd_i
-        go_wr_i = intfus.go_wr_i
-        go_die_i = intfus.go_die_i
-        # NOTE: connect to the shadowed versions so that they can "die" (reset)
-        comb += go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus]) # rd
-        comb += go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus]) # wr
-        comb += go_die_i[0:n_intfus].eq(anydie[0:n_intfus]) # die
-
-        # Connect Picker
-        #---------
-        comb += intpick1.rd_rel_i[0:n_intfus].eq(cu.rd_rel_o[0:n_intfus])
-        comb += intpick1.req_rel_i[0:n_intfus].eq(cu.req_rel_o[0:n_intfus])
-        int_rd_o = intfus.readable_o
-        int_wr_o = intfus.writable_o
-        comb += intpick1.readable_i[0:n_intfus].eq(int_rd_o[0:n_intfus])
-        comb += intpick1.writable_i[0:n_intfus].eq(int_wr_o[0:n_intfus])
-
-        #---------
-        # Shadow Matrix
-        #---------
-
-        comb += shadows.issue_i.eq(fn_issue_o)
-        #comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
-        comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
-        #---------
-        # NOTE; this setup is for the instruction order preservation...
-
-        # connect shadows / go_dies to Computation Units
-        comb += cu.shadown_i[0:n_intfus].eq(allshadown)
-        comb += cu.go_die_i[0:n_intfus].eq(anydie)
-
-        # ok connect first n_int_fu shadows to busy lines, to create an
-        # instruction-order linked-list-like arrangement, using a bit-matrix
-        # (instead of e.g. a ring buffer).
-        # XXX TODO
-
-        # when written, the shadow can be cancelled (and was good)
-        for i in range(n_intfus):
-            comb += shadows.s_good_i[i][0:n_intfus].eq(go_wr_o[0:n_intfus])
-
-        # *previous* instruction shadows *current* instruction, and, obviously,
-        # if the previous is completed (!busy) don't cast the shadow!
-        comb += prev_shadow.eq(~fn_issue_o & cu.busy_o)
-        for i in range(n_intfus):
-            comb += shadows.shadow_i[i][0:n_intfus].eq(prev_shadow)
-
-        #---------
-        # ... and this is for branch speculation.  it uses the extra bit
-        # tacked onto the ShadowMatrix (hence shadow_wid=n_intfus+1)
-        # only needs to set shadow_i, s_fail_i and s_good_i
-
-        # issue captures shadow_i (if enabled)
-        comb += bshadow.reset_i[0:n_intfus].eq(shreset[0:n_intfus])
-
-        bactive = Signal(reset_less=True)
-        comb += bactive.eq((bspec.active_i | br1.issue_i) & ~br1.go_wr_i)
-
-        # instruction being issued (fn_issue_o) has a shadow cast by the branch
-        with m.If(bactive & (self.branch_succ_i | self.branch_fail_i)):
-            comb += bshadow.issue_i.eq(fn_issue_o)
-            for i in range(n_intfus):
-                with m.If(fn_issue_o & (Const(1<<i))):
-                    comb += bshadow.shadow_i[i][0].eq(1)
-
-        # finally, we need an indicator to the test infrastructure as to
-        # whether the branch succeeded or failed, plus, link up to the
-        # "recorder" of whether the instruction was under shadow or not
-
-        with m.If(br1.issue_i):
-            sync += bspec.active_i.eq(1)
-        with m.If(self.branch_succ_i):
-            comb += bspec.good_i.eq(fn_issue_o & 0x1f)
-        with m.If(self.branch_fail_i):
-            comb += bspec.fail_i.eq(fn_issue_o & 0x1f)
-
-        # branch is active (TODO: a better signal: this is over-using the
-        # go_write signal - actually the branch should not be "writing")
-        with m.If(br1.go_wr_i):
-            sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
-            sync += bspec.active_i.eq(0)
-            comb += bspec.br_i.eq(1)
-            # branch occurs if data == 1, failed if data == 0
-            comb += bspec.br_ok_i.eq(br1.data_o == 1)
-            for i in range(n_intfus):
-                # *expected* direction of the branch matched against *actual*
-                comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
-                # ... or it didn't
-                comb += bshadow.s_fail_i[i][0].eq(bspec.match_f_o[i])
-
-        #---------
-        # Connect Register File(s)
-        #---------
-        comb += int_dest.wen.eq(intfus.dest_rsel_o)
-        comb += int_src1.ren.eq(intfus.src1_rsel_o)
-        comb += int_src2.ren.eq(intfus.src2_rsel_o)
-
-        # connect ALUs to regfule
-        comb += int_dest.data_i.eq(cu.data_o)
-        comb += cu.src1_i.eq(int_src1.data_o)
-        comb += cu.src2_i.eq(int_src2.data_o)
-
-        # connect ALU Computation Units
-        comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
-        comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus])
-        comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
-
-        return m
-
-    def __iter__(self):
-        yield from self.intregs
-        yield from self.fpregs
-        yield self.int_dest_i
-        yield self.int_src1_i
-        yield self.int_src2_i
-        yield self.issue_o
-        yield self.branch_succ_i
-        yield self.branch_fail_i
-        yield self.branch_direction_o
-
-    def ports(self):
-        return list(self)
-
-
-
-
-def int_instr(dut, op, imm, src1, src2, dest, branch_success, branch_fail):
-    yield from disable_issue(dut)
-    yield dut.int_dest_i.eq(dest)
-    yield dut.int_src1_i.eq(src1)
-    yield dut.int_src2_i.eq(src2)
-    if (op & (0x3<<2)) != 0: # branch
-        yield dut.brissue.insn_i.eq(1)
-        yield dut.br_oper_i.eq(Const(op & 0x3, 2))
-        yield dut.br_imm_i.eq(imm)
-        dut_issue = dut.brissue
-    else:
-        yield dut.aluissue.insn_i.eq(1)
-        yield dut.alu_oper_i.eq(Const(op & 0x3, 2))
-        yield dut.alu_imm_i.eq(imm)
-        dut_issue = dut.aluissue
-    yield dut.reg_enable_i.eq(1)
-
-    # these indicate that the instruction is to be made shadow-dependent on
-    # (either) branch success or branch fail
-    yield dut.branch_fail_i.eq(branch_fail)
-    yield dut.branch_succ_i.eq(branch_success)
-
-    yield
-    yield from wait_for_issue(dut, dut_issue)
-
-
-def print_reg(dut, rnums):
-    rs = []
-    for rnum in rnums:
-        reg = yield dut.intregs.regs[rnum].reg
-        rs.append("%x" % reg)
-    rnums = map(str, rnums)
-    print ("reg %s: %s" % (','.join(rnums), ','.join(rs)))
-
-
-def create_random_ops(dut, n_ops, shadowing=False, max_opnums=3):
-    insts = []
-    for i in range(n_ops):
-        src1 = randint(1, dut.n_regs-1)
-        src2 = randint(1, dut.n_regs-1)
-        imm = randint(1, (1<<dut.rwid)-1)
-        dest = randint(1, dut.n_regs-1)
-        op = randint(0, max_opnums)
-        opi = 0 if randint(0, 2) else 1 # set true if random is nonzero
-
-        if shadowing:
-            insts.append((src1, src2, dest, op, opi, imm, (0, 0)))
-        else:
-            insts.append((src1, src2, dest, op, opi, imm))
-    return insts
-
-
-
-def scoreboard_sim(dut, alusim):
-
-    seed(0)
-
-    for i in range(50):
-
-        # set random values in the registers
-        for i in range(1, dut.n_regs):
-            val = randint(0, (1<<alusim.rwidth)-1)
-            #val = 31+i*3
-            #val = i
-            yield dut.intregs.regs[i].reg.eq(val)
-            alusim.setval(i, val)
-
-        # create some instructions (some random, some regression tests)
-        instrs = []
-        if True:
-            instrs = create_random_ops(dut, 15, True, 4)
-
-        if False:
-            instrs.append( (1, 2, 2, 1, 1, 20, (0, 0)) )
-
-        if False:
-            instrs.append( (7, 3, 2, 4, (0, 0)) )
-            instrs.append( (7, 6, 6, 2, (0, 0)) )
-            instrs.append( (1, 7, 2, 2, (0, 0)) )
-
-        if False:
-            instrs.append((2, 3, 3, 0, 0, 0, (0, 0)))
-            instrs.append((5, 3, 3, 1, 0, 0, (0, 0)))
-            instrs.append((3, 5, 5, 2, 0, 0, (0, 0)))
-            instrs.append((5, 3, 3, 3, 0, 0, (0, 0)))
-            instrs.append((3, 5, 5, 0, 0, 0, (0, 0)))
-
-        if False:
-            instrs.append( (3, 3, 4, 0, 0, 13979, (0, 0)))
-            instrs.append( (6, 4, 1, 2, 0, 40976, (0, 0)))
-            instrs.append( (1, 4, 7, 4, 1, 23652, (0, 0)))
-
-        if False:
-            instrs.append((5, 6, 2, 1))
-            instrs.append((2, 2, 4, 0))
-            #instrs.append((2, 2, 3, 1))
-
-        if False:
-            instrs.append((2, 1, 2, 3))
-
-        if False:
-            instrs.append((2, 6, 2, 1))
-            instrs.append((2, 1, 2, 0))
-
-        if False:
-            instrs.append((1, 2, 7, 2))
-            instrs.append((7, 1, 5, 0))
-            instrs.append((4, 4, 1, 1))
-
-        if False:
-            instrs.append((5, 6, 2, 2))
-            instrs.append((1, 1, 4, 1))
-            instrs.append((6, 5, 3, 0))
-
-        if False:
-            # Write-after-Write Hazard
-            instrs.append( (3, 6, 7, 2) )
-            instrs.append( (4, 4, 7, 1) )
-
-        if False:
-            # self-read/write-after-write followed by Read-after-Write
-            instrs.append((1, 1, 1, 1))
-            instrs.append((1, 5, 3, 0))
-
-        if False:
-            # Read-after-Write followed by self-read-after-write
-            instrs.append((5, 6, 1, 2))
-            instrs.append((1, 1, 1, 1))
-
-        if False:
-            # self-read-write sandwich
-            instrs.append((5, 6, 1, 2))
-            instrs.append((1, 1, 1, 1))
-            instrs.append((1, 5, 3, 0))
-
-        if False:
-            # very weird failure
-            instrs.append( (5, 2, 5, 2) )
-            instrs.append( (2, 6, 3, 0) )
-            instrs.append( (4, 2, 2, 1) )
-
-        if False:
-            v1 = 4
-            yield dut.intregs.regs[5].reg.eq(v1)
-            alusim.setval(5, v1)
-            yield dut.intregs.regs[3].reg.eq(5)
-            alusim.setval(3, 5)
-            instrs.append((5, 3, 3, 4, (0, 0)))
-            instrs.append((4, 2, 1, 2, (0, 1)))
-
-        if False:
-            v1 = 6
-            yield dut.intregs.regs[5].reg.eq(v1)
-            alusim.setval(5, v1)
-            yield dut.intregs.regs[3].reg.eq(5)
-            alusim.setval(3, 5)
-            instrs.append((5, 3, 3, 4, (0, 0)))
-            instrs.append((4, 2, 1, 2, (1, 0)))
-
-        if False:
-            instrs.append( (4, 3, 5, 1, 0, (0, 0)) )
-            instrs.append( (5, 2, 3, 1, 0, (0, 0)) )
-            instrs.append( (7, 1, 5, 2, 0, (0, 0)) )
-            instrs.append( (5, 6, 6, 4, 0, (0, 0)) )
-            instrs.append( (7, 5, 2, 2, 0, (1, 0)) )
-            instrs.append( (1, 7, 5, 0, 0, (0, 1)) )
-            instrs.append( (1, 6, 1, 2, 0, (1, 0)) )
-            instrs.append( (1, 6, 7, 3, 0, (0, 0)) )
-            instrs.append( (6, 7, 7, 0, 0, (0, 0)) )
-
-        # issue instruction(s), wait for issue to be free before proceeding
-        for i, instr in enumerate(instrs):
-            src1, src2, dest, op, opi, imm, (br_ok, br_fail) = instr
-
-            print ("instr %d: (%d, %d, %d, %d, %d, %d)" % \
-                    (i, src1, src2, dest, op, opi, imm))
-            alusim.op(op, opi, imm, src1, src2, dest)
-            yield from instr_q(dut, op, opi, imm, src1, src2, dest,
-                               br_ok, br_fail)
-
-        # wait for all instructions to stop before checking
-        while True:
-            iqlen = yield dut.qlen_o
-            if iqlen == 0:
-                break
-            yield
-        yield
-        yield
-        yield
-        yield
-        yield from wait_for_busy_clear(dut)
-
-        # check status
-        yield from alusim.check(dut)
-        yield from alusim.dump(dut)
-
-
-def test_scoreboard():
-    dut = IssueToScoreboard(2, 1, 1, 16, 8, 8)
-    alusim = RegSim(16, 8)
-    memsim = MemSim(16, 16)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_scoreboard6600.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, scoreboard_sim(dut, alusim),
-                        vcd_name='test_scoreboard6600.vcd')
-
-    #run_simulation(dut, scoreboard_branch_sim(dut, alusim),
-    #                    vcd_name='test_scoreboard6600.vcd')
-
-
-def mem_sim(dut):
-    yield dut.ld_i.eq(0x1)
-    yield dut.fn_issue_i.eq(0x1)
-    yield
-    #yield dut.ld_i.eq(0x0)
-    yield dut.st_i.eq(0x2)
-    yield dut.fn_issue_i.eq(0x2)
-    yield
-    #yield dut.st_i.eq(0x0)
-    yield dut.fn_issue_i.eq(0x0)
-    yield
-
-    yield dut.load_hit_i.eq(0x1)
-    yield
-    yield dut.load_hit_i.eq(0x0)
-    yield
-    yield dut.stwd_hit_i.eq(0x2)
-    yield
-    yield dut.stwd_hit_i.eq(0x0)
-    yield
-
-
-def test_mem_fus():
-    dut = MemFunctionUnits(3)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_mem_fus.il", "w") as f:
-        f.write(vl)
-
-    run_simulation(dut, mem_sim(dut),
-                        vcd_name='test_mem_fus.vcd')
-
-
-if __name__ == '__main__':
-    test_mem_fus()
diff --git a/src/soc/TLB/.gitignore b/src/soc/TLB/.gitignore
new file mode 100644
index 00000000..3324664b
--- /dev/null
+++ b/src/soc/TLB/.gitignore
@@ -0,0 +1,2 @@
+*.wpr
+__pycache__
diff --git a/src/soc/TLB/AddressEncoder.py b/src/soc/TLB/AddressEncoder.py
new file mode 100644
index 00000000..128f2c97
--- /dev/null
+++ b/src/soc/TLB/AddressEncoder.py
@@ -0,0 +1,75 @@
+from nmigen import Module, Signal, Elaboratable
+from nmigen.lib.coding import Encoder, PriorityEncoder
+
+class AddressEncoder(Elaboratable):
+    """Address Encoder
+
+       The purpose of this module is to take in a vector and
+       encode the bits that are one hot into an address. This module
+       combines both nmigen's Encoder and PriorityEncoder and will state
+       whether the input line has a single bit hot, multiple bits hot,
+       or no bits hot. The output line will always have the lowest value
+       address output.
+
+       Usage:
+       The output is valid when either single or multiple match is high.
+       Otherwise output is 0.
+    """
+    def __init__(self, width):
+        """ Arguments:
+            * width: The desired length of the input vector
+        """
+        # Internal
+        self.encoder = Encoder(width)
+        self.p_encoder = PriorityEncoder(width)
+
+        # Input
+        self.i = Signal(width)
+
+        # Output
+        self.single_match = Signal(1)
+        self.multiple_match = Signal(1)
+        self.o = Signal(max=width)
+
+    def elaborate(self, platform=None):
+        m = Module()
+
+        # Add internal submodules
+        m.submodules.encoder = self.encoder
+        m.submodules.p_encoder = self.p_encoder
+
+        m.d.comb += [
+            self.encoder.i.eq(self.i),
+            self.p_encoder.i.eq(self.i)
+        ]
+
+        # Steps:
+        # 1. check if the input vector is non-zero
+        # 2. if non-zero, check if single match or multiple match
+        # 3. set output line to be lowest value address output
+
+        # If the priority encoder receives an input of 0
+        # If n is 1 then the output is not valid
+        with m.If(self.p_encoder.n):
+            m.d.comb += [
+                self.single_match.eq(0),
+                self.multiple_match.eq(0),
+                self.o.eq(0)
+            ]
+        # If the priority encoder receives an input > 0
+        with m.Else():
+            # Multiple Match if encoder n is invalid
+            with m.If(self.encoder.n):
+                m.d.comb += [
+                    self.single_match.eq(0),
+                    self.multiple_match.eq(1)
+                ]
+            # Single Match if encoder n is valid
+            with m.Else():
+                m.d.comb += [
+                    self.single_match.eq(1),
+                    self.multiple_match.eq(0)
+                ]
+            # Always set output based on priority encoder output
+            m.d.comb += self.o.eq(self.p_encoder.o)
+        return m
diff --git a/src/soc/TLB/Cam.py b/src/soc/TLB/Cam.py
new file mode 100644
index 00000000..e7d901ff
--- /dev/null
+++ b/src/soc/TLB/Cam.py
@@ -0,0 +1,125 @@
+from nmigen import Array, Cat, Module, Signal, Elaboratable
+from nmigen.lib.coding import Decoder
+from nmigen.cli import main #, verilog
+
+from .CamEntry import CamEntry
+from .AddressEncoder import AddressEncoder
+
+
+class Cam(Elaboratable):
+    """ Content Addressable Memory (CAM)
+
+        The purpose of this module is to quickly look up whether an
+        entry exists given a data key.
+        This module will search for the given data in all internal entries
+        and output whether a single or multiple match was found.
+        If a single entry is found the address is returned and single_match
+        is set HIGH. If multiple entries are found the lowest address is
+        returned and multiple_match is set HIGH. If neither single_match nor
+        multiple_match is HIGH this implies no match was found. To write
+        to the CAM set the address bus to the desired entry and set write_enable
+        HIGH. Entry management should be performed one level above this block
+        as lookup is performed within.
+
+        Notes:
+        The read and write operations take one clock cycle to complete.
+        Currently the read_warning line is present for interfacing but
+        is not necessary for this design. This module is capable of writing
+        in the first cycle, reading on the second, and output the correct
+        address on the third.
+    """
+
+    def __init__(self, data_size, cam_size):
+        """ Arguments:
+            * data_size: (bits) The bit size of the data
+            * cam_size: (number) The number of entries in the CAM
+        """
+
+        # Internal
+        self.cam_size = cam_size
+        self.encoder = AddressEncoder(cam_size)
+        self.decoder = Decoder(cam_size)
+        self.entry_array = Array(CamEntry(data_size) for x in range(cam_size))
+
+        # Input
+        self.enable = Signal(1)
+        self.write_enable = Signal(1)
+        self.data_in = Signal(data_size) # The data to be written
+        self.data_mask = Signal(data_size) # mask for ternary writes
+        self.address_in = Signal(max=cam_size) # address of CAM Entry to write
+
+        # Output
+        self.read_warning = Signal(1) # High when a read interrupts a write
+        self.single_match = Signal(1) # High when there is only one match
+        self.multiple_match = Signal(1) # High when there are at least two matches
+        self.match_address = Signal(max=cam_size) # The lowest address matched
+
+    def elaborate(self, platform=None):
+        m = Module()
+        # AddressEncoder for match types and output address
+        m.submodules.AddressEncoder = self.encoder
+        # Decoder is used to select which entry will be written to
+        m.submodules.Decoder = self.decoder
+        # CamEntry Array Submodules
+        # Note these are added anonymously
+        entry_array = self.entry_array
+        m.submodules += entry_array
+
+        # Decoder logic
+        m.d.comb += [
+            self.decoder.i.eq(self.address_in),
+            self.decoder.n.eq(0)
+        ]
+
+        encoder_vector = []
+        with m.If(self.enable):
+            # Set the key value for every CamEntry
+            for index in range(self.cam_size):
+
+                # Write Operation
+                with m.If(self.write_enable):
+                    with m.If(self.decoder.o[index]):
+                        m.d.comb += entry_array[index].command.eq(2)
+                    with m.Else():
+                        m.d.comb += entry_array[index].command.eq(0)
+
+                # Read Operation
+                with m.Else():
+                    m.d.comb += entry_array[index].command.eq(1)
+
+                # Send data input to all entries
+                m.d.comb += entry_array[index].data_in.eq(self.data_in)
+                # Send all entry matches to encoder
+                ematch = entry_array[index].match
+                encoder_vector.append(ematch)
+
+            # Give input to and accept output from encoder module
+            m.d.comb += [
+                self.encoder.i.eq(Cat(*encoder_vector)),
+                self.single_match.eq(self.encoder.single_match),
+                self.multiple_match.eq(self.encoder.multiple_match),
+                self.match_address.eq(self.encoder.o)
+            ]
+
+        # If the CAM is not enabled set all outputs to 0
+        with m.Else():
+            m.d.comb += [
+                    self.read_warning.eq(0),
+                    self.single_match.eq(0),
+                    self.multiple_match.eq(0),
+                    self.match_address.eq(0)
+            ]
+
+        return m
+
+    def ports(self):
+        return [self.enable, self.write_enable,
+                     self.data_in, self.data_mask,
+                     self.read_warning, self.single_match,
+                     self.multiple_match, self.match_address]
+
+
+if __name__ == '__main__':
+    cam = Cam(4, 4)
+    main(cam, ports=cam.ports())
+
diff --git a/src/soc/TLB/CamEntry.py b/src/soc/TLB/CamEntry.py
new file mode 100644
index 00000000..b1d93082
--- /dev/null
+++ b/src/soc/TLB/CamEntry.py
@@ -0,0 +1,46 @@
+from nmigen import Module, Signal, Elaboratable
+
+
+class CamEntry(Elaboratable):
+    """ Content Addressable Memory (CAM) Entry
+
+        The purpose of this module is to represent an entry within a CAM.
+        This module when given a read command will compare the given data
+        and output whether a match was found or not. When given a write
+        command it will write the given data into internal registers.
+    """
+
+    def __init__(self, data_size):
+        """ Arguments:
+            * data_size: (bit count) The size of the data
+        """
+        # Input
+        self.command = Signal(2) # 00 => NA 01 => Read 10 => Write 11 => Reset
+        self.data_in = Signal(data_size) # Data input when writing
+
+        # Output
+        self.match = Signal(1) # Result of the internal/input key comparison
+        self.data = Signal(data_size)
+
+    def elaborate(self, platform=None):
+        m = Module()
+        with m.Switch(self.command):
+            with m.Case("00"):
+                m.d.sync += self.match.eq(0)
+            with m.Case("01"):
+                with m.If(self.data == self.data_in):
+                    m.d.sync += self.match.eq(1)
+                with m.Else():
+                    m.d.sync += self.match.eq(0)
+            with m.Case("10"):
+                m.d.sync += [
+                    self.data.eq(self.data_in),
+                    self.match.eq(0)
+                ]
+            with m.Case():
+                m.d.sync += [
+                    self.match.eq(0),
+                    self.data.eq(0)
+                ]
+
+        return m
diff --git a/src/soc/TLB/LFSR.py b/src/soc/TLB/LFSR.py
new file mode 100644
index 00000000..d8b606ec
--- /dev/null
+++ b/src/soc/TLB/LFSR.py
@@ -0,0 +1,109 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# See Notices.txt for copyright information
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+from nmigen.cli import verilog, rtlil
+
+
+class LFSRPolynomial(set):
+    """ implements a polynomial for use in LFSR
+    """
+    def __init__(self, exponents=()):
+        for e in exponents:
+            assert isinstance(e, int), TypeError("%s must be an int" % repr(e))
+            assert (e >= 0), ValueError("%d must not be negative" % e)
+        set.__init__(self, set(exponents).union({0})) # must contain zero
+
+    @property
+    def max_exponent(self):
+        return max(self) # derived from set, so this returns the max exponent
+
+    @property
+    def exponents(self):
+        exponents = list(self) # get elements of set as a list
+        exponents.sort(reverse=True)
+        return exponents
+
+    def __str__(self):
+        expd = {0: "1", 1: 'x', 2: "x^{}"} # case 2 isn't 2, it's min(i,2)
+        retval = map(lambda i: expd[min(i,2)].format(i), self.exponents)
+        return " + ".join(retval)
+
+    def __repr__(self):
+        return "LFSRPolynomial(%s)" % self.exponents
+
+
+# list of selected polynomials from https://web.archive.org/web/20190418121923/https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Some_polynomials_for_maximal_LFSRs  # noqa
+LFSR_POLY_2 = LFSRPolynomial([2, 1, 0])
+LFSR_POLY_3 = LFSRPolynomial([3, 2, 0])
+LFSR_POLY_4 = LFSRPolynomial([4, 3, 0])
+LFSR_POLY_5 = LFSRPolynomial([5, 3, 0])
+LFSR_POLY_6 = LFSRPolynomial([6, 5, 0])
+LFSR_POLY_7 = LFSRPolynomial([7, 6, 0])
+LFSR_POLY_8 = LFSRPolynomial([8, 6, 5, 4, 0])
+LFSR_POLY_9 = LFSRPolynomial([9, 5, 0])
+LFSR_POLY_10 = LFSRPolynomial([10, 7, 0])
+LFSR_POLY_11 = LFSRPolynomial([11, 9, 0])
+LFSR_POLY_12 = LFSRPolynomial([12, 11, 10, 4, 0])
+LFSR_POLY_13 = LFSRPolynomial([13, 12, 11, 8, 0])
+LFSR_POLY_14 = LFSRPolynomial([14, 13, 12, 2, 0])
+LFSR_POLY_15 = LFSRPolynomial([15, 14, 0])
+LFSR_POLY_16 = LFSRPolynomial([16, 15, 13, 4, 0])
+LFSR_POLY_17 = LFSRPolynomial([17, 14, 0])
+LFSR_POLY_18 = LFSRPolynomial([18, 11, 0])
+LFSR_POLY_19 = LFSRPolynomial([19, 18, 17, 14, 0])
+LFSR_POLY_20 = LFSRPolynomial([20, 17, 0])
+LFSR_POLY_21 = LFSRPolynomial([21, 19, 0])
+LFSR_POLY_22 = LFSRPolynomial([22, 21, 0])
+LFSR_POLY_23 = LFSRPolynomial([23, 18, 0])
+LFSR_POLY_24 = LFSRPolynomial([24, 23, 22, 17, 0])
+
+
+class LFSR(LFSRPolynomial, Elaboratable):
+    """ implements a Linear Feedback Shift Register
+    """
+    def __init__(self, polynomial):
+        """ Inputs:
+            ------
+            :polynomial: the polynomial to feedback on.  may be a LFSRPolynomial
+                         instance or an iterable of ints (list/tuple/generator)
+            :enable:     enable (set LO to disable.  NOTE: defaults to HI)
+
+            Outputs:
+            -------
+            :state: the LFSR state.  bitwidth is taken from the polynomial
+                    maximum exponent.
+
+            Note: if an LFSRPolynomial is passed in as the input, because
+            LFSRPolynomial is derived from set() it's ok:
+            LFSRPolynomial(LFSRPolynomial(p)) == LFSRPolynomial(p)
+        """
+        LFSRPolynomial.__init__(self, polynomial)
+        self.state = Signal(self.max_exponent, reset=1)
+        self.enable = Signal(reset=1)
+
+    def elaborate(self, platform):
+        m = Module()
+        # do nothing for degree <= 1 (exponent zero is always present)
+        if self.max_exponent <= 1:
+            return m
+
+        # create XOR-bunch, select bits from state based on exponent
+        feedback = Const(0) # doesn't do any harm starting from 0b0 (xor chain)
+        for exponent in self:
+            if exponent > 0: # don't have to skip, saves CPU cycles though
+                feedback ^= self.state[exponent - 1]
+
+        # if enabled, shift-and-feedback
+        with m.If(self.enable):
+            # shift up lower bits by Cat'ing in a new bit zero (feedback)
+            newstate = Cat(feedback, self.state[:-1])
+            m.d.sync += self.state.eq(newstate)
+
+        return m
+
+
+# example: Poly24
+if __name__ == '__main__':
+    p24 = rtlil.convert(LFSR(LFSR_POLY_24))
+    with open("lfsr2_p24.il", "w") as f:
+        f.write(p24)
diff --git a/src/soc/TLB/LFSR.pyi b/src/soc/TLB/LFSR.pyi
new file mode 100644
index 00000000..64eb9115
--- /dev/null
+++ b/src/soc/TLB/LFSR.pyi
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# See Notices.txt for copyright information
+from nmigen import Module
+from typing import Iterable, Optional, Iterator, Any, Union
+from typing_extensions import final
+
+
+@final
+class LFSRPolynomial(set):
+    def __init__(self, exponents: Iterable[int] = ()):
+        def elements() -> Iterable[int]: ...
+    @property
+    def exponents(self) -> list[int]: ...
+    def __str__(self) -> str: ...
+    def __repr__(self) -> str: ...
+
+
+@final
+class LFSR:
+    def __init__(self, polynomial: Union[Iterable[int], LFSRPolynomial]): ...
+    @property
+    def width(self) -> int: ...
+    def elaborate(self, platform: Any) -> Module: ...
diff --git a/src/soc/TLB/Makefile b/src/soc/TLB/Makefile
new file mode 100644
index 00000000..1eb67acc
--- /dev/null
+++ b/src/soc/TLB/Makefile
@@ -0,0 +1,2 @@
+verilog:
+	python3 Cam.py generate -t v > Cam.v
diff --git a/src/soc/TLB/MemorySet.py b/src/soc/TLB/MemorySet.py
new file mode 100644
index 00000000..ea61bdf5
--- /dev/null
+++ b/src/soc/TLB/MemorySet.py
@@ -0,0 +1,66 @@
+from nmigen import Cat, Memory, Module, Signal, Elaboratable
+from nmigen.cli import main
+from nmigen.cli import verilog, rtlil
+
+
+class MemorySet(Elaboratable):
+    def __init__(self, data_size, tag_size, set_count, active):
+        self.active = active
+        input_size = tag_size + data_size # Size of the input data
+        memory_width = input_size + 1 # The width of the cache memory
+        self.active = active
+        self.data_size = data_size
+        self.tag_size = tag_size
+
+        # XXX TODO, use rd-enable and wr-enable?
+        self.mem = Memory(memory_width, set_count)
+        self.r = self.mem.read_port()
+        self.w = self.mem.write_port()
+
+        # inputs (address)
+        self.cset = Signal(max=set_count)  # The set to be checked
+        self.tag = Signal(tag_size)        # The tag to find
+        self.data_i = Signal(data_size)    # Incoming data
+
+        # outputs
+        self.valid = Signal()
+        self.data_o = Signal(data_size)    # Outgoing data (excludes tag)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.mem = self.mem
+        m.submodules.r = self.r
+        m.submodules.w = self.w
+
+        # temporaries
+        active_bit = Signal()
+        tag_valid = Signal()
+        data_start = self.active + 1
+        data_end = data_start + self.data_size
+        tag_start = data_end
+        tag_end = tag_start + self.tag_size
+
+        # connect the read port address to the set/entry
+        read_port = self.r
+        m.d.comb += read_port.addr.eq(self.cset)
+        # Pull out active bit from data
+        data = read_port.data
+        m.d.comb += active_bit.eq(data[self.active])
+        # Validate given tag vs stored tag
+        tag = data[tag_start:tag_end]
+        m.d.comb += tag_valid.eq(self.tag == tag)
+        # An entry is only valid if the tags match AND
+        # is marked as a valid entry
+        m.d.comb += self.valid.eq(tag_valid & active_bit)
+
+        # output data: TODO, check rd-enable?
+        m.d.comb += self.data_o.eq(data[data_start:data_end])
+
+        # connect the write port addr to the set/entry (only if write enabled)
+        # (which is only done on a match, see SAC.write_entry below)
+        write_port = self.w
+        with m.If(write_port.en):
+            m.d.comb += write_port.addr.eq(self.cset)
+            m.d.comb += write_port.data.eq(Cat(1, self.data_i, self.tag))
+
+        return m
diff --git a/src/soc/TLB/PermissionValidator.py b/src/soc/TLB/PermissionValidator.py
new file mode 100644
index 00000000..0107c0e9
--- /dev/null
+++ b/src/soc/TLB/PermissionValidator.py
@@ -0,0 +1,68 @@
+from nmigen import Module, Signal, Elaboratable
+from nmigen.cli import main
+
+from TLB.PteEntry import PteEntry
+
+
+class PermissionValidator(Elaboratable):
+    """ The purpose of this Module is to check the Permissions of a given PTE
+        against the requested access permissions.
+
+        This module will either validate (by setting the valid bit HIGH)
+        the request or find a permission fault and invalidate (by setting
+        the valid bit LOW) the request
+    """
+
+    def __init__(self, asid_size, pte_size):
+        """ Arguments:
+            * asid_size: (bit count) The size of the asid to be processed
+            * pte_size: (bit count) The size of the pte to be processed
+
+            Return:
+            * valid HIGH when permissions are correct
+        """
+        # Internal: splits data into asid, pte and the pte control bits
+        self.pte_entry = PteEntry(asid_size, pte_size)
+
+        # Input
+        self.data = Signal(asid_size + pte_size) # ASID (MSBs) + PTE (LSBs)
+        self.xwr = Signal(3) # Execute, Write, Read
+        self.super_mode = Signal(1) # Supervisor Mode
+        self.super_access = Signal(1) # Supervisor Access
+        self.asid = Signal(asid_size) # Address Space IDentifier (ASID)
+
+        # Output
+        self.valid = Signal(1) # Denotes if the permissions are correct
+
+    def elaborate(self, platform=None):
+        m = Module()
+
+        m.submodules.pte_entry = self.pte_entry
+
+        m.d.comb += self.pte_entry.i.eq(self.data)
+
+        # Check if the entry is valid
+        with m.If(self.pte_entry.v):
+            # ASID match or Global Permission
+            # (when the g bit is set the ASID comparison is bypassed)
+            with m.If((self.pte_entry.asid == self.asid) | self.pte_entry.g):
+                # Check Execute, Write, Read (XWR) Permissions
+                with m.If(self.pte_entry.xwr == self.xwr):
+                    # Supervisor Logic
+                    with m.If(self.super_mode):
+                        # Valid if entry is not in user mode or supervisor
+                        # has Supervisor User Memory (SUM) access via the
+                        # SUM bit in the sstatus register
+                        m.d.comb += self.valid.eq((~self.pte_entry.u) \
+                                                  | self.super_access)
+                    # User logic
+                    with m.Else():
+                        # Valid if the entry is in user mode only
+                        m.d.comb += self.valid.eq(self.pte_entry.u)
+                with m.Else():
+                    m.d.comb += self.valid.eq(0)
+            with m.Else():
+                m.d.comb += self.valid.eq(0)
+        with m.Else():
+            m.d.comb += self.valid.eq(0)
+        return m
diff --git a/src/soc/TLB/PteEntry.py b/src/soc/TLB/PteEntry.py
new file mode 100644
index 00000000..73ea9220
--- /dev/null
+++ b/src/soc/TLB/PteEntry.py
@@ -0,0 +1,67 @@
+from nmigen import Module, Signal, Elaboratable
+from nmigen.cli import main
+
+
+class PteEntry(Elaboratable):
+    """ The purpose of this Module is to centralize the parsing of Page
+        Table Entries (PTE) into one module to prevent common mistakes
+        and duplication of code. The control bits are parsed out for
+        ease of use.
+
+        This module parses according to the standard PTE given by the
+        Volume II: RISC-V Privileged Architectures V1.10 Pg 60.
+        The Address Space IDentifier (ASID) is appended to the MSB of the input
+        and is parsed out as such.
+
+        A valid input Signal (for asid_size=15, pte_size=64) would be:
+              ASID   PTE
+        Bits:[78-64][63-0]
+
+        The output PTE value will include the control bits.
+    """
+    def __init__(self, asid_size, pte_size):
+        """ Arguments:
+            * asid_size: (bit count) The size of the asid to be processed
+            * pte_size: (bit count) The size of the pte to be processed
+
+            Return:
+            * d The Dirty bit from the PTE portion of i
+            * a The Accessed bit from the PTE portion of i
+            * g The Global bit from the PTE portion of i
+            * u The User Mode bit from the PTE portion of i
+            * xwr The Execute/Write/Read bits from the PTE portion of i
+            * v The Valid bit from the PTE portion of i
+            * asid The asid portion of i
+            * pte The pte portion of i
+        """
+        # Internal: bit range of the ASID within i (end is exclusive)
+        self.asid_start = pte_size
+        self.asid_end = pte_size + asid_size
+
+        # Input
+        self.i = Signal(asid_size + pte_size)
+
+        # Output
+        self.d = Signal(1) # Dirty bit (From pte)
+        self.a = Signal(1) # Accessed bit (From pte)
+        self.g = Signal(1) # Global Access (From pte)
+        self.u = Signal(1) # User Mode (From pte)
+        self.xwr = Signal(3) # Execute, Write, Read (From pte)
+        self.v = Signal(1) # Valid (From pte)
+        self.asid = Signal(asid_size) # Associated Address Space IDentifier
+        self.pte = Signal(pte_size) # Full Page Table Entry
+
+    def elaborate(self, platform=None):
+        m = Module()
+        # Pull out all control bits from the PTE (bits 0-7 of i)
+        m.d.comb += [
+            self.d.eq(self.i[7]),
+            self.a.eq(self.i[6]),
+            self.g.eq(self.i[5]),
+            self.u.eq(self.i[4]),
+            self.xwr.eq(self.i[1:4]),
+            self.v.eq(self.i[0])
+        ]
+        m.d.comb += self.asid.eq(self.i[self.asid_start:self.asid_end])
+        m.d.comb += self.pte.eq(self.i[0:self.asid_start])
+        return m
diff --git a/src/soc/TLB/SetAssociativeCache.py b/src/soc/TLB/SetAssociativeCache.py
new file mode 100644
index 00000000..70c075da
--- /dev/null
+++ b/src/soc/TLB/SetAssociativeCache.py
@@ -0,0 +1,272 @@
+"""
+
+Online simulator of 4-way set-associative cache:
+http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/sa4.html
+
+Python simulator of a N-way set-associative cache:
+https://github.com/vaskevich/CacheSim/blob/master/cachesim.py
+"""
+
+from nmigen import Array, Cat, Memory, Module, Signal, Mux, Elaboratable
+from nmigen.compat.genlib import fsm
+from nmigen.cli import main
+from nmigen.cli import verilog, rtlil
+
+from .AddressEncoder import AddressEncoder
+from .MemorySet import MemorySet
+
+# TODO: use an LFSR that advances continuously, picking the bottom
+# few bits from it to select which cache line to replace, instead of PLRU
+# http://bugs.libre-riscv.org/show_bug.cgi?id=71
+from .ariane.plru import PLRU
+from .LFSR import LFSR, LFSR_POLY_24
+
+SA_NA = "00" # no action (none)
+SA_RD = "01" # read
+SA_WR = "10" # write
+
+
+class SetAssociativeCache(Elaboratable):
+    """ Set Associative Cache Memory
+
+        The purpose of this module is to generate a memory cache given the
+        constraints passed in. This will create a n-way set associative cache.
+        It is expected for the SV TLB that the VMA will provide the set number
+        while the ASID provides the tag (still to be decided).
+
+    """
+    def __init__(self, tag_size, data_size, set_count, way_count, lfsr=False):
+        """ Arguments
+            * tag_size (bits): The bit count of the tag
+            * data_size (bits): The bit count of the data to be stored
+            * set_count (number): The number of sets/entries in the cache
+            * way_count (number): The number of slots a data can be stored
+                                  in one set
+            * lfsr: if set, use an LFSR for (pseudo-randomly) selecting
+                    set/entry to write to.  otherwise, use a PLRU
+        """
+        # Internals
+        self.lfsr_mode = lfsr
+        self.way_count = way_count  # The number of slots in one set
+        self.tag_size = tag_size    # The bit count of the tag
+        self.data_size = data_size  # The bit count of the data to be stored
+
+        # set up Memory array
+        self.mem_array = Array() # memory array
+        for i in range(way_count):
+            ms = MemorySet(data_size, tag_size, set_count, active=0)
+            self.mem_array.append(ms)
+
+        # Finds valid entries
+        self.encoder = AddressEncoder(way_count)
+
+        # setup PLRU or LFSR
+        if lfsr:
+            # LFSR mode
+            self.lfsr = LFSR(LFSR_POLY_24)
+        else:
+            # PLRU mode
+            self.plru = PLRU(way_count) # One block to handle plru calculations
+            self.plru_array = Array() # PLRU data on each set
+            for i in range(set_count):
+                name="plru%d" % i
+                self.plru_array.append(Signal(self.plru.TLBSZ, name=name))
+
+        # Input
+        self.enable = Signal(1)   # Whether the cache is enabled
+        self.command = Signal(2)  # 00=None, 01=Read, 10=Write (see SA_XX)
+        self.cset = Signal(max=set_count)  # The set to be checked
+        self.tag = Signal(tag_size)        # The tag to find
+        self.data_i = Signal(data_size)    # The input data
+
+        # Output
+        self.ready = Signal(1) # 0 => Processing 1 => Ready for commands
+        self.hit = Signal(1)            # Tag matched one way in the given set
+        self.multiple_hit = Signal(1)   # Tag matched many ways in the given set
+        self.data_o = Signal(data_size) # The data linked to the matched tag
+
+    def check_tags(self, m):
+        """ Validate the tags in the selected set. If one and only one
+            tag matches set its state to zero and increment all others
+            by one. We only advance to next state if a single hit is found.
+        """
+        # Vector to store way valid results
+        # A zero denotes a way is invalid
+        valid_vector = []
+        # Loop through memory to prep read/write ports and set valid_vector
+        for i in range(self.way_count):
+            valid_vector.append(self.mem_array[i].valid)
+
+        # Pass encoder the valid vector
+        m.d.comb += self.encoder.i.eq(Cat(*valid_vector))
+
+        # Only one entry should be marked
+        # This is due to already verifying the tags
+        # matched and the valid bit is high
+        with m.If(self.hit):
+            m.next = "FINISHED_READ"
+            # Pull out data from the read port
+            data = self.mem_array[self.encoder.o].data_o
+            m.d.comb += self.data_o.eq(data)
+            if not self.lfsr_mode:
+                self.access_plru(m)
+
+        # Oh no! Seal the gates! Multiple tags matched?!? kasd;ljkafdsj;k
+        with m.Elif(self.multiple_hit):
+            # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
+            m.d.comb += self.data_o.eq(0)
+
+        # No tag matches means no data
+        with m.Else():
+            # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
+            m.d.comb += self.data_o.eq(0)
+
+    def access_plru(self, m):
+        """ An entry was accessed and the plru tree must now be updated
+        """
+        # Pull out the set's entry being edited
+        plru_entry = self.plru_array[self.cset]
+        m.d.comb += [
+            # Set the plru data to the current state
+            self.plru.plru_tree.eq(plru_entry),
+            # Set that the cache was accessed
+            self.plru.lu_access_i.eq(1)
+        ]
+
+    def read(self, m):
+        """ Go through the read process of the cache.
+            This takes two cycles to complete. First it checks for a valid tag
+            and secondly it updates the LRU values.
+        """
+        with m.FSM() as fsm_read:
+            with m.State("READY"):
+                m.d.comb += self.ready.eq(0)
+                # check_tags will set the state if the conditions are met
+                self.check_tags(m)
+            with m.State("FINISHED_READ"):
+                m.next = "READY"
+                m.d.comb += self.ready.eq(1)
+                if not self.lfsr_mode:
+                    plru_tree_o = self.plru.plru_tree_o
+                    m.d.sync += self.plru_array[self.cset].eq(plru_tree_o)
+
+    def write_entry(self, m):
+        """ Pick the way to write (via PLRU or LFSR) and, on a single match
+            from the encoder, enable that way's memory write port. """
+        if not self.lfsr_mode:
+            # feed this set's PLRU state in; its replace output drives encoder
+            m.d.comb += self.plru.plru_tree.eq(self.plru_array[self.cset])
+            m.d.comb += self.encoder.i.eq(self.plru.replace_en_o)
+            write_port = self.mem_array[self.encoder.o].w
+        else:
+            # use the LFSR to generate a random(ish) one of the mem array
+            lfsr_output = Signal(max=self.way_count)
+            lfsr_random = Signal(max=self.way_count)
+            m.d.comb += lfsr_output.eq(self.lfsr.state) # lose some bits
+            # valid ways are 0..way_count-1: wrap way_count and above back in
+            m.d.comb += lfsr_random.eq(Mux(lfsr_output >= self.way_count,
+                                           lfsr_output - self.way_count,
+                                           lfsr_output))
+            write_port = self.mem_array[lfsr_random].w
+
+        # then if there is a match from the encoder, enable the selected write
+        with m.If(self.encoder.single_match):
+            m.d.comb += write_port.en.eq(1)
+
+    def write(self, m):
+        """ Go through the write process of the cache.
+            This takes two cycles to complete. First it writes the entry,
+            and secondly it updates the PLRU (in plru mode)
+        """
+        with m.FSM() as fsm_write:
+            with m.State("READY"):
+                m.d.comb += self.ready.eq(0)
+                self.write_entry(m)
+                m.next ="FINISHED_WRITE"
+            with m.State("FINISHED_WRITE"):
+                m.d.comb += self.ready.eq(1)
+                if not self.lfsr_mode:
+                    plru_entry = self.plru_array[self.cset]
+                    m.d.sync += plru_entry.eq(self.plru.plru_tree_o)
+                m.next = "READY"
+
+
+    def elaborate(self, platform=None):
+        m = Module()
+
+        # ----
+        # set up Modules: AddressEncoder, LFSR/PLRU, Mem Array
+        # ----
+
+        m.submodules.AddressEncoder = self.encoder
+        if self.lfsr_mode:
+            m.submodules.LFSR = self.lfsr
+        else:
+            m.submodules.PLRU = self.plru
+
+        for i, mem in enumerate(self.mem_array):
+            setattr(m.submodules, "mem%d" % i, mem)
+
+        # ----
+        # select mode: PLRU connect to encoder, LFSR do... something
+        # ----
+
+        if not self.lfsr_mode:
+            # Set what entry was hit
+            m.d.comb += self.plru.lu_hit.eq(self.encoder.o)
+        else:
+            # enable LFSR
+            m.d.comb += self.lfsr.enable.eq(self.enable)
+
+        # ----
+        # connect hit/multiple hit to encoder output
+        # ----
+
+        m.d.comb += [
+            self.hit.eq(self.encoder.single_match),
+            self.multiple_hit.eq(self.encoder.multiple_match),
+        ]
+
+        # ----
+        # connect incoming data/tag/cset(addr) to mem_array
+        # ----
+
+        for mem in self.mem_array:
+            write_port = mem.w
+            m.d.comb += [mem.cset.eq(self.cset),
+                         mem.tag.eq(self.tag),
+                         mem.data_i.eq(self.data_i),
+                         write_port.en.eq(0), # default: disable write
+                        ]
+        # ----
+        # Commands: READ/WRITE/TODO
+        # ----
+
+        with m.If(self.enable):
+            with m.Switch(self.command):
+                # Search all sets at a particular tag
+                with m.Case(SA_RD):
+                    self.read(m)
+                with m.Case(SA_WR):
+                    self.write(m)
+                    # Maybe catch multiple tags write here?
+                    # TODO
+                # TODO: invalidate/flush, flush-all?
+
+        return m
+
+    def ports(self):
+        return [self.enable, self.command, self.cset, self.tag, self.data_i,
+                self.ready, self.hit, self.multiple_hit, self.data_o]
+
+
+if __name__ == '__main__':
+    sac = SetAssociativeCache(4, 8, 4, 6)
+    vl = rtlil.convert(sac, ports=sac.ports())
+    with open("SetAssociativeCache.il", "w") as f:
+        f.write(vl)
+
+    sac_lfsr = SetAssociativeCache(4, 8, 4, 6, True)
+    vl = rtlil.convert(sac_lfsr, ports=sac_lfsr.ports())
+    with open("SetAssociativeCacheLFSR.il", "w") as f:
+        f.write(vl)
diff --git a/src/soc/TLB/TLB.py b/src/soc/TLB/TLB.py
new file mode 100644
index 00000000..98c9af72
--- /dev/null
+++ b/src/soc/TLB/TLB.py
@@ -0,0 +1,175 @@
+""" TLB Module
+
+    The expected form of the data is:
+    * Item (Bits)
+    * Tag (N - 79) / ASID (78 - 64) / PTE (63 - 0)
+"""
+
+from nmigen import Memory, Module, Signal, Cat, Elaboratable
+from nmigen.cli import main
+
+from .PermissionValidator import PermissionValidator
+from .Cam import Cam
+
+class TLB(Elaboratable):
+    def __init__(self, asid_size, vma_size, pte_size, L1_size):
+        """ Arguments
+            * asid_size: Address Space IDentifier (ASID) typically 15 bits
+            * vma_size: Virtual Memory Address (VMA) typically 36 bits
+            * pte_size: Page Table Entry (PTE) typically 64 bits
+
+            Notes:
+            These arguments should represent the largest possible size
+            defined by the MODE settings. See
+            Volume II: RISC-V Privileged Architectures V1.10 Page 57
+        """
+
+        # Internal
+        self.state = 0
+        # L1 Cache Modules
+        self.cam_L1 = Cam(vma_size, L1_size)
+        self.mem_L1 = Memory(asid_size + pte_size, L1_size)
+
+        # Permission Validator
+        self.perm_validator = PermissionValidator(asid_size, pte_size)
+
+        # Inputs
+        self.supermode = Signal(1) # Supervisor Mode
+        self.super_access = Signal(1) # Supervisor Access
+        self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2
+        self.xwr = Signal(3) # Execute, Write, Read
+        self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64
+        self.address_L1 = Signal(max=L1_size)
+        self.asid = Signal(asid_size) # Address Space IDentifier (ASID)
+        self.vma = Signal(vma_size) # Virtual Memory Address (VMA)
+        self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE)
+
+        # Outputs
+        self.hit = Signal(1) # Denotes if the VMA had a mapped PTE
+        self.perm_valid = Signal(1) # Denotes if the permissions are correct
+        self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA
+
+    def search(self, m, read_L1, write_L1):
+        """ searches the TLB
+        """
+        m.d.comb += [
+            write_L1.en.eq(0),
+            self.cam_L1.write_enable.eq(0),
+            self.cam_L1.data_in.eq(self.vma)
+        ]
+        # Match found in L1 CAM
+        match_found = Signal(reset_less=True)
+        m.d.comb += match_found.eq(self.cam_L1.single_match
+                              | self.cam_L1.multiple_match)
+        with m.If(match_found):
+            # Memory shortcut variables
+            mem_address = self.cam_L1.match_address
+            # Memory Logic
+            m.d.comb += read_L1.addr.eq(mem_address)
+            # Permission Validator Logic
+            m.d.comb += [
+                self.hit.eq(1),
+                # Set permission validator data to the correct
+                # register file data according to CAM match
+                # address
+                self.perm_validator.data.eq(read_L1.data),
+                # Execute, Read, Write
+                self.perm_validator.xwr.eq(self.xwr),
+                # Supervisor Mode
+                self.perm_validator.super_mode.eq(self.supermode),
+                # Supervisor Access
+                self.perm_validator.super_access.eq(self.super_access),
+                # Address Space IDentifier (ASID)
+                self.perm_validator.asid.eq(self.asid),
+                # Output result of permission validation
+                self.perm_valid.eq(self.perm_validator.valid)
+            ]
+            # Only output PTE if permissions are valid
+            with m.If(self.perm_validator.valid):
+                # XXX TODO - dummy for now
+                reg_data = Signal.like(self.pte_out)
+                m.d.comb += [
+                    self.pte_out.eq(reg_data)
+                ]
+            with m.Else():
+                m.d.comb += [
+                    self.pte_out.eq(0)
+                ]
+        # Miss Logic
+        with m.Else():
+            m.d.comb += [
+                self.hit.eq(0),
+                self.perm_valid.eq(0),
+                self.pte_out.eq(0)
+            ]
+
+    def write_l1(self, m, read_L1, write_L1):
+        """ writes to the L1 cache
+        """
+        # Memory_L1 Logic
+        m.d.comb += [
+            write_L1.en.eq(1),
+            write_L1.addr.eq(self.address_L1),
+            # The Cat places arguments from LSB -> MSB
+            write_L1.data.eq(Cat(self.pte_in, self.asid))
+        ]
+        # CAM_L1 Logic
+        m.d.comb += [
+            self.cam_L1.write_enable.eq(1),
+            self.cam_L1.data_in.eq(self.vma), #data_in is sent to all entries
+            # self.cam_L1.address_in.eq(todo) # a CAM entry needs to be selected
+            
+        ]
+
+    def elaborate(self, platform):
+        m = Module()
+        # Add submodules
+        # Submodules for L1 Cache
+        m.submodules.cam_L1 = self.cam_L1
+        m.submodules.read_L1 = read_L1 = self.mem_L1.read_port()
+        m.submodules.write_L1 = write_L1 = self.mem_L1.write_port()
+        
+        # Permission Validator Submodule
+        m.submodules.perm_valididator = self.perm_validator
+
+        # When MODE specifies translation
+        # TODO add in different bit length handling ie prefix 0s
+        tlb_enable = Signal(reset_less=True)
+        m.d.comb += tlb_enable.eq(self.mode != 0)
+
+        with m.If(tlb_enable):
+            m.d.comb += [
+                self.cam_L1.enable.eq(1)
+            ]
+            with m.Switch(self.command):
+                # Search
+                with m.Case("01"):
+                    self.search(m, read_L1, write_L1)
+
+                # Write L1
+                # Expected that the miss will be handled in software
+                with m.Case("10"):
+                    self.write_l1(m, read_L1, write_L1)
+
+                # TODO
+                #with m.Case("11"):
+
+        # When disabled
+        with m.Else():
+            m.d.comb += [
+                self.cam_L1.enable.eq(0),
+                # XXX TODO - self.reg_file.enable.eq(0),
+                self.hit.eq(0),
+                self.perm_valid.eq(0), # XXX TODO, check this
+                self.pte_out.eq(0)
+            ]
+        return m
+
+
+if __name__ == '__main__':
+    tlb = TLB(15, 36, 64, 4)
+    main(tlb, ports=[ tlb.supermode, tlb.super_access, tlb.command,
+        tlb.xwr, tlb.mode, tlb.address_L1, tlb.asid,
+        tlb.vma, tlb.pte_in,
+        tlb.hit, tlb.perm_valid, tlb.pte_out,
+        ] + tlb.cam_L1.ports())
diff --git a/src/soc/TLB/__init__.py b/src/soc/TLB/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/soc/TLB/ariane/TreePLRU.cpp b/src/soc/TLB/ariane/TreePLRU.cpp
new file mode 100644
index 00000000..2f6aeea5
--- /dev/null
+++ b/src/soc/TLB/ariane/TreePLRU.cpp
@@ -0,0 +1,211 @@
+#include <cstdint>
+#include <iostream>
+#include <cmath>
+
+
+#define NWAY 4
+#define NLINE 256
+#define HIT 0
+#define MISS 1
+#define MS 1000
+/*
+Detailed TreePLRU inference see here: https://docs.google.com/spreadsheets/d/14zQpPYPwDAbCCjBT_a3KLaE5FEk-RNhI8Z7Qm_biW8g/edit?usp=sharing
+Ref: https://people.cs.clemson.edu/~mark/464/p_lru.txt
+four-way set associative - three bits
+   each bit represents one branch point in a binary decision tree; let 1
+   represent that the left side has been referenced more recently than the
+   right side, and 0 vice-versa
+              are all 4 lines valid?
+                   /       \
+                 yes        no, use an invalid line
+                  |
+                  |
+                  |
+             bit_0 == 0?            state | replace      ref to | next state
+              /       \             ------+--------      -------+-----------
+             y         n             00x  |  line_0      line_0 |    11_
+            /           \            01x  |  line_1      line_1 |    10_
+     bit_1 == 0?    bit_2 == 0?      1x0  |  line_2      line_2 |    0_1
+       /    \          /    \        1x1  |  line_3      line_3 |    0_0
+      y      n        y      n
+     /        \      /        \        ('x' means       ('_' means unchanged)
+   line_0  line_1  line_2  line_3      don't care)
+ 8-way set associative - 7  = 1+2+4 bits
+16-way set associative - 15 = 1+2+4+8 bits
+32-way set associative - 31 = 1+2+4+8+16 bits
+64-way set associative - 63 = 1+2+4+8+16+32 bits
+*/
+using namespace std;
+struct AddressField {
+    uint64_t wd_idx : 2;//Unused
+    uint64_t offset : 4;//Unused
+    uint64_t index  : 8;//NLINE = 256 = 2^8
+    uint64_t tag    : 50;
+};
+
+union Address {
+    uint32_t* p;
+    AddressField fields;
+};
+
+struct Cell {
+    bool v;
+    uint64_t tag;
+
+    Cell() : v(false), tag(0) {}
+
+    bool isHit(uint64_t tag) {
+        return v && (tag == this->tag);
+    }
+
+    void fetch(uint32_t* address) {
+        Address addr;
+        addr.p = address;
+        addr.fields.offset = 0;
+        addr.fields.wd_idx = 0;
+        tag = addr.fields.tag;
+        v = true;
+    }
+};
+
+ostream& operator<<(ostream & out, const Cell& cell) {
+    out << " v:" << cell.v << " tag:" << hex << cell.tag;
+    return out;
+}
+
+struct Block {
+    Cell cell[NWAY];
+    uint32_t state;
+    uint64_t *mask;//Mask the state to get accurate value for specified 1 bit.
+    uint64_t *value;
+    uint64_t *next_value;
+
+    Block() : state(0) {
+        switch (NWAY) {
+            case 4:
+                mask = new uint64_t[4]{0b110, 0b110, 0b101, 0b101};
+                value = new uint64_t[4]{0b000, 0b010, 0b100, 0b101};
+                next_value = new uint64_t[4]{0b110, 0b100, 0b001, 0b000};
+                break;
+            case 8:
+                mask = new uint64_t[8]{0b1101000, 0b1101000, 0b1100100, 0b1100100, 0b1010010, 0b1010010, 0b1010001,
+                                       0b1010001};
+                value = new uint64_t[8]{0b0000000, 0b0001000, 0b0100000, 0b0100100, 0b1000000, 0b1000010, 0b1010000,
+                                        0b1010001};
+                next_value = new uint64_t[8]{0b1101000, 0b1100000, 0b1000100, 0b1000000, 0b0010010, 0b0010000,
+                                             0b0000001, 0b0000000};
+                break;
+                //TODO - more NWAY goes here.
+            default:
+                std::cout << "Error definition NWAY = " << NWAY << std::endl;
+        }
+    }
+
+    uint32_t *getByTag(uint64_t tag, uint32_t *pway) {
+        for (int i = 0; i < NWAY; ++i) {
+            if (cell[i].isHit(tag)) {
+                *pway = i;
+                return pway;
+            }
+        }
+        return NULL;
+    }
+
+    void setLRU(uint32_t *address) {
+        int way = 0;
+        uint32_t st = state;
+        for (int i = 0; i < NWAY; ++i) {
+            if ((state & mask[i]) == value[i]) {
+                state ^= mask[i];
+                way = i;
+                break;
+            }
+        }
+        cell[way].fetch(address);
+        cout << "MISS: way:" << way << " address:" << address << " state:" << st << "->" << state << endl;
+    }
+
+    uint32_t *get(uint32_t *address, uint32_t *pway) {
+        Address addr;
+        addr.p = address;
+        uint32_t *d = getByTag(addr.fields.tag, pway);
+        if (d != NULL) {
+            return &d[addr.fields.offset];
+        }
+        return d;
+    }
+
+    int set(uint32_t *address) {
+        uint32_t way = 0;
+        uint32_t *p = get(address, &way);
+        if (p != NULL) {
+            printf("HIT: address:%p ref_to way:%d state %X --> ", address, way, state);
+            state &= ~mask[way];
+            printf("%X --> ", state);
+            state |= next_value[way];
+            printf("%X\n", state);
+            // *p = *address; //skip since address is fake.
+            return HIT;
+        } else {
+            setLRU(address);
+            return MISS;
+        }
+    }
+};
+
+ostream& operator<<(ostream & out, const Block& block) {
+    out << "state:" << block.state << " ";
+    for (int i = 0; i<NWAY; i++) {
+        out << block.cell[i];
+    }
+    return out;
+}
+
+struct Cache {
+    Block block[NLINE];
+    uint32_t count[2];
+    Cache() { count[HIT] = 0; count[MISS] = 0; }
+
+    void access(uint32_t* address) {
+        Address addr;
+        addr.p = address;
+        Block& b = block[addr.fields.index];
+        ++count[b.set(address)];
+    }
+
+};
+ostream& operator<<(ostream & out, const Cache& cache) {
+    out << "\n==Summary==\n\tHit: " << cache.count[HIT] <<  " Miss: " << cache.count[MISS] << std::endl;
+    for (int i = 0; i < NLINE; i++) {
+        out << cache.block[i] << endl;
+    }
+    return out;
+}
+
+Cache cache;
+void multiply(uint32_t* m1, uint32_t* m2, uint32_t* res)
+{
+    int x, i, j;
+    for (i = 0; i < MS; i++) {
+        for (j = 0; j < MS; j++) {
+            cache.access(res + i*MS +j);
+            for (x = 0; x < MS; x++) {
+                cache.access(m1 + i*MS + x);
+                cache.access(m2 + x*MS + j);
+                cache.access(res + i*MS +j);
+                // res[i][j] += m1[i][x] * m2[x][j];
+                cache.access(res + i*MS +j);
+            }
+        }
+    }
+}
+
+int main()
+{
+    uint32_t* m1 = (uint32_t*) 0xFACE00A000000000LL;  // fake virtual address; don't access it
+    uint32_t* m2 = (uint32_t*) 0xFACE00B000000000LL;  // fake virtual address; don't access it
+    uint32_t* res =  (uint32_t*) 0xFACE00C000000000LL; // fake virtual address; don't access it
+    multiply(m1, m2, res);
+    cout << cache << endl;
+    return 0;
+}
diff --git a/src/soc/TLB/ariane/__init__.py b/src/soc/TLB/ariane/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/soc/TLB/ariane/exceptcause.py b/src/soc/TLB/ariane/exceptcause.py
new file mode 100644
index 00000000..4c5cb2d5
--- /dev/null
+++ b/src/soc/TLB/ariane/exceptcause.py
@@ -0,0 +1,16 @@
+from nmigen import Const
+
+INSTR_ADDR_MISALIGNED = Const(0, 64)
+INSTR_ACCESS_FAULT    = Const(1, 64)
+ILLEGAL_INSTR         = Const(2, 64)
+BREAKPOINT            = Const(3, 64)
+LD_ADDR_MISALIGNED    = Const(4, 64)
+LD_ACCESS_FAULT       = Const(5, 64)
+ST_ADDR_MISALIGNED    = Const(6, 64)
+ST_ACCESS_FAULT       = Const(7, 64)
+ENV_CALL_UMODE        = Const(8, 64)  # environment call from user mode
+ENV_CALL_SMODE        = Const(9, 64)  # environment call from supervisor mode
+ENV_CALL_MMODE        = Const(11, 64) # environment call from machine mode
+INSTR_PAGE_FAULT      = Const(12, 64) # Instruction page fault
+LOAD_PAGE_FAULT       = Const(13, 64) # Load page fault
+STORE_PAGE_FAULT      = Const(15, 64) # Store page fault
diff --git a/src/soc/TLB/ariane/miss_handler.py b/src/soc/TLB/ariane/miss_handler.py
new file mode 100644
index 00000000..5ddc7255
--- /dev/null
+++ b/src/soc/TLB/ariane/miss_handler.py
@@ -0,0 +1,786 @@
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+# http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: Florian Zaruba, ETH Zurich
+# Date: 12.11.2017
+# Description: Handles cache misses.
+from nmigen.lib.coding import Encoder, PriorityEncoder
+
+
+# --------------
+# MISS Handler
+# --------------
+import ariane_pkg::*;
+import std_cache_pkg::*;
+
+unsigned NR_PORTS         = 3
+
+class MissReq(RecordObject):
+    def __init__(self, name=None):
+        Record.__init__(self, name)
+        self.valid = Signal()
+        self.addr = Signal(64)
+        self.be = Signal(8)
+        self.size = Signal(2)
+        self.we = Signal()
+        self.wdata = Signal(64)
+        bypass = Signal()
+
+class CacheLine:  # one cache line: tag, data, and valid/dirty state bits
+    def __init__(self):
+        self.tag = Signal(DCACHE_TAG_WIDTH) # tag array entry, DCACHE_TAG_WIDTH bits wide
+        self.data = Signal(DCACHE_LINE_WIDTH) # data array entry, DCACHE_LINE_WIDTH bits wide
+        self.valid = Signal() # state array: valid flag
+        self.dirty = Signal() # state array: dirty flag
+
+# cache line byte enable
+class CLBE:
+    def __init__(self):
+        self.tag = Signal(DCACHE_TAG_WIDTH+7)//8) # byte enable into tag array
+        self.data = Signal(DCACHE_LINE_WIDTH+7)//8) # byte enable data array
+        # bit enable into state array (valid for a pair of dirty/valid bits)
+        self.vldrty = Signal(DCACHE_SET_ASSOC)
+    } cl_be_t;
+
+
+
+    # FSM states
+"""
+    enum logic [3:0] {
+        IDLE,               # 0
+        FLUSHING,           # 1
+        FLUSH,              # 2
+        WB_CACHELINE_FLUSH, # 3
+        FLUSH_REQ_STATUS,   # 4
+        WB_CACHELINE_MISS,  # 5
+        WAIT_GNT_SRAM,      # 6
+        MISS,               # 7
+        REQ_CACHELINE,      # 8
+        MISS_REPL,          # 9
+        SAVE_CACHELINE,     # A
+        INIT,               # B
+        AMO_LOAD,           # C
+        AMO_SAVE_LOAD,      # D
+        AMO_STORE           # E
+    } state_d, state_q;
+"""
+
+class MissHandler(Elaboratable):
+    def __init__(self, NR_PORTS):
+        self.NR_PORTS = NR_PORTS
+        self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
+        self.flush_i = Signal()      # flush request
+        self.flush_ack_o = Signal()  # acknowledge successful flush
+        self.miss_o = Signal()
+        self.busy_i = Signal()       # dcache is busy with something
+
+        # Bypass or miss
+        self.miss_req_i = Array(MissReq(name="missreq") for i in range(NR_PORTS))
+        # Bypass handling
+        self.bypass_gnt_o = Signal(NR_PORTS)
+        self.bypass_valid_o = Signal(NR_PORTS)
+        self.bypass_data_o = Array(Signal(name="bdata_o", 64) \
+                                    for i in range(NR_PORTS))
+
+        # AXI port
+        output ariane_axi::req_t                            axi_bypass_o,
+        input  ariane_axi::resp_t                           axi_bypass_i,
+
+        # Miss handling (~> cacheline refill)
+        self.miss_gnt_o = Signal(NR_PORTS)
+        self.active_serving_o = Signal(NR_PORTS)
+
+        self.critical_word_o = Signal(64)
+        self.critical_word_valid_o = Signal()
+        output ariane_axi::req_t                            axi_data_o,
+        input  ariane_axi::resp_t                           axi_data_i,
+
+        self.mshr_addr_i = Array(Signal(name="bdata_o", 56) \
+                                    for i in range(NR_PORTS))
+        self.mshr_addr_matches_o = Signal(NR_PORTS)
+        self.mshr_index_matches_o = Signal(NR_PORTS)
+
+        # AMO
+        self.amo_req_i = AMOReq()
+        self.amo_resp_o = AMOResp()
+        # Port to SRAMs, for refill and eviction
+        self.req_o = Signal(DCACHE_SET_ASSOC)
+        self.addr_o = Signal(DCACHE_INDEX_WIDTH) # address into cache array
+        self.data_o = CacheLine()
+        self.be_o = CLBE()
+        self.data_i = Array(CacheLine() \
+                                    for i in range(DCACHE_SET_ASSOC))
+        self.we_o = Signal()
+
+    def elaborate(self, platform):
+        # Registers
+        mshr_t                                  mshr_d, mshr_q;
+        logic [DCACHE_INDEX_WIDTH-1:0]          cnt_d, cnt_q;
+        logic [DCACHE_SET_ASSOC-1:0]            evict_way_d, evict_way_q;
+        # cache line to evict
+        cache_line_t                            evict_cl_d, evict_cl_q;
+
+        logic serve_amo_d, serve_amo_q;
+        # Request from one FSM
+        miss_req_valid = Signal(self.NR_PORTS)
+        miss_req_bypass = Signal(self.NR_PORTS)
+        miss_req_addr = Array(Signal(name="miss_req_addr", 64) \
+                                    for i in range(NR_PORTS))
+        miss_req_wdata = Array(Signal(name="miss_req_wdata", 64) \
+                                    for i in range(NR_PORTS))
+        miss_req_we = Signal(self.NR_PORTS)
+        miss_req_be = Array(Signal(name="miss_req_be", 8) \
+                                    for i in range(NR_PORTS))
+        miss_req_size = Array(Signal(name="miss_req_size", 2) \
+                                    for i in range(NR_PORTS))
+
+        # Cache Line Refill <-> AXI
+        req_fsm_miss_valid = Signal()
+        req_fsm_miss_addr = Signal(64)
+        req_fsm_miss_wdata = Signal(DCACHE_LINE_WIDTH)
+        req_fsm_miss_we = Signal()
+        req_fsm_miss_be = Signal(DCACHE_LINE_WIDTH//8)
+        ariane_axi::ad_req_t                     req_fsm_miss_req;
+        req_fsm_miss_size = Signal(2)
+
+        gnt_miss_fsm = Signal()
+        valid_miss_fsm = Signal()
+        nmiss = DCACHE_LINE_WIDTH//64
+        data_miss_fsm = Array(Signal(name="data_miss_fsm", 64) \
+                                    for i in range(nmiss))
+
+        # Cache Management <-> LFSR
+        lfsr_enable = Signal()
+        lfsr_oh = Signal(DCACHE_SET_ASSOC)
+        lfsr_bin = Signal($clog2(DCACHE_SET_ASSOC-1))
+        # AMOs
+        ariane_pkg::amo_t amo_op;
+        amo_operand_a = Signal(64)
+        amo_operand_b = Signal(64)
+        amo_result_o = Signal(64)
+
+        struct packed {
+            logic [63:3] address;
+            logic        valid;
+        } reservation_d, reservation_q;
+
+        # ------------------------------
+        # Cache Management
+        # ------------------------------
+        evict_way = Signal(DCACHE_SET_ASSOC)
+        valid_way = Signal(DCACHE_SET_ASSOC)
+
+        for (i in range(DCACHE_SET_ASSOC):
+            comb += evict_way[i].eq(data_i[i].valid & data_i[i].dirty)
+            comb += valid_way[i].eq(data_i[i].valid)
+
+        # ----------------------
+        # Default Assignments
+        # ----------------------
+        # to AXI refill
+        req_fsm_miss_req    = ariane_axi::CACHE_LINE_REQ;
+        req_fsm_miss_size   = Const(0b11, 2)
+        # core
+        serve_amo_d         = serve_amo_q;
+        # --------------------------------
+        # Flush and Miss operation
+        # --------------------------------
+        state_d      = state_q;
+        cnt_d        = cnt_q;
+        evict_way_d  = evict_way_q;
+        evict_cl_d   = evict_cl_q;
+        mshr_d       = mshr_q;
+        # communicate to the requester which unit we are currently serving
+        active_serving_o[mshr_q.id] = mshr_q.valid;
+        # AMOs
+        # silence the unit when not used
+        amo_op = amo_req_i.amo_op;
+
+        reservation_d = reservation_q;
+        with m.FSM() as state_q:
+
+            with m.Case("IDLE"):
+                # lowest priority are AMOs, wait until everything else
+                # is served before going for the AMOs
+                with m.If (amo_req_i.req & ~busy_i):
+                    # 1. Flush the cache
+                    with m.If(~serve_amo_q):
+                        m.next = "FLUSH_REQ_STATUS"
+                        serve_amo_d.eq(0b1
+                        cnt_d.eq(0
+                    # 2. Do the AMO
+                    with m.Else():
+                        m.next = "AMO_LOAD"
+                        serve_amo_d.eq(0b0
+
+                # check if we want to flush and can flush
+                # e.g.: we are not busy anymore
+                # TODO: Check that the busy flag is indeed needed
+                with m.If (flush_i & ~busy_i):
+                    m.next = "FLUSH_REQ_STATUS"
+                    cnt_d = 0
+
+                # check if one of the state machines missed
+                for i in range(NR_PORTS):
+                    # here comes the refill portion of code
+                    with m.If (miss_req_valid[i] & ~miss_req_bypass[i]):
+                        m.next = "MISS"
+                        # we are taking another request so don't
+                        # take the AMO
+                        serve_amo_d  = 0b0;
+                        # save to MSHR
+                        wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
+                        comb += [ mshr_d.valid.eq(0b1),
+                                  mshr_d.we.eq(miss_req_we[i]),
+                                  mshr_d.id.eq(i),
+                                  mshr_d.addr.eq(miss_req_addr[i][0:wid]),
+                                  mshr_d.wdata.eq(miss_req_wdata[i]),
+                                  mshr_d.be.eq(miss_req_be[i]),
+                                ]
+                        break
+
+            #  ~> we missed on the cache
+            with m.Case("MISS"):
+                # 1. Check if there is an empty cache-line
+                # 2. If not -> evict one
+                comb += req_o.eq(1)
+                sync += addr_o.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH]
+                m.next = "MISS_REPL"
+                comb += miss_o.eq(1)
+
+            # ~> second miss cycle
+            with m.Case("MISS_REPL"):
+                # if all are valid we need to evict one, 
+                # pseudo random from LFSR
+                with m.If(~(~valid_way).bool()):
+                    comb += lfsr_enable.eq(0b1)
+                    comb += evict_way_d.eq(lfsr_oh)
+                    # do we need to write back the cache line?
+                    with m.If(data_i[lfsr_bin].dirty):
+                        state_d = WB_CACHELINE_MISS;
+                        comb += evict_cl_d.tag.eq(data_i[lfsr_bin].tag)
+                        comb += evict_cl_d.data.eq(data_i[lfsr_bin].data)
+                        comb += cnt_d.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
+                    # no - we can request a cache line now
+                    with m.Else():
+                        m.next = "REQ_CACHELINE"
+                # we have at least one free way
+                with m.Else():
+                    # get victim cache-line by looking for the
+                    # first non-valid bit
+                    comb += evict_way_d.eq(get_victim_cl(~valid_way)
+                    m.next = "REQ_CACHELINE"
+
+            # ~> we can just load the cache-line,
+            # the way is stored in evict_way_q
+            with m.Case("REQ_CACHELINE"):
+                comb += req_fsm_miss_valid .eq(1)
+                sync += req_fsm_miss_addr  .eq(mshr_q.addr)
+
+                with m.If (gnt_miss_fsm):
+                    m.next = "SAVE_CACHELINE"
+                    comb += miss_gnt_o[mshr_q.id].eq(1)
+
+            # ~> replace the cacheline
+            with m.Case("SAVE_CACHELINE"):
+                # calculate cacheline offset
+                automatic logic [$clog2(DCACHE_LINE_WIDTH)-1:0] cl_offset;
+                sync += cl_offset.eq(mshr_q.addr[3:DCACHE_BYTE_OFFSET] << 6)
+                # we've got a valid response from refill unit
+                with m.If (valid_miss_fsm):
+                    wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
+                    sync += addr_o      .eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
+                    sync += req_o       .eq(evict_way_q)
+                    comb += we_o        .eq(1)
+                    comb += be_o        .eq(1)
+                    sync += be_o.vldrty .eq(evict_way_q)
+                    sync += data_o.tag  .eq(mshr_q.addr[DCACHE_INDEX_WIDTH:wid]
+                    comb += data_o.data .eq(data_miss_fsm)
+                    comb += data_o.valid.eq(1)
+                    comb += data_o.dirty.eq(0)
+
+                    # is this a write?
+                    with m.If (mshr_q.we):
+                        # Yes, so save the updated data now
+                        for i in range(8):
+                            # check if we really want to write
+                            # the corresponding byte
+                            with m.If (mshr_q.be[i]):
+                                sync += data_o.data[(cl_offset + i*8) +: 8].eq(mshr_q.wdata[i];
+                        # it's immediately dirty if we write
+                        comb += data_o.dirty.eq(1)
+
+                    # reset MSHR
+                    comb += mshr_d.valid.eq(0)
+                    # go back to idle
+                    m.next = 'IDLE'
+
+            # ------------------------------
+            # Write Back Operation
+            # ------------------------------
+            # ~> evict a cache line from way saved in evict_way_q
+            with m.Case("WB_CACHELINE_FLUSH"):
+            with m.Case("WB_CACHELINE_MISS"):
+
+                comb += req_fsm_miss_valid .eq(0b1)
+                sync += req_fsm_miss_addr  .eq({evict_cl_q.tag, cnt_q[DCACHE_INDEX_WIDTH-1:DCACHE_BYTE_OFFSET], {{DCACHE_BYTE_OFFSET}{0b0}}};
+                comb += req_fsm_miss_be    .eq(1)
+                comb += req_fsm_miss_we    .eq(0b1)
+                sync += req_fsm_miss_wdata .eq(evict_cl_q.data;
+
+                # we've got a grant --> this is timing critical, think about it
+                if (gnt_miss_fsm) begin
+                    # write status array
+                    sync += addr_o    .eq(cnt_q)
+                    comb += req_o     .eq(0b1)
+                    comb += we_o      .eq(0b1)
+                    comb += data_o.valid.eq(INVALIDATE_ON_FLUSH ? 0b0 : 0b1)
+                    # invalidate
+                    sync += be_o.vldrty.eq(evict_way_q)
+                    # go back to handling the miss or flushing,
+                    # depending on where we came from
+                    with m.If(state_q == WB_CACHELINE_MISS):
+                        m.next = "MISS"
+                    with m.Else():
+                        m.next = "FLUSH_REQ_STATUS"
+
+            # ------------------------------
+            # Flushing & Initialization
+            # ------------------------------
+            # ~> make another request to check the same
+            # cache-line if there are still some valid entries
+            with m.Case("FLUSH_REQ_STATUS"):
+                comb += req_o  .eq(1)
+                sync += addr_o .eq(cnt_q)
+                m.next = "FLUSHING"
+
+            with m.Case("FLUSHING"):
+                # this has priority
+                # at least one of the cache lines is dirty
+                with m.If(~evict_way):
+                    # evict cache line, look for the first
+                    # cache-line which is dirty
+                    comb += evict_way_d.eq(get_victim_cl(evict_way))
+                    comb += evict_cl_d .eq(data_i[one_hot_to_bin(evict_way)])
+                    state_d     = WB_CACHELINE_FLUSH;
+                # not dirty ~> increment and continue
+                with m.Else():
+                    # increment and re-request
+                    sync += cnt_d.eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
+                    m.next = "FLUSH_REQ_STATUS"
+                    sync += addr_o     .eq(cnt_q)
+                    comb += req_o      .eq(1)
+                    comb += be_o.vldrty.eq(INVALIDATE_ON_FLUSH ? 1 : 0)
+                    comb += we_o       .eq(1)
+                    # finished with flushing operation, go back to idle
+                    with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
+                               == DCACHE_NUM_WORDS-1):
+                        # only acknowledge if the flush wasn't
+                        # triggered by an atomic
+                        sync += flush_ack_o.eq(~serve_amo_q)
+                        m.next = "IDLE"
+
+            # ~> only called after reset
+            with m.Case("INIT"):
+                # initialize status array
+                sync += addr_o.eq(cnt_q)
+                comb += req_o .eq(1)
+                comb += we_o  .eq(1)
+                # only write the dirty array
+                comb += be_o.vldrty.eq(1)
+                sync += cnt_d      .eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
+                # finished initialization
+                with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
+                            == DCACHE_NUM_WORDS-1)
+                    m.next = "IDLE"
+
+            # ----------------------
+            # AMOs
+            # ----------------------
+            # TODO(zarubaf) Move this closer to memory
+            # ~> we are here because we need to do the AMO,
+            # the cache is clean at this point
+            # start by executing the load
+            with m.Case("AMO_LOAD"):
+                comb += req_fsm_miss_valid.eq(1)
+                # address is in operand a
+                comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
+                comb += req_fsm_miss_req.eq(ariane_axi::SINGLE_REQ)
+                comb += req_fsm_miss_size.eq(amo_req_i.size)
+                # the request has been granted
+                with m.If(gnt_miss_fsm):
+                    m.next = "AMO_SAVE_LOAD"
+            # save the load value
+            with m.Case("AMO_SAVE_LOAD"):
+                with m.If (valid_miss_fsm):
+                    # we are only concerned about the lower 64-bit
+                    comb += mshr_d.wdata.eq(data_miss_fsm[0])
+                    m.next = "AMO_STORE"
+            # and do the store
+            with m.Case("AMO_STORE"):
+                load_data = Signal(64)
+                # re-align load data
+                comb += load_data.eq(data_align(amo_req_i.operand_a[:3],
+                                                mshr_q.wdata))
+                # Sign-extend for word operation
+                with m.If (amo_req_i.size == 0b10):
+                    comb += amo_operand_a.eq(sext32(load_data[:32]))
+                    comb += amo_operand_b.eq(sext32(amo_req_i.operand_b[:32]))
+                with m.Else():
+                    comb += amo_operand_a.eq(load_data)
+                    comb += amo_operand_b.eq(amo_req_i.operand_b)
+
+                #  we do not need a store request for load reserved
+                # or a failing store conditional
+                #  we can bail-out without making any further requests
+                with m.If ((amo_req_i.amo_op == AMO_LR) | \
+                           ((amo_req_i.amo_op == AMO_SC) & \
+                           ((reservation_q.valid & \
+                            (reservation_q.address != \
+                             amo_req_i.operand_a[3:64])) | \
+                             ~reservation_q.valid))):
+                    comb += req_fsm_miss_valid.eq(0)
+                    m.next = "IDLE"
+                    comb += amo_resp_o.ack.eq(1)
+                    # write-back the result
+                    comb += amo_resp_o.result.eq(amo_operand_a)
+                    # we know that the SC failed
+                    with m.If (amo_req_i.amo_op == AMO_SC):
+                        comb += amo_resp_o.result.eq(1)
+                        # also clear the reservation
+                        comb += reservation_d.valid.eq(0)
+                with m.Else():
+                    comb += req_fsm_miss_valid.eq(1)
+
+                comb += req_fsm_miss_we  .eq(1)
+                comb += req_fsm_miss_req .eq(ariane_axi::SINGLE_REQ)
+                comb += req_fsm_miss_size.eq(amo_req_i.size)
+                comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
+
+                comb += req_fsm_miss_wdata.eq(
+                    data_align(amo_req_i.operand_a[0:3], amo_result_o))
+                comb += req_fsm_miss_be.eq(
+                    be_gen(amo_req_i.operand_a[0:3], amo_req_i.size))
+
+                # place a reservation on the memory
+                with m.If (amo_req_i.amo_op == AMO_LR):
+                    comb += reservation_d.address.eq(amo_req_i.operand_a[3:64])
+                    comb += reservation_d.valid.eq(1)
+
+                # the request is valid or we didn't need to go for another store
+                with m.If (valid_miss_fsm):
+                    m.next = "IDLE"
+                    comb += amo_resp_o.ack.eq(1)
+                    # write-back the result
+                    comb += amo_resp_o.result.eq(amo_operand_a;
+
+                    if (amo_req_i.amo_op == AMO_SC) begin
+                        comb += amo_resp_o.result.eq(0)
+                        # An SC must fail if there is another SC
+                        # (to any address) between the LR and the SC in
+                        # program order (even to the same address).
+                        # in any case destroy the reservation
+                        comb += reservation_d.valid.eq(0)
+
+        # check MSHR for aliasing
+
+        comb += mshr_addr_matches_o .eq(0)
+        comb += mshr_index_matches_o.eq()
+
+        for i in range(NR_PORTS):
+            # check mshr for potential matching of other units,
+            # exclude the unit currently being served
+            with m.If (mshr_q.valid & \
+                    (mshr_addr_i[i][DCACHE_BYTE_OFFSET:56] == \
+                     mshr_q.addr[DCACHE_BYTE_OFFSET:56])):
+                comb += mshr_addr_matches_o[i].eq(1)
+
+            # same as previous, but checking only the index
+            with m.If (mshr_q.valid & \
+                    (mshr_addr_i[i][DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] == \
+                     mshr_q.addr[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH])):
+                mshr_index_matches_o[i].eq(1)
+
+        # --------------------
+        # Sequential Process
+        # --------------------
+
+        """
+        #pragma translate_off
+        `ifndef VERILATOR
+        # assert that cache only hits on one way
+        assert property (
+          @(posedge clk_i) $onehot0(evict_way_q)) else $warning("Evict-way should be one-hot encoded");
+        `endif
+        #pragma translate_on
+        """
+
+        # ----------------------
+        # Bypass Arbiter
+        # ----------------------
+        # Connection Arbiter <-> AXI
+        req_fsm_bypass_valid = Signal()
+        req_fsm_bypass_addr = Signal(64)
+        req_fsm_bypass_wdata = Signal(64)
+        req_fsm_bypass_we = Signal()
+        req_fsm_bypass_be = Signal(8)
+        req_fsm_bypass_size = Signal(2)
+        gnt_bypass_fsm = Signal()
+        valid_bypass_fsm = Signal()
+        data_bypass_fsm = Signal(64)
+        logic [$clog2(NR_PORTS)-1:0] id_fsm_bypass;
+        logic [3:0]                  id_bypass_fsm;
+        logic [3:0]                  gnt_id_bypass_fsm;
+
+        i_bypass_arbiter = ib = AXIArbiter( NR_PORTS, 64)
+        comb += [
+            # Master Side
+            ib.data_req_i     .eq( miss_req_valid & miss_req_bypass         ),
+            ib.address_i      .eq( miss_req_addr                            ),
+            ib.data_wdata_i   .eq( miss_req_wdata                           ),
+            ib.data_we_i      .eq( miss_req_we                              ),
+            ib.data_be_i      .eq( miss_req_be                              ),
+            ib.data_size_i    .eq( miss_req_size                            ),
+            ib.data_gnt_o     .eq( bypass_gnt_o                             ),
+            ib.data_rvalid_o  .eq( bypass_valid_o                           ),
+            ib.data_rdata_o   .eq( bypass_data_o                            ),
+            # Slave Side
+            ib.id_i           .eq( id_bypass_fsm[$clog2(NR_PORTS)-1:0]      ),
+            ib.id_o           .eq( id_fsm_bypass                            ),
+            ib.gnt_id_i       .eq( gnt_id_bypass_fsm[$clog2(NR_PORTS)-1:0]  ),
+            ib.address_o      .eq( req_fsm_bypass_addr                      ),
+            ib.data_wdata_o   .eq( req_fsm_bypass_wdata                     ),
+            ib.data_req_o     .eq( req_fsm_bypass_valid                     ),
+            ib.data_we_o      .eq( req_fsm_bypass_we                        ),
+            ib.data_be_o      .eq( req_fsm_bypass_be                        ),
+            ib.data_size_o    .eq( req_fsm_bypass_size                      ),
+            ib.data_gnt_i     .eq( gnt_bypass_fsm                           ),
+            ib.data_rvalid_i  .eq( valid_bypass_fsm                         ),
+            ib.data_rdata_i   .eq( data_bypass_fsm                          ),
+        ]
+
+        axi_adapter #(
+            .DATA_WIDTH            ( 64                 ),
+            .AXI_ID_WIDTH          ( 4                  ),
+            .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
+        ) i_bypass_axi_adapter (
+            .clk_i,
+            .rst_ni,
+            .req_i                 ( req_fsm_bypass_valid   ),
+            .type_i                ( ariane_axi::SINGLE_REQ ),
+            .gnt_o                 ( gnt_bypass_fsm         ),
+            .addr_i                ( req_fsm_bypass_addr    ),
+            .we_i                  ( req_fsm_bypass_we      ),
+            .wdata_i               ( req_fsm_bypass_wdata   ),
+            .be_i                  ( req_fsm_bypass_be      ),
+            .size_i                ( req_fsm_bypass_size    ),
+            .id_i                  ( Cat(id_fsm_bypass, 0, 0) ),
+            .valid_o               ( valid_bypass_fsm       ),
+            .rdata_o               ( data_bypass_fsm        ),
+            .gnt_id_o              ( gnt_id_bypass_fsm      ),
+            .id_o                  ( id_bypass_fsm          ),
+            .critical_word_o       (                        ), # not used for single requests
+            .critical_word_valid_o (                        ), # not used for single requests
+            .axi_req_o             ( axi_bypass_o           ),
+            .axi_resp_i            ( axi_bypass_i           )
+        );
+
+        # ----------------------
+        # Cache Line AXI Refill
+        # ----------------------
+        axi_adapter  #(
+            .DATA_WIDTH            ( DCACHE_LINE_WIDTH  ),
+            .AXI_ID_WIDTH          ( 4                  ),
+            .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
+        ) i_miss_axi_adapter (
+            .clk_i,
+            .rst_ni,
+            .req_i               ( req_fsm_miss_valid ),
+            .type_i              ( req_fsm_miss_req   ),
+            .gnt_o               ( gnt_miss_fsm       ),
+            .addr_i              ( req_fsm_miss_addr  ),
+            .we_i                ( req_fsm_miss_we    ),
+            .wdata_i             ( req_fsm_miss_wdata ),
+            .be_i                ( req_fsm_miss_be    ),
+            .size_i              ( req_fsm_miss_size  ),
+            .id_i                ( Const(0b1100, 4)   ),
+            .gnt_id_o            (                    ), # open
+            .valid_o             ( valid_miss_fsm     ),
+            .rdata_o             ( data_miss_fsm      ),
+            .id_o                (                    ),
+            .critical_word_o,
+            .critical_word_valid_o,
+            .axi_req_o           ( axi_data_o         ),
+            .axi_resp_i          ( axi_data_i         )
+        );
+
+        # -----------------
+        # Replacement LFSR
+        # -----------------
+        lfsr_8bit #(.WIDTH (DCACHE_SET_ASSOC)) i_lfsr (
+            .en_i           ( lfsr_enable ),
+            .refill_way_oh  ( lfsr_oh     ),
+            .refill_way_bin ( lfsr_bin    ),
+            .*
+        );
+
+        # -----------------
+        # AMO ALU
+        # -----------------
+        amo_alu i_amo_alu (
+            .amo_op_i        ( amo_op        ),
+            .amo_operand_a_i ( amo_operand_a ),
+            .amo_operand_b_i ( amo_operand_b ),
+            .amo_result_o    ( amo_result_o  )
+        );
+
+        # -----------------
+        # Struct Split
+        # -----------------
+
+        for i in range(NR_PORTS):
+            miss_req = MissReq()
+            comb += miss_req.eq(miss_req_i[i]);
+            comb += miss_req_valid  [i] .eq(miss_req.valid)
+            comb += miss_req_bypass [i] .eq(miss_req.bypass)
+            comb += miss_req_addr   [i] .eq(miss_req.addr)
+            comb += miss_req_wdata  [i] .eq(miss_req.wdata)
+            comb += miss_req_we     [i] .eq(miss_req.we)
+            comb += miss_req_be     [i] .eq(miss_req.be)
+            comb += miss_req_size   [i] .eq(miss_req.size)
+
+    # --------------
+    # AXI Arbiter
+    # --------------
+    #
+    # Description: Arbitrates access to AXI refill/bypass
+    #
+class AXIArbiter:
+    def __init__(self, NR_PORTS   = 3, DATA_WIDTH = 64):
+        self.NR_PORTS = NR_PORTS
+        self.DATA_WIDTH = DATA_WIDTH
+        self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
+        rst_ni = ResetSignal() # Asynchronous reset active low
+        # master ports
+        self.data_req_i = Signal(NR_PORTS)
+        self.address_i = Array(Signal(name="address_i", 64) \
+                                    for i in range(NR_PORTS))
+        self.data_wdata_i = Array(Signal(name="data_wdata_i", 64) \
+                                    for i in range(NR_PORTS))
+        self.data_we_i = Signal(NR_PORTS)
+        self.data_be_i = Array(Signal(name="data_wdata_i", DATA_WIDTH/8) \
+                                    for i in range(NR_PORTS))
+        self.data_size_i = Array(Signal(name="data_size_i", 2) \
+                                    for i in range(NR_PORTS))
+        self.data_gnt_o = Signal(NR_PORTS)
+        self.data_rvalid_o = Signal(NR_PORTS)
+        self.data_rdata_o = Array(Signal(name="data_rdata_o", 64) \
+                                    for i in range(NR_PORTS))
+
+        # slave port
+        self.id_i = Signal(pwid)
+        self.id_o = Signal(pwid)
+        self.gnt_id_i = Signal(pwid)
+        self.data_req_o = Signal()
+        self.address_o = Signal(64)
+        self.data_wdata_o = Signal(DATA_WIDTH)
+        self.data_we_o = Signal()
+        self.data_be_o = Signal(DATA_WIDTH/8)
+        self.data_size_o = Signal(2)
+        self.data_gnt_i = Signal()
+        self.data_rvalid_i = Signal()
+        self.data_rdata_i = Signal(DATA_WIDTH)
+
+    def elaborate(self, platform):
+        #enum logic [1:0] { IDLE, REQ, SERVING } state_d, state_q;
+
+        class Packet:
+            def __init__(self, pwid, DATA_WIDTH):
+                self.id = Signal(pwid)
+                self.address = Signal(64)
+                self.data = Signal(64)
+                self.size = Signal(2)
+                self.be = Signal(DATA_WIDTH/8)
+                self.we = Signal()
+
+        request_index = Signal(self.pwid)
+        req_q = Packet(self.pwid, self.DATA_WIDTH)
+        req_d = Packet(self.pwid, self.DATA_WIDTH)
+
+        # request register
+        sync += req_q.eq(req_d)
+
+        # request port
+        comb += self.address_o             .eq(req_q.address)
+        comb += self.data_wdata_o          .eq(req_q.data)
+        comb += self.data_be_o             .eq(req_q.be)
+        comb += self.data_size_o           .eq(req_q.size)
+        comb += self.data_we_o             .eq(req_q.we)
+        comb += self.id_o                  .eq(req_q.id)
+        comb += self.data_gnt_o            .eq(0)
+        # read port
+        comb += self.data_rvalid_o         .eq(0)
+        comb += self.data_rdata_o          .eq(0)
+        comb += self.data_rdata_o[req_q.id].eq(data_rdata_i)
+
+        m.submodules.pp = pp = PriorityEncoder(self.NR_PORTS)
+        comb += pp.i.eq(self.data_req_i) # select one request (priority-based)
+        comb += request_index.eq(pp.o)
+
+        with m.Switch("state") as s:
+
+            with m.Case("IDLE"):
+                # wait for incoming requests (priority encoder data_req_i)
+                with m.If(~pp.n): # one output valid from encoder
+                    comb += self.data_req_o   .eq(self.data_req_i[i])
+                    comb += self.data_gnt_o[i].eq(self.data_req_i[i])
+                    # save the request
+                    comb += req_d.address.eq(self.address_i[i])
+                    comb += req_d.id.eq(request_index)
+                    comb += req_d.data.eq(self.data_wdata_i[i])
+                    comb += req_d.size.eq(self.data_size_i[i])
+                    comb += req_d.be.eq(self.data_be_i[i])
+                    comb += req_d.we.eq(self.data_we_i[i])
+                    m.next = "SERVING"
+
+                comb += self.address_o    .eq(self.address_i[request_index])
+                comb += self.data_wdata_o .eq(self.data_wdata_i[request_index])
+                comb += self.data_be_o    .eq(self.data_be_i[request_index])
+                comb += self.data_size_o  .eq(self.data_size_i[request_index])
+                comb += self.data_we_o    .eq(self.data_we_i[request_index])
+                comb += self.id_o         .eq(request_index)
+
+            with m.Case("SERVING"):
+                comb += self.data_req_o.eq(1)
+                with m.If (self.data_rvalid_i):
+                    comb += self.data_rvalid_o[req_q.id].eq(1)
+                    m.next = "IDLE"
+
+        # ------------
+        # Assertions
+        # ------------
+
+        """
+#pragma translate_off
+`ifndef VERILATOR
+# make sure that we eventually get an rvalid after we received a grant
+assert property (@(posedge clk_i) data_gnt_i |-> ##[1:$] data_rvalid_i )
+    else begin $error("There was a grant without a rvalid"); $stop(); end
+# assert that there is no grant without a request
+assert property (@(negedge clk_i) data_gnt_i |-> data_req_o)
+    else begin $error("There was a grant without a request."); $stop(); end
+# assert that the address does not contain X when request is sent
+assert property ( @(posedge clk_i) (data_req_o) |-> (!$isunknown(address_o)) )
+  else begin $error("address contains X when request is set"); $stop(); end
+
+`endif
+#pragma translate_on
+        """
+
diff --git a/src/soc/TLB/ariane/mmu.py b/src/soc/TLB/ariane/mmu.py
new file mode 100644
index 00000000..a14862cd
--- /dev/null
+++ b/src/soc/TLB/ariane/mmu.py
@@ -0,0 +1,474 @@
+"""
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+# http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: Florian Zaruba, ETH Zurich
+# Date: 19/04/2017
+# Description: Memory Management Unit for Ariane, contains TLB and
+#              address translation unit. SV48 as defined in
+#              Volume II: RISC-V Privileged Architectures V1.10 Page 63
+
+import ariane_pkg::*;
+"""
+
+from nmigen import Const, Signal, Cat, Module, Mux
+from nmigen.cli import verilog, rtlil
+
+from ptw import DCacheReqI, DCacheReqO, TLBUpdate, PTE, PTW
+from tlb import TLB
+from exceptcause import (INSTR_ACCESS_FAULT, INSTR_PAGE_FAULT,
+                         LOAD_PAGE_FAULT, STORE_PAGE_FAULT)
+
+PRIV_LVL_M = Const(0b11, 2)  # machine privilege level
+PRIV_LVL_S = Const(0b01, 2)  # supervisor privilege level
+PRIV_LVL_U = Const(0b00, 2)  # user privilege level
+
+
+class RVException:
+    """Exception record bundling the cause, tval and valid signals."""
+    def __init__(self):
+        # cause of the exception
+        self.cause = Signal(64)
+        # more information about the exception, e.g. the instruction
+        # causing it, or the address of a LD/ST fault
+        self.tval = Signal(64)
+        self.valid = Signal()
+
+    def eq(self, inp):
+        # one assignment per field, pairing both records in port order
+        return [dst.eq(src) for dst, src in zip(self.ports(), inp.ports())]
+
+    def __iter__(self):
+        # this order is the port order that eq() and ports() rely on
+        yield from (self.cause, self.tval, self.valid)
+
+    def ports(self):
+        # flat list of the three signals, in iteration order
+        return list(self)
+
+
+class ICacheReqI:
+    """Record carrying fetch_valid, fetch_paddr and a fetch exception."""
+    def __init__(self):
+        self.fetch_valid = Signal()    # address translation valid
+        self.fetch_paddr = Signal(64)  # physical address in
+        # exception occurred during fetch
+        self.fetch_exception = RVException()
+
+    def __iter__(self):
+        yield from (self.fetch_valid, self.fetch_paddr, *self.fetch_exception)
+
+    def ports(self):
+        return list(self)
+
+
+class ICacheReqO:
+    """Record carrying a fetch request flag and its virtual address."""
+    def __init__(self):
+        self.fetch_req = Signal()      # address translation request
+        self.fetch_vaddr = Signal(64)  # virtual address out
+
+    def __iter__(self):
+        yield from (self.fetch_req, self.fetch_vaddr)
+
+    def ports(self):
+        return list(self)
+
+
+class MMU:
+    def __init__(self, instr_tlb_entries = 4,
+                       data_tlb_entries  = 4,
+                       asid_width        = 1):
+        self.instr_tlb_entries = instr_tlb_entries  # entry count of the ITLB
+        self.data_tlb_entries = data_tlb_entries    # entry count of the DTLB
+        self.asid_width = asid_width  # width of asid_i; passed to TLBs and PTW
+
+        self.flush_i = Signal()  # NOTE(review): not read by elaborate()
+        self.enable_translation_i = Signal() # enable VM translation for fetch
+        self.en_ld_st_translation_i = Signal() # enable VM translation for LD/ST
+        # IF interface
+        self.icache_areq_i = ICacheReqO()  # fetch request + virtual address
+        self.icache_areq_o = ICacheReqI()  # valid + physical addr + exception
+        # LSU interface
+        # this is a more minimalistic interface because the actual addressing
+        # logic is handled in the LSU as we distinguish load and stores,
+        # what we do here is simple address translation
+        self.misaligned_ex_i = RVException()
+        self.lsu_req_i = Signal()   # request address translation
+        self.lsu_vaddr_i = Signal(64) # virtual address in
+        self.lsu_is_store_i = Signal() # the translation is requested by a store
+        # if we need to walk the page table we can't grant in the same cycle
+
+        # Cycle 0
+        self.lsu_dtlb_hit_o = Signal() # sent in the same cycle as the request
+                                       # if translation hits in the DTLB
+        # Cycle 1 (these three are driven from the sync domain)
+        self.lsu_valid_o = Signal()  # translation is valid
+        self.lsu_paddr_o = Signal(64) # translated address
+        self.lsu_exception_o = RVException() # addr translate threw exception
+
+        # General control signals
+        self.priv_lvl_i = Signal(2)        # level checked for instr fetches
+        self.ld_st_priv_lvl_i = Signal(2)  # level checked for loads/stores
+        self.sum_i = Signal()  # when set, supervisor may access user pages
+        self.mxr_i = Signal()  # NOTE(review): not read by elaborate()
+        # input logic flag_mprv_i,
+        self.satp_ppn_i = Signal(44)  # NOTE(review): not read by elaborate()
+        self.asid_i = Signal(self.asid_width)  # lookup ASID for both TLBs
+        self.flush_tlb_i = Signal()  # flushes both the ITLB and the DTLB
+        # Performance counters (NOTE(review): never driven by elaborate())
+        self.itlb_miss_o = Signal()
+        self.dtlb_miss_o = Signal()
+        # PTW memory interface
+        self.req_port_i = DCacheReqO()
+        self.req_port_o = DCacheReqI()
+
+    def elaborate(self, platform):
+        m = Module()
+
+        iaccess_err = Signal()   # insufficient priv to access instr page
+        daccess_err = Signal()   # insufficient priv to access data page
+        ptw_active = Signal()    # PTW is currently walking a page table
+        walking_instr = Signal() # PTW is walking because of an ITLB miss
+        ptw_error = Signal()     # PTW threw an exception
+
+        update_vaddr = Signal(48)                 # guessed
+        uaddr64 = Cat(update_vaddr, Const(0, 16)) # 48+16: zero-extend to 64bit
+        update_ptw_itlb = TLBUpdate(self.asid_width)
+        update_ptw_dtlb = TLBUpdate(self.asid_width)
+
+        itlb_lu_access = Signal()
+        itlb_content = PTE()
+        itlb_is_2M = Signal()
+        itlb_is_1G = Signal()
+        itlb_is_512G = Signal()
+        itlb_lu_hit = Signal()
+
+        dtlb_lu_access = Signal()
+        dtlb_content = PTE()
+        dtlb_is_2M = Signal()
+        dtlb_is_1G = Signal()
+        dtlb_is_512G = Signal()
+        dtlb_lu_hit = Signal()
+
+        # Assignments
+        m.d.comb += [itlb_lu_access.eq(self.icache_areq_i.fetch_req),
+                     dtlb_lu_access.eq(self.lsu_req_i)
+                    ]
+
+        # ITLB
+        m.submodules.i_tlb = i_tlb = TLB(self.instr_tlb_entries,
+                                         self.asid_width)
+        m.d.comb += [i_tlb.flush_i.eq(self.flush_tlb_i),
+                     i_tlb.update_i.eq(update_ptw_itlb),
+                     i_tlb.lu_access_i.eq(itlb_lu_access),
+                     i_tlb.lu_asid_i.eq(self.asid_i),
+                     i_tlb.lu_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
+                     itlb_content.eq(i_tlb.lu_content_o),
+                     itlb_is_2M.eq(i_tlb.lu_is_2M_o),
+                     itlb_is_1G.eq(i_tlb.lu_is_1G_o),
+                     itlb_is_512G.eq(i_tlb.lu_is_512G_o),
+                     itlb_lu_hit.eq(i_tlb.lu_hit_o),
+                    ]
+
+        # DTLB
+        m.submodules.d_tlb = d_tlb = TLB(self.data_tlb_entries,
+                                         self.asid_width)
+        m.d.comb += [d_tlb.flush_i.eq(self.flush_tlb_i),
+                     d_tlb.update_i.eq(update_ptw_dtlb),
+                     d_tlb.lu_access_i.eq(dtlb_lu_access),
+                     d_tlb.lu_asid_i.eq(self.asid_i),
+                     d_tlb.lu_vaddr_i.eq(self.lsu_vaddr_i),
+                     dtlb_content.eq(d_tlb.lu_content_o),
+                     dtlb_is_2M.eq(d_tlb.lu_is_2M_o),
+                     dtlb_is_1G.eq(d_tlb.lu_is_1G_o),
+                     dtlb_is_512G.eq(d_tlb.lu_is_512G_o),
+                     dtlb_lu_hit.eq(d_tlb.lu_hit_o),
+                    ]
+
+        # PTW
+        m.submodules.ptw = ptw = PTW(self.asid_width)
+        m.d.comb += [ptw_active.eq(ptw.ptw_active_o),
+                     walking_instr.eq(ptw.walking_instr_o),
+                     ptw_error.eq(ptw.ptw_error_o),
+                     ptw.enable_translation_i.eq(self.enable_translation_i),
+
+                     update_vaddr.eq(ptw.update_vaddr_o),
+                     update_ptw_itlb.eq(ptw.itlb_update_o),
+                     update_ptw_dtlb.eq(ptw.dtlb_update_o),
+
+                     ptw.itlb_access_i.eq(itlb_lu_access),
+                     ptw.itlb_hit_i.eq(itlb_lu_hit),
+                     ptw.itlb_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
+
+                     ptw.dtlb_access_i.eq(dtlb_lu_access),
+                     ptw.dtlb_hit_i.eq(dtlb_lu_hit),
+                     ptw.dtlb_vaddr_i.eq(self.lsu_vaddr_i),
+
+                     ptw.req_port_i.eq(self.req_port_i),
+                     self.req_port_o.eq(ptw.req_port_o),
+                    ]
+
+        # ila_1 i_ila_1 (
+        #     .clk(clk_i), # input wire clk
+        #     .probe0({req_port_o.address_tag, req_port_o.address_index}),
+        #     .probe1(req_port_o.data_req), # input wire [63:0]  probe1
+        #     .probe2(req_port_i.data_gnt), # input wire [0:0]  probe2
+        #     .probe3(req_port_i.data_rdata), # input wire [0:0]  probe3
+        #     .probe4(req_port_i.data_rvalid), # input wire [0:0]  probe4
+        #     .probe5(ptw_error), # input wire [1:0]  probe5
+        #     .probe6(update_vaddr), # input wire [0:0]  probe6
+        #     .probe7(update_ptw_itlb.valid), # input wire [0:0]  probe7
+        #     .probe8(update_ptw_dtlb.valid), # input wire [0:0]  probe8
+        #     .probe9(dtlb_lu_access), # input wire [0:0]  probe9
+        #     .probe10(lsu_vaddr_i), # input wire [0:0]  probe10
+        #     .probe11(dtlb_lu_hit), # input wire [0:0]  probe11
+        #     .probe12(itlb_lu_access), # input wire [0:0]  probe12
+        #     .probe13(icache_areq_i.fetch_vaddr), # input wire [0:0]  probe13
+        #     .probe14(itlb_lu_hit) # input wire [0:0]  probe13
+        # );
+
+        #-----------------------
+        # Instruction Interface
+        #-----------------------
+        # The instruction interface is a simple request response interface
+
+        # MMU disabled: just pass through
+        m.d.comb += [self.icache_areq_o.fetch_valid.eq(
+                                                self.icache_areq_i.fetch_req),
+                     # play through in case we disabled address translation
+                     self.icache_areq_o.fetch_paddr.eq(
+                                                self.icache_areq_i.fetch_vaddr)
+                    ]
+        # two potential exception sources:
+        # 1. HPTW threw an exception -> signal with a page fault exception
+        # 2. We got an access error because of insufficient permissions ->
+        #    throw an access exception
+        m.d.comb += self.icache_areq_o.fetch_exception.valid.eq(0)
+        # Check whether we are allowed to access this memory region
+        # from a fetch perspective
+
+        # PLATEN TODO: use PermissionValidator instead [we like modules]
+        m.d.comb += iaccess_err.eq(self.icache_areq_i.fetch_req & \
+                                   (((self.priv_lvl_i == PRIV_LVL_U) & \
+                                      ~itlb_content.u) | \
+                                   ((self.priv_lvl_i == PRIV_LVL_S) & \
+                                    itlb_content.u)))
+
+        # MMU enabled: address from TLB, request delayed until hit.
+        # Error when TLB hit and no access right or TLB hit and
+        # translated address not valid (e.g.  AXI decode error),
+        # or when PTW performs walk due to ITLB miss and raises
+        # an error.
+        with m.If (self.enable_translation_i):
+            # we work with SV48, so if VM is enabled, check that all bits
+            # [63:47] are equal (each == is parenthesised: | binds tighter)
+            with m.If (self.icache_areq_i.fetch_req & \
+                ~(((~self.icache_areq_i.fetch_vaddr[47:64]) == 0) | \
+                 ((self.icache_areq_i.fetch_vaddr[47:64]) == 0))):
+                fe = self.icache_areq_o.fetch_exception
+                m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
+                             fe.tval.eq(self.icache_areq_i.fetch_vaddr),
+                             fe.valid.eq(1)
+                            ]
+
+            m.d.comb += self.icache_areq_o.fetch_valid.eq(0)
+
+            # 4K page
+            paddr = Signal.like(self.icache_areq_o.fetch_paddr)
+            paddr4k = Cat(self.icache_areq_i.fetch_vaddr[0:12],
+                          itlb_content.ppn)
+            m.d.comb += paddr.eq(paddr4k)
+            # Mega page
+            with m.If(itlb_is_2M):
+                m.d.comb += paddr[12:21].eq(
+                          self.icache_areq_i.fetch_vaddr[12:21])
+            # Giga page
+            with m.If(itlb_is_1G):
+                m.d.comb += paddr[12:30].eq(
+                          self.icache_areq_i.fetch_vaddr[12:30])
+            m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
+            # Tera page
+            with m.If(itlb_is_512G):
+                m.d.comb += paddr[12:39].eq(
+                          self.icache_areq_i.fetch_vaddr[12:39])
+            m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
+
+            # ---------
+            # ITLB Hit
+            # --------
+            # if we hit the ITLB output the request signal immediately
+            with m.If(itlb_lu_hit):
+                m.d.comb += self.icache_areq_o.fetch_valid.eq(
+                                          self.icache_areq_i.fetch_req)
+                # we got an access error
+                with m.If (iaccess_err):
+                    # report it (NOTE(review): cause is an *access* fault)
+                    fe = self.icache_areq_o.fetch_exception
+                    m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
+                                 fe.tval.eq(self.icache_areq_i.fetch_vaddr),
+                                 fe.valid.eq(1)
+                                ]
+            # ---------
+            # ITLB Miss
+            # ---------
+            # watch out for exceptions happening during walking the page table
+            with m.Elif(ptw_active & walking_instr):
+                m.d.comb += self.icache_areq_o.fetch_valid.eq(ptw_error)
+                fe = self.icache_areq_o.fetch_exception
+                m.d.comb += [fe.cause.eq(INSTR_PAGE_FAULT),
+                             fe.tval.eq(uaddr64),
+                             fe.valid.eq(1)
+                            ]
+
+        #-----------------------
+        # Data Interface
+        #-----------------------
+
+        lsu_vaddr = Signal(64)
+        dtlb_pte = PTE()
+        misaligned_ex = RVException()
+        lsu_req = Signal()
+        lsu_is_store = Signal()
+        dtlb_hit = Signal()
+        #dtlb_is_2M = Signal()
+        #dtlb_is_1G = Signal()
+        #dtlb_is_512 = Signal()
+
+        # check if we need to do translation or if we are always
+        # ready (e.g.: we are not translating anything)
+        m.d.comb += self.lsu_dtlb_hit_o.eq(Mux(self.en_ld_st_translation_i,
+                                          dtlb_lu_hit, 1))
+
+        # The data interface is simpler and only consists of a
+        # request/response interface
+        m.d.comb += [
+            # save request and DTLB response
+            lsu_vaddr.eq(self.lsu_vaddr_i),
+            lsu_req.eq(self.lsu_req_i),
+            misaligned_ex.eq(self.misaligned_ex_i),
+            dtlb_pte.eq(dtlb_content),
+            dtlb_hit.eq(dtlb_lu_hit),
+            lsu_is_store.eq(self.lsu_is_store_i),
+            #dtlb_is_2M.eq(dtlb_is_2M),
+            #dtlb_is_1G.eq(dtlb_is_1G),
+            ##dtlb_is_512.eq(self.dtlb_is_512G) #????
+        ]
+        m.d.sync += [
+            self.lsu_paddr_o.eq(lsu_vaddr),
+            self.lsu_valid_o.eq(lsu_req),
+            self.lsu_exception_o.eq(misaligned_ex),
+        ]
+
+        sverr = Signal()
+        usrerr = Signal()
+
+        m.d.comb += [
+            # mute misaligned exceptions if there is no request
+            # otherwise they will throw accidental exceptions
+            misaligned_ex.valid.eq(self.misaligned_ex_i.valid & self.lsu_req_i),
+
+            # SUM is not set and we are trying to access a user
+            # page in supervisor mode (== parenthesised: & binds tighter)
+            sverr.eq((self.ld_st_priv_lvl_i == PRIV_LVL_S) & ~self.sum_i & \
+                       dtlb_pte.u),
+            # this is not a user page but we are in user mode and
+            # trying to access it
+            usrerr.eq((self.ld_st_priv_lvl_i == PRIV_LVL_U) & ~dtlb_pte.u),
+
+            # Check if the User flag is set, then we may only
+            # access it in supervisor mode if SUM is enabled
+            daccess_err.eq(sverr | usrerr),
+            ]
+
+        # translation is enabled and no misaligned exception occurred
+        with m.If(self.en_ld_st_translation_i & ~misaligned_ex.valid):
+            m.d.comb += lsu_req.eq(0)
+            # 4K page (physical page number comes from the DTLB entry)
+            paddr = Signal.like(lsu_vaddr)
+            paddr4k = Cat(lsu_vaddr[0:12], dtlb_pte.ppn)
+            m.d.comb += paddr.eq(paddr4k)
+            # Mega page
+            with m.If(dtlb_is_2M):
+                m.d.comb += paddr[12:21].eq(lsu_vaddr[12:21])
+            # Giga page
+            with m.If(dtlb_is_1G):
+                m.d.comb += paddr[12:30].eq(lsu_vaddr[12:30])
+            m.d.sync += self.lsu_paddr_o.eq(paddr)
+            # TODO platen tera_page
+
+            # ---------
+            # DTLB Hit
+            # --------
+            with m.If(dtlb_hit & self.lsu_req_i):  # input: avoids comb loop
+                m.d.comb += lsu_req.eq(1)
+                # this is a store
+                with m.If (lsu_is_store):
+                    # check if the page is write-able and
+                    # we are not violating privileges
+                    # also check if the dirty flag is set
+                    with m.If(~dtlb_pte.w | daccess_err | ~dtlb_pte.d):
+                        le = self.lsu_exception_o
+                        m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
+                                     le.tval.eq(lsu_vaddr),
+                                     le.valid.eq(1)
+                                    ]
+
+                # this is a load, check for sufficient access
+                # privileges - throw a page fault if necessary
+                with m.Elif(daccess_err):
+                    le = self.lsu_exception_o
+                    m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
+                                 le.tval.eq(lsu_vaddr),
+                                 le.valid.eq(1)
+                                ]
+            # ---------
+            # DTLB Miss
+            # ---------
+            # watch out for exceptions
+            with m.Elif (ptw_active & ~walking_instr):
+                # page table walker threw an exception
+                with m.If (ptw_error):
+                    # an error makes the translation valid
+                    m.d.comb += lsu_req.eq(1)
+                    # the page table walker can only throw page faults
+                    le = self.lsu_exception_o  # shared by both branches
+                    with m.If (lsu_is_store):
+                        m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
+                                     le.tval.eq(uaddr64),
+                                     le.valid.eq(1)
+                                    ]
+                    with m.Else():
+                        m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
+                                     le.tval.eq(uaddr64),
+                                     le.valid.eq(1)
+                                    ]
+
+        return m
+
+    def ports(self):
+        # plain signals first, then each bundle's own ports, in fixed order
+        res = [self.flush_i, self.enable_translation_i,
+               self.en_ld_st_translation_i, self.lsu_req_i, self.lsu_vaddr_i,
+               self.lsu_is_store_i, self.lsu_dtlb_hit_o, self.lsu_valid_o,
+               self.lsu_paddr_o, self.priv_lvl_i, self.ld_st_priv_lvl_i,
+               self.sum_i, self.mxr_i, self.satp_ppn_i, self.asid_i,
+               self.flush_tlb_i, self.itlb_miss_o, self.dtlb_miss_o]
+        for grp in (self.icache_areq_i, self.icache_areq_o, self.req_port_i,
+                    self.req_port_o, self.misaligned_ex_i, self.lsu_exception_o):
+            res = res + grp.ports()
+        return res
+
+if __name__ == '__main__':
+    dut = MMU()
+    il_text = rtlil.convert(dut, ports=dut.ports())
+    with open("test_mmu.il", "w") as il_file:
+        il_file.write(il_text)
+
diff --git a/src/soc/TLB/ariane/p_lru.txt b/src/soc/TLB/ariane/p_lru.txt
new file mode 100644
index 00000000..4bac7680
--- /dev/null
+++ b/src/soc/TLB/ariane/p_lru.txt
@@ -0,0 +1,51 @@
+pseudo-LRU
+
+two-way set associative - one bit
+
+   indicates which line of the two has been referenced more recently
+
+
+four-way set associative - three bits
+
+   each bit represents one branch point in a binary decision tree; let 1
+   represent that the left side has been referenced more recently than the
+   right side, and 0 vice-versa
+
+              are all 4 lines valid?
+                   /       \
+                 yes        no, use an invalid line
+                  |
+                  |
+                  |
+             bit_0 == 0?            state | replace      ref to | next state
+              /       \             ------+--------      -------+-----------
+             y         n             00x  |  line_0      line_0 |    11_
+            /           \            01x  |  line_1      line_1 |    10_
+     bit_1 == 0?    bit_2 == 0?      1x0  |  line_2      line_2 |    0_1
+       /    \          /    \        1x1  |  line_3      line_3 |    0_0
+      y      n        y      n
+     /        \      /        \        ('x' means       ('_' means unchanged)
+   line_0  line_1  line_2  line_3      don't care)
+
+   (see Figure 3-7, p. 3-18, in Intel Embedded Pentium Processor Family Dev.
+    Manual, 1998, http://www.intel.com/design/intarch/manuals/273204.htm)
+
+
+note that there is a 6-bit encoding for true LRU for four-way set associative
+
+  bit 0: bank[1] more recently used than bank[0]
+  bit 1: bank[2] more recently used than bank[0]
+  bit 2: bank[2] more recently used than bank[1]
+  bit 3: bank[3] more recently used than bank[0]
+  bit 4: bank[3] more recently used than bank[1]
+  bit 5: bank[3] more recently used than bank[2]
+
+  this results in 24 valid bit patterns within the 64 possible bit patterns
+  (4! possible valid traces for bank references)
+
+  e.g., a trace of 0 1 2 3, where 0 is LRU and 3 is MRU, is encoded as 111111
+
+  you can implement a state machine with a 256x6 ROM (6-bit state encoding
+  appended with a 2-bit bank reference input will yield a new 6-bit state),
+  and you can implement an LRU bank indicator with a 64x2 ROM
+
diff --git a/src/soc/TLB/ariane/plru.py b/src/soc/TLB/ariane/plru.py
new file mode 100644
index 00000000..a8db5c27
--- /dev/null
+++ b/src/soc/TLB/ariane/plru.py
@@ -0,0 +1,105 @@
+from nmigen import Signal, Module, Cat, Const
+from nmigen.hdl.ir import Elaboratable
+from math import log2
+
+
+class PLRU(Elaboratable):
+    r""" PLRU - Pseudo Least Recently Used Replacement
+
+        PLRU-tree indexing:
+        lvl0        0
+                   / \
+                  /   \
+        lvl1     1     2
+                / \   / \
+        lvl2   3   4 5   6
+              / \ /\/\  /\
+             ... ... ... ...
+    """
+    def __init__(self, entries):
+        self.entries = entries  # plain Python int, not a Signal
+        self.lu_hit = Signal(entries)
+        self.replace_en_o = Signal(entries)
+        self.lu_access_i = Signal()
+        # Tree (bit per entry)
+        self.TLBSZ = 2*(self.entries-1)
+        self.plru_tree = Signal(self.TLBSZ)
+        self.plru_tree_o = Signal(self.TLBSZ)
+
+    def elaborate(self, platform=None):
+        m = Module()
+
+        # Just predefine which nodes will be set/cleared
+        # E.g. for a TLB with 8 entries, the for-loop is semantically
+        # equivalent to the following pseudo-code:
+        # unique case (1'b1)
+        # lu_hit[7]: plru_tree[0, 2, 6] = {1, 1, 1};
+        # lu_hit[6]: plru_tree[0, 2, 6] = {1, 1, 0};
+        # lu_hit[5]: plru_tree[0, 2, 5] = {1, 0, 1};
+        # lu_hit[4]: plru_tree[0, 2, 5] = {1, 0, 0};
+        # lu_hit[3]: plru_tree[0, 1, 4] = {0, 1, 1};
+        # lu_hit[2]: plru_tree[0, 1, 4] = {0, 1, 0};
+        # lu_hit[1]: plru_tree[0, 1, 3] = {0, 0, 1};
+        # lu_hit[0]: plru_tree[0, 1, 3] = {0, 0, 0};
+        # default: begin /* No hit */ end
+        # endcase
+        LOG_TLB = int(log2(self.entries))
+        print(LOG_TLB)
+        for i in range(self.entries):
+            # we got a hit so update the pointer as it was least recently used
+            hit = Signal(reset_less=True)
+            m.d.comb += hit.eq(self.lu_hit[i] & self.lu_access_i)
+            with m.If(hit):
+                # Set the nodes to the values we would expect
+                for lvl in range(LOG_TLB):
+                    idx_base = (1<<lvl)-1
+                    # lvl0 <=> MSB, lvl1 <=> MSB-1, ...
+                    shift = LOG_TLB - lvl
+                    # inverted index bit; "^ 1" keeps the constant >= 0
+                    new_idx = Const(((i >> (shift-1)) & 1) ^ 1, (1, False))
+                    plru_idx = idx_base + (i >> shift)
+                    print ("plru", i, lvl, hex(idx_base),
+                                  plru_idx, shift, new_idx)
+                    m.d.comb += self.plru_tree_o[plru_idx].eq(new_idx)
+        # Decode tree to write enable signals
+        # Next for-loop basically creates the following logic for e.g.
+        # an 8 entry TLB (note: pseudo-code obviously):
+        # replace_en[7] = &plru_tree[ 6, 2, 0]; #plru_tree[0,2,6]=={1,1,1}
+        # replace_en[6] = &plru_tree[~6, 2, 0]; #plru_tree[0,2,6]=={1,1,0}
+        # replace_en[5] = &plru_tree[ 5,~2, 0]; #plru_tree[0,2,5]=={1,0,1}
+        # replace_en[4] = &plru_tree[~5,~2, 0]; #plru_tree[0,2,5]=={1,0,0}
+        # replace_en[3] = &plru_tree[ 4, 1,~0]; #plru_tree[0,1,4]=={0,1,1}
+        # replace_en[2] = &plru_tree[~4, 1,~0]; #plru_tree[0,1,4]=={0,1,0}
+        # replace_en[1] = &plru_tree[ 3,~1,~0]; #plru_tree[0,1,3]=={0,0,1}
+        # replace_en[0] = &plru_tree[~3,~1,~0]; #plru_tree[0,1,3]=={0,0,0}
+        # For each entry traverse the tree. If every tree-node matches
+        # the corresponding bit of the entry's index, this is
+        # the next entry to replace.
+        replace = []
+        for i in range(self.entries):
+            en = []
+            for lvl in range(LOG_TLB):
+                idx_base = (1<<lvl)-1
+                # lvl0 <=> MSB, lvl1 <=> MSB-1, ...
+                shift = LOG_TLB - lvl
+                new_idx = (i >> (shift-1)) & 1
+                plru_idx = idx_base + (i>>shift)
+                plru = Signal(reset_less=True,
+                              name="plru-%d-%d-%d" % (i, lvl, plru_idx))
+                m.d.comb += plru.eq(self.plru_tree[plru_idx])
+                # en &= plru_tree_q[idx_base + (i>>shift)] == new_idx;
+                if new_idx:
+                    en.append(~plru) # yes inverted (using bool())
+                else:
+                    en.append(plru)  # yes inverted (using bool())
+            print ("plru", i, en)
+            # boolean logic manipulation:
+            # plru0 & plru1 & plru2 == ~(~plru0 | ~plru1 | ~plru2)
+            replace.append(~Cat(*en).bool())
+        m.d.comb += self.replace_en_o.eq(Cat(*replace))
+
+        return m
+
+    def ports(self):
+        # signals only: self.entries is a Python int, not a port
+        return [self.lu_hit, self.replace_en_o,
+                self.lu_access_i, self.plru_tree, self.plru_tree_o]
diff --git a/src/soc/TLB/ariane/ptw.py b/src/soc/TLB/ariane/ptw.py
new file mode 100644
index 00000000..4046c711
--- /dev/null
+++ b/src/soc/TLB/ariane/ptw.py
@@ -0,0 +1,556 @@
+"""
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+# http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: David Schaffenrath, TU Graz
+# Author: Florian Zaruba, ETH Zurich
+# Date: 24.4.2017
+# Description: Hardware-PTW
+
+/* verilator lint_off WIDTH */
+import ariane_pkg::*;
+
+see linux kernel source:
+
+* "arch/riscv/include/asm/page.h"
+* "arch/riscv/include/asm/mmu_context.h"
+* "arch/riscv/Kconfig" (CONFIG_PAGE_OFFSET)
+
+"""
+
+from nmigen import Const, Signal, Cat, Module, Elaboratable
+from nmigen.hdl.ast import ArrayProxy
+from nmigen.cli import verilog, rtlil
+from math import log2
+
+
+DCACHE_SET_ASSOC = 8
+CONFIG_L1D_SIZE =  32*1024
+DCACHE_INDEX_WIDTH = int(log2(CONFIG_L1D_SIZE / DCACHE_SET_ASSOC))
+DCACHE_TAG_WIDTH = 56 - DCACHE_INDEX_WIDTH
+
+ASID_WIDTH = 8
+
+
+class DCacheReqI:
+    def __init__(self):  # one Signal per request field; PTW holds one as req_port_o
+        self.address_index = Signal(DCACHE_INDEX_WIDTH)
+        self.address_tag = Signal(DCACHE_TAG_WIDTH)
+        self.data_wdata = Signal(64)
+        self.data_req = Signal()
+        self.data_we = Signal()
+        self.data_be = Signal(8)
+        self.data_size = Signal(2)
+        self.kill_req = Signal()
+        self.tag_valid = Signal()
+
+    def eq(self, inp):
+        # walk both ports() lists in step and return the list of
+        # per-field assignments "mine.eq(theirs)"
+        pairs = zip(self.ports(), inp.ports())
+        return [mine.eq(theirs) for (mine, theirs) in pairs]
+
+    def ports(self):
+        # every field, in declaration order
+        fields = [self.address_index, self.address_tag, self.data_wdata]
+        fields += [self.data_req, self.data_we, self.data_be]
+        fields += [self.data_size, self.kill_req, self.tag_valid]
+        return fields
+
+class DCacheReqO:
+    def __init__(self):  # response fields; PTW holds one as req_port_i
+        self.data_gnt = Signal()
+        self.data_rvalid = Signal()
+        self.data_rdata = Signal(64) # actually in PTE object format
+
+    def eq(self, inp):
+        # walk both ports() lists in step and return the list of
+        # per-field assignments "mine.eq(theirs)"
+        pairs = zip(self.ports(), inp.ports())
+        return [mine.eq(theirs) for (mine, theirs) in pairs]
+
+    def ports(self):
+        return [self.data_gnt, self.data_rvalid, self.data_rdata]
+
+
+class PTE: #(RecordObject):
+    def __init__(self):  # one Signal per field, declared LSB first
+        self.v = Signal()
+        self.r = Signal()
+        self.w = Signal()
+        self.x = Signal()
+        self.u = Signal()
+        self.g = Signal()
+        self.a = Signal()
+        self.d = Signal()
+        self.rsw = Signal(2)
+        self.ppn = Signal(44)
+        self.reserved = Signal(10)
+
+    def flatten(self):
+        return Cat(*self.ports())  # all fields as one value, LSB first
+
+    def eq(self, x):
+        if not isinstance(x, ArrayProxy):
+            # anything else is expected to offer flatten(): copy the
+            # packed bits across in one assignment
+            return self.flatten().eq(x.flatten())
+        # ArrayProxy: fetch each field through the proxy, looked up by
+        # the name of our own field signal, then pack them in ports()
+        # order so the bits line up with flatten()
+        picked = [getattr(x, field.name) for field in self.ports()]
+        return self.flatten().eq(Cat(*picked))
+
+    def __iter__(self):
+        """ yield the fields starting at bit 0: flatten() concatenates
+            them in exactly this order, so it has to stay LSB to MSB
+        """
+        # eight single-bit flags
+        # (v, r, w, x, u, g, a, d occupy bits 0 to 7)
+        low_flags = (self.v, self.r, self.w, self.x)
+        high_flags = (self.u, self.g, self.a, self.d)
+        yield from low_flags
+        yield from high_flags
+        # wider fields: 2, 44 and 10 bits (see __init__)
+        yield self.rsw
+        yield self.ppn
+        yield self.reserved
+
+    def ports(self):
+        return [*self]
+
+
+class TLBUpdate:
+    def __init__(self, asid_width):  # asid_width: bit-width of the asid field
+        self.valid = Signal()      # valid flag
+        self.is_2M = Signal()      # page-size flags: PTW raises exactly the
+        self.is_1G = Signal()      # one matching the level at which the
+        self.is_512G = Signal()    # walk currently stands
+        self.vpn = Signal(36)
+        self.asid = Signal(asid_width)
+        self.content = PTE()
+
+    def flatten(self):
+        return Cat(*self.ports())  # every field packed LSB first, in ports() order
+
+    def eq(self, x):
+        return self.flatten().eq(x.flatten())  # copies exactly what ports() lists
+
+    def ports(self):  # is_512G was missing here, so eq()/flatten() dropped it
+        return [self.valid, self.is_2M, self.is_1G, self.is_512G,
+                self.vpn, self.asid] + self.content.ports()
+
+
+# SV48 defines four levels of page tables
+LVL1 = Const(0, 2) # defined to 0 so that ptw_lvl default-resets to LVL1
+LVL2 = Const(1, 2)
+LVL3 = Const(2, 2)
+LVL4 = Const(3, 2)
+
+
+class PTW(Elaboratable):
+    def __init__(self, asid_width=8):  # asid_width: bit-width of asid_i and of the TLB-update asid fields
+        self.asid_width = asid_width
+
+        self.flush_i = Signal() # flush everything, we need to do this because
+        # actually everything we do is speculative at this stage
+        # e.g.: there could be a CSR instruction that changes everything
+        self.ptw_active_o = Signal(reset=1)    # active if not IDLE
+        self.walking_instr_o = Signal()        # set when walking for TLB
+        self.ptw_error_o = Signal()            # set when an error occurred
+        self.enable_translation_i = Signal()   # CSRs indicate to enable SV48
+        self.en_ld_st_translation_i = Signal() # enable VM translation for ld/st
+
+        self.lsu_is_store_i = Signal()       # translation triggered by store
+        # PTW memory interface
+        self.req_port_i = DCacheReqO()  # response side: grant, rvalid, rdata
+        self.req_port_o = DCacheReqI()  # request side driven by the walker
+
+        # to TLBs, update logic
+        self.itlb_update_o = TLBUpdate(asid_width)
+        self.dtlb_update_o = TLBUpdate(asid_width)
+
+        self.update_vaddr_o = Signal(48)  # registered from the latched vaddr in elaborate
+
+        self.asid_i = Signal(self.asid_width)
+        # from TLBs
+        # did we miss?
+        self.itlb_access_i = Signal()
+        self.itlb_hit_i = Signal()
+        self.itlb_vaddr_i = Signal(64)
+
+        self.dtlb_access_i = Signal()
+        self.dtlb_hit_i = Signal()
+        self.dtlb_vaddr_i = Signal(64)
+        # from CSR file
+        self.satp_ppn_i = Signal(44) # ppn from satp
+        self.mxr_i = Signal()  # lets an executable page count as readable in the data-side check (see lookup)
+        # Performance counters
+        self.itlb_miss_o = Signal()
+        self.dtlb_miss_o = Signal()
+
+    def ports(self):  # port list; passed to rtlil.convert by the __main__ section
+        return [self.ptw_active_o, self.walking_instr_o, self.ptw_error_o,
+                ]
+        return [  # NOTE(review): unreachable - the return above always wins; confirm which list is intended
+                self.enable_translation_i, self.en_ld_st_translation_i,
+                self.lsu_is_store_i, self.req_port_i, self.req_port_o,  # req_port_i/req_port_o are DCacheReqO/DCacheReqI objects, not Signals
+                self.update_vaddr_o,
+                self.asid_i,
+                self.itlb_access_i, self.itlb_hit_i, self.itlb_vaddr_i,
+                self.dtlb_access_i, self.dtlb_hit_i, self.dtlb_vaddr_i,
+                self.satp_ppn_i, self.mxr_i,
+                self.itlb_miss_o, self.dtlb_miss_o
+            ] + self.itlb_update_o.ports() + self.dtlb_update_o.ports()
+
+    def elaborate(self, platform):  # builds the walker: registers, default assignments and the FSM
+        m = Module()
+
+        # input registers
+        data_rvalid = Signal()
+        data_rdata = Signal(64)
+
+        # NOTE: pte decodes the incoming bit-field (data_rdata). data_rdata
+        # is spec'd in 64-bit binary-format: better to spec as Record?
+        pte = PTE()
+        m.d.comb += pte.flatten().eq(data_rdata)
+
+        # SV48 defines four levels of page tables
+        ptw_lvl = Signal(2) # default=0=LVL1 on reset (see above)
+        ptw_lvl1 = Signal()
+        ptw_lvl2 = Signal()
+        ptw_lvl3 = Signal()
+        ptw_lvl4 = Signal()
+        m.d.comb += [ptw_lvl1.eq(ptw_lvl == LVL1),
+                     ptw_lvl2.eq(ptw_lvl == LVL2),
+                     ptw_lvl3.eq(ptw_lvl == LVL3),
+                     ptw_lvl4.eq(ptw_lvl == LVL4)
+                     ]
+
+        # is this an instruction page table walk?
+        is_instr_ptw = Signal()
+        global_mapping = Signal()
+        # latched tag signal
+        tag_valid = Signal()
+        # register the ASID
+        tlb_update_asid = Signal(self.asid_width)
+        # register VPN we need to walk, SV48 defines a 48 bit virtual addr
+        vaddr = Signal(64)
+        # 4 byte aligned physical pointer
+        ptw_pptr = Signal(56)
+
+        end = DCACHE_INDEX_WIDTH + DCACHE_TAG_WIDTH  # upper bit bound of the tag slice of ptw_pptr
+        m.d.sync += [
+            # Assignments
+            self.update_vaddr_o.eq(vaddr),
+
+            self.walking_instr_o.eq(is_instr_ptw),
+            # directly output the correct physical address
+            self.req_port_o.address_index.eq(ptw_pptr[0:DCACHE_INDEX_WIDTH]),
+            self.req_port_o.address_tag.eq(ptw_pptr[DCACHE_INDEX_WIDTH:end]),
+            # we are never going to kill this request
+            self.req_port_o.kill_req.eq(0),              # XXX assign comb?
+            # we are never going to write with the HPTW
+            self.req_port_o.data_wdata.eq(Const(0, 64)), # XXX assign comb?
+            # -----------
+            # TLB Update
+            # -----------
+            self.itlb_update_o.vpn.eq(vaddr[12:48]),
+            self.dtlb_update_o.vpn.eq(vaddr[12:48]),
+            # update the correct page table level
+            self.itlb_update_o.is_2M.eq(ptw_lvl3),
+            self.itlb_update_o.is_1G.eq(ptw_lvl2),
+            self.itlb_update_o.is_512G.eq(ptw_lvl1),
+            self.dtlb_update_o.is_2M.eq(ptw_lvl3),
+            self.dtlb_update_o.is_1G.eq(ptw_lvl2),
+            self.dtlb_update_o.is_512G.eq(ptw_lvl1),
+            
+            # output the correct ASID
+            self.itlb_update_o.asid.eq(tlb_update_asid),
+            self.dtlb_update_o.asid.eq(tlb_update_asid),
+            # set the global mapping bit
+            self.itlb_update_o.content.eq(pte),
+            self.itlb_update_o.content.g.eq(global_mapping),  # listed after content.eq(pte) on purpose: this later assignment to g is the one that takes effect
+            self.dtlb_update_o.content.eq(pte),
+            self.dtlb_update_o.content.g.eq(global_mapping),  # same ordering dependency as for the itlb above
+
+            self.req_port_o.tag_valid.eq(tag_valid),
+        ]
+
+        #-------------------
+        # Page table walker   #needs update
+        #-------------------
+        # A virtual address va is translated into a physical address pa as
+        # follows:
+        # 1. Let a be sptbr.ppn * PAGESIZE, and let i = LEVELS-1. (For Sv48,
+        #    PAGESIZE=2^12 and LEVELS=4.)
+        # 2. Let pte be the value of the PTE at address a+va.vpn[i]*PTESIZE.
+        #    (For Sv48, PTESIZE=8.)
+        # 3. If pte.v = 0, or if pte.r = 0 and pte.w = 1, stop and raise an
+        #    access exception.
+        # 4. Otherwise, the PTE is valid. If pte.r = 1 or pte.x = 1, go to
+        #    step 5.  Otherwise, this PTE is a pointer to the next level of
+        #    the page table.
+        #    Let i=i-1. If i < 0, stop and raise an access exception.
+        #    Otherwise, let a = pte.ppn * PAGESIZE and go to step 2.
+        # 5. A leaf PTE has been found. Determine if the requested memory
+        #    access is allowed by the pte.r, pte.w, and pte.x bits. If not,
+        #    stop and raise an access exception. Otherwise, the translation is
+        #    successful.  Set pte.a to 1, and, if the memory access is a
+        #    store, set pte.d to 1.
+        #    The translated physical address is given as follows:
+        #      - pa.pgoff = va.pgoff.
+        #      - If i > 0, then this is a superpage translation and
+        #        pa.ppn[i-1:0] = va.vpn[i-1:0].
+        #      - pa.ppn[LEVELS-1:i] = pte.ppn[LEVELS-1:i].
+        # 6. If i > 0 and pa.ppn[i-1:0] != 0, this is a misaligned
+        #    superpage: stop and raise a page-fault exception.
+
+        m.d.sync += tag_valid.eq(0)  # default; grant() overrides it with 1 when the request is granted
+
+        # default assignments
+        m.d.comb += [
+            # PTW memory interface
+            self.req_port_o.data_req.eq(0),
+            self.req_port_o.data_be.eq(Const(0xFF, 8)),
+            self.req_port_o.data_size.eq(Const(0b11, 2)),
+            self.req_port_o.data_we.eq(0),
+            self.ptw_error_o.eq(0),
+            self.itlb_update_o.valid.eq(0),
+            self.dtlb_update_o.valid.eq(0),
+
+            self.itlb_miss_o.eq(0),
+            self.dtlb_miss_o.eq(0),
+        ]
+
+        # ------------
+        # State Machine
+        # ------------
+
+        with m.FSM() as fsm:
+
+            with m.State("IDLE"):
+                self.idle(m, is_instr_ptw, ptw_lvl, global_mapping,
+                          ptw_pptr, vaddr, tlb_update_asid)
+
+            with m.State("WAIT_GRANT"):
+                self.grant(m, tag_valid, data_rvalid)
+
+            with m.State("PTE_LOOKUP"):
+                # we wait for the valid signal
+                with m.If(data_rvalid):
+                    self.lookup(m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4,
+                                data_rvalid, global_mapping,
+                                is_instr_ptw, ptw_pptr)
+
+            # Propagate error to MMU/LSU
+            with m.State("PROPAGATE_ERROR"):
+                m.next = "IDLE"
+                m.d.comb += self.ptw_error_o.eq(1)
+
+            # wait for the rvalid before going back to IDLE
+            with m.State("WAIT_RVALID"):
+                with m.If(data_rvalid):
+                    m.next = "IDLE"
+
+        m.d.sync += [data_rdata.eq(self.req_port_i.data_rdata),  # register the memory response: the FSM sees it one clock later
+                     data_rvalid.eq(self.req_port_i.data_rvalid)
+                    ]
+
+        return m
+
+    def set_grant_state(self, m):
+        # choose the FSM state that follows setting up a memory request:
+        # a flush diverts to WAIT_RVALID (grant seen) or IDLE (no grant)
+        flushed, granted = self.flush_i, self.req_port_i.data_gnt
+        with m.If(flushed & granted):
+            m.next = "WAIT_RVALID"
+        with m.Elif(flushed):
+            m.next = "IDLE"
+        with m.Else():
+            m.next = "WAIT_GRANT"
+
+    def idle(self, m, is_instr_ptw, ptw_lvl, global_mapping,
+                          ptw_pptr, vaddr, tlb_update_asid):
+        # IDLE state body: reset the walk registers, then start a walk on a miss
+        m.d.sync += [is_instr_ptw.eq(0),
+                     ptw_lvl.eq(LVL1),
+                     global_mapping.eq(0),
+                     self.ptw_active_o.eq(0), # deactivate (IDLE); NOTE(review): no visible code sets it back to 1 - confirm
+                    ]
+        # work out itlb/dtlb miss (an itlb miss only counts while no dtlb access is pending)
+        m.d.comb += self.itlb_miss_o.eq(self.enable_translation_i & \
+                                 self.itlb_access_i & \
+                                 ~self.itlb_hit_i & \
+                                 ~self.dtlb_access_i)
+        m.d.comb += self.dtlb_miss_o.eq(self.en_ld_st_translation_i & \
+                                        self.dtlb_access_i & \
+                                        ~self.dtlb_hit_i)
+        # we got an ITLB miss?
+        with m.If(self.itlb_miss_o):
+            pptr = Cat(Const(0, 3), self.itlb_vaddr_i[39:48],  # root index is VPN[3] = vaddr[39:48]: 3+9+44 bits fill the 56-bit ptw_pptr
+                       self.satp_ppn_i)
+            m.d.sync += [ptw_pptr.eq(pptr),
+                         is_instr_ptw.eq(1),
+                         vaddr.eq(self.itlb_vaddr_i),
+                         tlb_update_asid.eq(self.asid_i),
+                        ]
+            self.set_grant_state(m)
+
+        # we got a DTLB miss?
+        with m.Elif(self.dtlb_miss_o):
+            pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[39:48],  # same root-level index as above, taken from the data address
+                       self.satp_ppn_i)
+            m.d.sync += [ptw_pptr.eq(pptr),
+                         vaddr.eq(self.dtlb_vaddr_i),
+                         tlb_update_asid.eq(self.asid_i),
+                        ]
+            self.set_grant_state(m)
+
+    def grant(self, m, tag_valid, data_rvalid):
+        # WAIT_GRANT state body: hold the request up until it is granted
+        granted = self.req_port_i.data_gnt
+        flushed = self.flush_i
+
+        # keep requesting for as long as we sit in this state
+        m.d.comb += self.req_port_o.data_req.eq(1)
+
+        with m.If(granted):
+            # tag_valid is registered, so it shows up one cycle later
+            m.d.sync += tag_valid.eq(1)
+        # the state only moves on once the grant has arrived
+        with m.If(granted & ~flushed):
+            m.next = "PTE_LOOKUP"
+        # granted but flushed: wait for an outstanding rvalid first, if any
+        with m.Elif(granted & ~data_rvalid):
+            m.next = "WAIT_RVALID"
+        with m.Elif(granted):
+            m.next = "IDLE"
+
+    def lookup(self, m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4,
+                            data_rvalid, global_mapping,
+                            is_instr_ptw, ptw_pptr):
+        # PTE_LOOKUP state body (entered once data_rvalid is set): classify the PTE
+        pte_rx = Signal(reset_less=True)   # leaf PTE: readable or executable
+        pte_exe = Signal(reset_less=True)  # not executable, or access flag clear
+        pte_inv = Signal(reset_less=True)  # invalid encoding (step 3 of the algorithm)
+        pte_a = Signal(reset_less=True)    # accessed and readable (or executable with mxr)
+        st_wd = Signal(reset_less=True)    # store to a non-writable or non-dirty page
+        m.d.comb += [pte_rx.eq(pte.r | pte.x),
+                    pte_exe.eq(~pte.x | ~pte.a),
+                    pte_inv.eq(~pte.v | (~pte.r & pte.w)),
+                    pte_a.eq(pte.a & (pte.r | (pte.x & self.mxr_i))),
+                    st_wd.eq(self.lsu_is_store_i & (~pte.w | ~pte.d))]
+
+        l1err = Signal(reset_less=True)  # misaligned superpage at level 1, 2, 3:
+        l2err = Signal(reset_less=True)  # the low 27 / 18 / 9 ppn bits must be zero.
+        l3err = Signal(reset_less=True)  # "!=" is parenthesised: "&" binds tighter than "!=" in Python
+        m.d.comb += [l3err.eq(ptw_lvl3 & (pte.ppn[0:9] != Const(0, 9))),
+                     l2err.eq(ptw_lvl2 & (pte.ppn[0:18] != Const(0, 18))),
+                     l1err.eq(ptw_lvl1 & (pte.ppn[0:27] != Const(0, 27)))]
+
+        # check if the global mapping bit is set
+        with m.If (pte.g):
+            m.d.sync += global_mapping.eq(1)
+
+        m.next = "IDLE"  # default target; the branches below override it where needed
+
+        # -------------
+        # Invalid PTE
+        # -------------
+        # If pte.v = 0, or if pte.r = 0 and pte.w = 1,
+        # stop and raise a page-fault exception.
+        with m.If (pte_inv):
+            m.next = "PROPAGATE_ERROR"
+
+        # -----------
+        # Valid PTE
+        # -----------
+
+        # it is a valid PTE
+        # if pte.r = 1 or pte.x = 1 it is a valid PTE
+        with m.Elif (pte_rx):
+            # Valid translation found (either 1G, 2M or 4K)
+            with m.If(is_instr_ptw):
+                # ------------
+                # Update ITLB
+                # ------------
+                # If page not executable, we can directly raise error.
+                # This doesn't put a useless entry into the TLB.
+                # The same idea applies to the access flag since we let
+                # the access flag be managed by SW.
+                with m.If (pte_exe):
+                    m.next = "PROPAGATE_ERROR"  # was "IDLE", which dropped the fault silently
+                with m.Else():
+                    m.d.comb += self.itlb_update_o.valid.eq(1)
+
+            with m.Else():
+                # ------------
+                # Update DTLB
+                # ------------
+                # Check if the access flag has been set, otherwise
+                # throw page-fault and let software handle those bits.
+                # If page not readable (there are no write-only pages)
+                # directly raise an error. This doesn't put a useless
+                # entry into the TLB.
+                with m.If(pte_a):
+                    m.d.comb += self.dtlb_update_o.valid.eq(1)
+                with m.Else():
+                    m.next = "PROPAGATE_ERROR"
+                # Request is a store: perform additional checks
+                # If the request was a store and the page not
+                # write-able, raise an error
+                # the same applies if the dirty flag is not set
+                with m.If (st_wd):
+                    m.d.comb += self.dtlb_update_o.valid.eq(0)
+                    m.next = "PROPAGATE_ERROR"
+
+            # check if the ppn is correctly aligned: Case (6)
+            with m.If(l1err | l2err | l3err):
+                m.next = "PROPAGATE_ERROR"
+                m.d.comb += [self.dtlb_update_o.valid.eq(0),
+                             self.itlb_update_o.valid.eq(0)]
+
+        # this is a pointer to the next TLB level
+        with m.Else():
+            # pointer to next level of page table
+            with m.If (ptw_lvl1):
+                # we are in the second level now
+                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:39], pte.ppn)  # NOTE(review): dtlb_vaddr_i is used even during an instruction walk - confirm the latched vaddr is not meant
+                m.d.sync += [ptw_pptr.eq(pptr),
+                             ptw_lvl.eq(LVL2)
+                            ]
+            with m.If(ptw_lvl2):
+                # here we received a pointer to the third level
+                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[21:30], pte.ppn)
+                m.d.sync += [ptw_pptr.eq(pptr),
+                             ptw_lvl.eq(LVL3)
+                            ]
+            with m.If(ptw_lvl3): #guess: shift page levels by one
+                # here we received a pointer to the fourth level
+                # the last one is near the page offset
+                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[12:21], pte.ppn)
+                m.d.sync += [ptw_pptr.eq(pptr),
+                             ptw_lvl.eq(LVL4)
+                            ]
+            self.set_grant_state(m)  # request the next-level PTE (or honour a flush)
+
+            with m.If (ptw_lvl4):
+                # Should already be the last level
+                # page table => Error
+                m.d.sync += ptw_lvl.eq(LVL4)
+                m.next = "PROPAGATE_ERROR"  # overrides the state picked by set_grant_state above
+
+
+if __name__ == '__main__':  # script entry: dump the walker as RTLIL
+    ptw = PTW()
+    vl = rtlil.convert(ptw, ports=ptw.ports())  # ports() currently returns only the three status outputs
+    with open("test_ptw.il", "w") as f:
+        f.write(vl)
diff --git a/src/soc/TLB/ariane/test/test_plru.py b/src/soc/TLB/ariane/test/test_plru.py
new file mode 100644
index 00000000..68dcfa58
--- /dev/null
+++ b/src/soc/TLB/ariane/test/test_plru.py
@@ -0,0 +1,15 @@
+import sys
+sys.path.append("../src")
+sys.path.append("../../../TestUtil")
+
+from TLB.ariane.plru import PLRU
+
+from nmigen.compat.sim import run_simulation
+
+def tbench(dut):  # placeholder stimulus: a single bare yield, no signals driven, nothing checked
+    yield
+
+if __name__ == "__main__":
+    dut = PLRU(4)  # NOTE(review): 4 is presumably the number of entries - confirm against PLRU.__init__
+    run_simulation(dut, tbench(dut), vcd_name="test_plru.vcd")
+    print("PLRU Unit Test Success")  # printed unconditionally: tbench makes no assertions
diff --git a/src/soc/TLB/ariane/test/test_ptw.py b/src/soc/TLB/ariane/test/test_ptw.py
new file mode 100644
index 00000000..b5deb28b
--- /dev/null
+++ b/src/soc/TLB/ariane/test/test_ptw.py
@@ -0,0 +1,130 @@
+import sys
+sys.path.append("../src")
+sys.path.append("../../../TestUtil")
+
+from nmigen.compat.sim import run_simulation
+
+from TLB.ariane.ptw import PTW, PTE
+
+# unit was changed, test needs to be changed
+
+def tbench(dut):  # drives stimulus into the PTW for waveform inspection; contains no assertions
+
+    addr = 0x8000000  # not read before being reassigned further down
+
+    #pte = PTE()
+    #yield pte.v.eq(1)
+    #yield pte.r.eq(1)
+
+    yield dut.req_port_i.data_gnt.eq(1)
+    yield dut.req_port_i.data_rvalid.eq(1)
+    yield dut.req_port_i.data_rdata.eq(0x43)# bits 0, 1 and 6 set: v, r and a in PTE field order
+
+    # data lookup
+    yield dut.en_ld_st_translation_i.eq(1)
+    yield dut.asid_i.eq(1)
+
+    yield dut.dtlb_access_i.eq(1)
+    yield dut.dtlb_hit_i.eq(0)
+    yield dut.dtlb_vaddr_i.eq(0x400000000)
+
+    yield
+    yield
+    yield
+
+    yield dut.dtlb_access_i.eq(1)
+    yield dut.dtlb_hit_i.eq(0)
+    yield dut.dtlb_vaddr_i.eq(0x200000)
+
+    yield
+    yield
+    yield
+
+    yield dut.req_port_i.data_gnt.eq(0)  # withhold the grant for one step
+    yield dut.dtlb_access_i.eq(1)
+    yield dut.dtlb_hit_i.eq(0)
+    yield dut.dtlb_vaddr_i.eq(0x400000011)
+
+    yield
+    yield dut.req_port_i.data_gnt.eq(1)
+    yield
+    yield
+
+    # data lookup, PTW levels 1-2-3
+    addr = 0x4000000
+    yield dut.dtlb_vaddr_i.eq(addr)
+    yield dut.mxr_i.eq(0x1)
+    yield dut.req_port_i.data_gnt.eq(1)
+    yield dut.req_port_i.data_rvalid.eq(1)
+    yield dut.req_port_i.data_rdata.eq(0x41 | (addr>>12)<<10)# v and a set, r and x clear (a pointer PTE); ppn field = addr>>12
+
+    yield dut.en_ld_st_translation_i.eq(1)
+    yield dut.asid_i.eq(1)
+
+    yield dut.dtlb_access_i.eq(1)
+    yield dut.dtlb_hit_i.eq(0)
+    yield dut.dtlb_vaddr_i.eq(addr)
+
+    yield
+    yield
+    yield
+    yield
+    yield
+    yield
+    yield
+    yield
+
+    yield dut.req_port_i.data_gnt.eq(0)
+    yield dut.dtlb_access_i.eq(1)
+    yield dut.dtlb_hit_i.eq(0)
+    yield dut.dtlb_vaddr_i.eq(0x400000011)
+
+    yield
+    yield dut.req_port_i.data_gnt.eq(1)
+    yield
+    yield
+    yield
+    yield
+
+
+    # instruction lookup
+    yield dut.en_ld_st_translation_i.eq(0)
+    yield dut.enable_translation_i.eq(1)
+    yield dut.asid_i.eq(1)
+
+    yield dut.itlb_access_i.eq(1)  # NOTE(review): dtlb_access_i is still 1 here, and PTW.idle masks itlb_miss_o with ~dtlb_access_i - confirm an instruction walk really starts
+    yield dut.itlb_hit_i.eq(0)
+    yield dut.itlb_vaddr_i.eq(0x800000)
+
+    yield
+    yield
+    yield
+
+    yield dut.itlb_access_i.eq(1)
+    yield dut.itlb_hit_i.eq(0)
+    yield dut.itlb_vaddr_i.eq(0x200000)
+
+    yield
+    yield
+    yield
+
+    yield dut.req_port_i.data_gnt.eq(0)
+    yield dut.itlb_access_i.eq(1)
+    yield dut.itlb_hit_i.eq(0)
+    yield dut.itlb_vaddr_i.eq(0x800011)
+
+    yield
+    yield dut.req_port_i.data_gnt.eq(1)
+    yield
+    yield
+
+    yield
+
+
+def test_ptw():  # runs the stimulus above against a default PTW and writes test_ptw.vcd
+    dut = PTW()
+    run_simulation(dut, tbench(dut), vcd_name="test_ptw.vcd")
+    print("PTW Unit Test Success")  # printed unconditionally: tbench makes no assertions
+
+if __name__ == "__main__":  # allow running this file directly as a script
+    test_ptw()
diff --git a/src/soc/TLB/ariane/test/test_tlb.py b/src/soc/TLB/ariane/test/test_tlb.py
new file mode 100644
index 00000000..b94438ff
--- /dev/null
+++ b/src/soc/TLB/ariane/test/test_tlb.py
@@ -0,0 +1,70 @@
+import sys
+sys.path.append("../src")
+sys.path.append("../../../TestUtil")
+
+from nmigen.compat.sim import run_simulation
+
+from TLB.ariane.tlb import TLB
+
+def set_vaddr(addr):  # NOTE(review): uses the module-level "dut", which only exists when this file runs as a script
+    yield dut.lu_vaddr_i.eq(addr)
+    yield dut.update_i.vpn.eq(addr>>12)  # same address with the low 12 bits shifted out
+
+
+def tbench(dut):  # drives lookup/update stimulus into the TLB for waveform inspection; no assertions
+    yield dut.lu_access_i.eq(1)
+    yield dut.lu_asid_i.eq(1)
+    yield dut.update_i.valid.eq(1)
+    yield dut.update_i.is_1G.eq(0)
+    yield dut.update_i.is_2M.eq(0)
+    yield dut.update_i.asid.eq(1)
+    yield dut.update_i.content.ppn.eq(0)
+    yield dut.update_i.content.rsw.eq(0)
+    yield dut.update_i.content.r.eq(1)
+
+    yield
+
+    addr = 0x80000
+    yield from set_vaddr(addr)
+    yield
+
+    addr = 0x90001
+    yield from set_vaddr(addr)
+    yield
+
+    addr = 0x28000000
+    yield from set_vaddr(addr)
+    yield
+
+    addr = 0x28000001
+    yield from set_vaddr(addr)  # NOTE(review): no bare yield before the same address is set again below - confirm that is intended
+
+    addr = 0x28000001
+    yield from set_vaddr(addr)
+    yield
+
+    addr = 0x1000040000
+    yield from set_vaddr(addr)
+    yield
+
+    addr = 0x1000040001
+    yield from set_vaddr(addr)
+    yield
+
+    yield dut.update_i.is_1G.eq(1)  # from here on the updates are flagged as 1G pages
+    addr = 0x2040000
+    yield from set_vaddr(addr)
+    yield
+
+    yield dut.update_i.is_1G.eq(1)
+    addr = 0x2040001
+    yield from set_vaddr(addr)
+    yield
+
+    yield
+
+
+if __name__ == "__main__":
+    dut = TLB()  # module-level name: set_vaddr() reads it as a global
+    run_simulation(dut, tbench(dut), vcd_name="test_tlb.vcd")
+    print("TLB Unit Test Success")  # printed unconditionally: tbench makes no assertions
diff --git a/src/soc/TLB/ariane/test/test_tlb_content.py b/src/soc/TLB/ariane/test/test_tlb_content.py
new file mode 100644
index 00000000..145ded7d
--- /dev/null
+++ b/src/soc/TLB/ariane/test/test_tlb_content.py
@@ -0,0 +1,63 @@
+import sys
+sys.path.append("../src")
+sys.path.append("../../../TestUtil")
+
+from nmigen.compat.sim import run_simulation
+
+from TLB.ariane.tlb_content import TLBContent
+from TestUtil.test_helper import assert_op, assert_eq
+
+def update(dut,a,t,g,m):  # a: vpn value; t, g, m: values for the is_512G, is_1G and is_2M flags
+    yield dut.replace_en_i.eq(1)
+    yield dut.update_i.valid.eq(1)
+    yield dut.update_i.is_512G.eq(t)
+    yield dut.update_i.is_1G.eq(g)
+    yield dut.update_i.is_2M.eq(m)
+    yield dut.update_i.vpn.eq(a)
+    yield  # two bare yields before the caller samples the outputs
+    yield
+
+def check_hit(dut,hit,pagesize):
+    # the overall hit flag must match the expectation
+    observed = yield dut.lu_hit_o
+    assert_eq("hit", observed, hit)
+    if not hit:
+        return
+    # on a hit, the flag for the expected page size must be raised too
+    # ("t"=512G, "g"=1G, "m"=2M; anything else: no page-size check)
+    flags = {"t": "lu_is_512G_o", "g": "lu_is_1G_o", "m": "lu_is_2M_o"}
+    name = flags.get(pagesize)
+    if name is None:
+        return
+    flag = yield getattr(dut, name)
+    assert_eq(name, flag, 1)
+
+def addr(a,b,c,d):  # pack four fields into one value at bit offsets 0, 9, 18 and 27
+    return (d << 27) | (c << 18) | (b << 9) | a
+    
+def tbench(dut):  # writes entries of each page size and checks the reported hit and size flag
+    yield dut.vpn0.eq(0x0A)
+    yield dut.vpn1.eq(0x0B)
+    yield dut.vpn2.eq(0x0C)
+    yield dut.vpn3.eq(0x0D)
+    yield from update(dut,addr(0xFF,0xFF,0xFF,0x0D),1,0,0)  # only the top field equals vpn3; flagged 512G
+    yield from check_hit(dut,1,"t")
+    
+    yield from update(dut,addr(0xFF,0xFF,0x0C,0x0D),0,1,0)  # top two fields equal vpn3/vpn2; flagged 1G
+    yield from check_hit(dut,1,"g")
+    
+    yield from update(dut,addr(0xFF,0x0B,0x0C,0x0D),0,0,1)  # top three fields match; flagged 2M
+    yield from check_hit(dut,1,"m")
+    
+    yield from update(dut,addr(0x0A,0x0B,0x0C,0x0D),0,0,0)  # all four fields match; no size flag
+    yield from check_hit(dut,1,"")
+
+    yield from update(dut,addr(0xAA,0xBB,0xCC,0xDD),0,0,0)  # no field matches: a miss is expected
+    yield from check_hit(dut,0,"miss")
+    
+
+if __name__ == "__main__":
+    dut = TLBContent(4,4)  # NOTE(review): meaning of the two arguments is not visible here - see TLBContent.__init__
+    #
+    run_simulation(dut, tbench(dut), vcd_name="test_tlb_content.vcd")
+    print("TLBContent Unit Test Success")  # reached only if no assert_eq in tbench raised
diff --git a/src/soc/TLB/ariane/tlb.py b/src/soc/TLB/ariane/tlb.py
new file mode 100644
index 00000000..cf4af57a
--- /dev/null
+++ b/src/soc/TLB/ariane/tlb.py
@@ -0,0 +1,175 @@
+"""
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+# http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: David Schaffenrath, TU Graz
+# Author: Florian Zaruba, ETH Zurich
+# Date: 21.4.2017
+# Description: Translation Lookaside Buffer, SV48
+#              fully set-associative
+
+Implementation in c++:
+https://raw.githubusercontent.com/Tony-Hu/TreePLRU/master/TreePLRU.cpp
+
+Text description:
+https://people.cs.clemson.edu/~mark/464/p_lru.txt
+
+Online simulator:
+http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/vm.html
+"""
+from math import log2
+from nmigen import Signal, Module, Cat, Const, Array, Elaboratable
+from nmigen.cli import verilog, rtlil
+from nmigen.lib.coding import Encoder
+
+from TLB.ariane.ptw import TLBUpdate, PTE, ASID_WIDTH
+from TLB.ariane.plru import PLRU
+from TLB.ariane.tlb_content import TLBContent
+
+TLB_ENTRIES = 8
+
+class TLB(Elaboratable):
+    def __init__(self, tlb_entries=8, asid_width=8):
+        self.tlb_entries = tlb_entries
+        self.asid_width = asid_width
+
+        self.flush_i = Signal()  # Flush signal
+        # Lookup signals
+        self.lu_access_i = Signal()
+        self.lu_asid_i = Signal(self.asid_width)
+        self.lu_vaddr_i = Signal(64)
+        self.lu_content_o = PTE()
+        self.lu_is_2M_o = Signal()
+        self.lu_is_1G_o = Signal()
+        self.lu_is_512G_o = Signal()
+        self.lu_hit_o = Signal()
+        # Update TLB
+        self.pte_width = len(self.lu_content_o.flatten())
+        self.update_i = TLBUpdate(asid_width)
+
+    def elaborate(self, platform):
+        """Build the TLB: per-entry tag match, hit selection and PLRU."""
+        m = Module()
+        vpn3 = Signal(9)
+        vpn2 = Signal(9)
+        vpn1 = Signal(9)
+        vpn0 = Signal(9)
+
+        #-------------
+        # Translation
+        #-------------
+
+        # SV48 defines four levels of page tables
+        m.d.comb += [ vpn0.eq(self.lu_vaddr_i[12:21]),
+                      vpn1.eq(self.lu_vaddr_i[21:30]),
+                      vpn2.eq(self.lu_vaddr_i[30:39]),
+                      vpn3.eq(self.lu_vaddr_i[39:48]),
+                    ]
+
+        tc = []
+        for i in range(self.tlb_entries):
+            tlc = TLBContent(self.pte_width, self.asid_width)
+            setattr(m.submodules, "tc%d" % i, tlc)
+            tc.append(tlc)
+            # connect inputs
+            tlc.update_i = self.update_i     # saves a lot of graphviz links
+            m.d.comb += [tlc.vpn0.eq(vpn0),
+                         tlc.vpn1.eq(vpn1),
+                         tlc.vpn2.eq(vpn2),
+                         tlc.vpn3.eq(vpn3),
+                         tlc.flush_i.eq(self.flush_i),
+                         #tlc.update_i.eq(self.update_i),
+                         tlc.lu_asid_i.eq(self.lu_asid_i)]
+        tc = Array(tc)
+
+        #--------------
+        # Select hit
+        #--------------
+
+        # use Encoder to select hit index
+        # XXX TODO: assert that there's only one valid entry (one lu_hit)
+        hitsel = Encoder(self.tlb_entries)
+        m.submodules.hitsel = hitsel
+
+        hits = []
+        for i in range(self.tlb_entries):
+            hits.append(tc[i].lu_hit_o)
+        m.d.comb += hitsel.i.eq(Cat(*hits)) # (goes into plru as well)
+        idx = hitsel.o
+
+        active = Signal(reset_less=True)
+        m.d.comb += active.eq(~hitsel.n)
+        with m.If(active):
+            # active hit, send selected as output
+            m.d.comb += [ self.lu_is_512G_o.eq(tc[idx].lu_is_512G_o),
+                          self.lu_is_1G_o.eq(tc[idx].lu_is_1G_o),
+                          self.lu_is_2M_o.eq(tc[idx].lu_is_2M_o),
+                          self.lu_hit_o.eq(1),
+                          self.lu_content_o.flatten().eq(tc[idx].lu_content_o),
+                        ]
+
+        #--------------
+        # PLRU.
+        #--------------
+
+        p = PLRU(self.tlb_entries)
+        plru_tree = Signal(p.TLBSZ)
+        m.submodules.plru = p
+
+        # connect PLRU inputs/outputs
+        # XXX TODO: assert that there's only one valid entry (one replace_en)
+        en = []
+        for i in range(self.tlb_entries):
+            en.append(tc[i].replace_en_i)
+        m.d.comb += [Cat(*en).eq(p.replace_en_o), # output from PLRU into tags
+                     p.lu_hit.eq(hitsel.i),
+                     p.lu_access_i.eq(self.lu_access_i),
+                     p.plru_tree.eq(plru_tree)]
+        m.d.sync += plru_tree.eq(p.plru_tree_o)
+
+        #--------------
+        # Sanity checks
+        #--------------
+
+        assert (self.tlb_entries % 2 == 0) and (self.tlb_entries > 1), \
+            "TLB size must be a multiple of 2 and greater than 1"
+        assert (self.asid_width >= 1), \
+            "ASID width must be at least 1"
+
+        return m
+
+        """
+        # Just for checking
+        function int countSetBits(logic[self.tlb_entries-1:0] vector);
+          automatic int count = 0;
+          foreach (vector[idx]) begin
+            count += vector[idx];
+          end
+          return count;
+        endfunction
+
+        assert property (@(posedge clk_i)(countSetBits(lu_hit) <= 1))
+          else $error("More then one hit in TLB!"); $stop(); end
+        assert property (@(posedge clk_i)(countSetBits(replace_en) <= 1))
+          else $error("More then one TLB entry selected for next replace!");
+        """
+
+    def ports(self):  # top-level signals, e.g. for rtlil.convert()
+        return [self.flush_i, self.lu_access_i, self.lu_asid_i,
+                self.lu_vaddr_i, self.lu_is_2M_o, self.lu_is_1G_o,
+                self.lu_is_512G_o, self.lu_hit_o
+                ] + self.lu_content_o.ports() + self.update_i.ports()
+
+if __name__ == '__main__':
+    tlb = TLB()
+    vl = rtlil.convert(tlb, ports=tlb.ports())
+    with open("test_tlb.il", "w") as f:
+        f.write(vl)
+
diff --git a/src/soc/TLB/ariane/tlb_content.py b/src/soc/TLB/ariane/tlb_content.py
new file mode 100644
index 00000000..3384c885
--- /dev/null
+++ b/src/soc/TLB/ariane/tlb_content.py
@@ -0,0 +1,145 @@
+from nmigen import Signal, Module, Cat, Const, Elaboratable
+
+from TLB.ariane.ptw import TLBUpdate, PTE
+
+
+class TLBEntry:
+    """Tag fields of one TLB entry; SV48 defines four page-table levels."""
+    def __init__(self, asid_width):
+        self.asid = Signal(asid_width, name="ent_asid")
+        self.vpn0 = Signal(9, name="ent_vpn0")
+        self.vpn1 = Signal(9, name="ent_vpn1")
+        self.vpn2 = Signal(9, name="ent_vpn2")
+        self.vpn3 = Signal(9, name="ent_vpn3")
+        self.is_2M = Signal(name="ent_is_2M")
+        self.is_1G = Signal(name="ent_is_1G")
+        self.is_512G = Signal(name="ent_is_512G")
+        self.valid = Signal(name="ent_valid")
+
+    def flatten(self):  # every field of ports() concatenated into one value
+        return Cat(*self.ports())
+
+    def eq(self, x):  # assign all fields at once from another TLBEntry
+        return self.flatten().eq(x.flatten())
+
+    def ports(self):  # must list every tag field: flatten()/eq() rely on it
+        return [self.asid, self.vpn0, self.vpn1, self.vpn2, self.vpn3,
+                self.is_2M, self.is_1G, self.is_512G, self.valid]
+        
+
+class TLBContent(Elaboratable):
+    def __init__(self, pte_width, asid_width):
+        self.asid_width = asid_width
+        self.pte_width = pte_width
+        self.flush_i = Signal()  # Flush signal
+        # Update TLB
+        self.update_i = TLBUpdate(asid_width)
+        self.vpn3 = Signal(9)
+        self.vpn2 = Signal(9)
+        self.vpn1 = Signal(9)
+        self.vpn0 = Signal(9)
+        self.replace_en_i = Signal() # replace the following entry,
+                                     # set by replacement strategy
+        # Lookup signals
+        self.lu_asid_i = Signal(asid_width)
+        self.lu_content_o = Signal(pte_width)
+        self.lu_is_512G_o = Signal()
+        self.lu_is_2M_o = Signal()
+        self.lu_is_1G_o = Signal()
+        self.lu_hit_o = Signal()
+
+    def elaborate(self, platform):
+        """Build the tag compare (lookup) and the update/flush logic."""
+        m = Module()
+
+        tags = TLBEntry(self.asid_width)
+
+        content = Signal(self.pte_width)
+
+        m.d.comb += [self.lu_hit_o.eq(0),
+                     self.lu_is_512G_o.eq(0),
+                     self.lu_is_2M_o.eq(0),
+                     self.lu_is_1G_o.eq(0)]
+
+        # temporaries for lookup
+        asid_ok = Signal(reset_less=True)
+        # tags_ok = Signal(reset_less=True)
+
+        vpn3_ok = Signal(reset_less=True)
+        vpn2_ok = Signal(reset_less=True)
+        vpn1_ok = Signal(reset_less=True)
+        vpn0_ok = Signal(reset_less=True)
+
+        #tags_2M = Signal(reset_less=True)
+        vpn0_or_2M = Signal(reset_less=True)
+
+        m.d.comb += [
+                     #compare asid and vpn*
+                     asid_ok.eq(tags.asid == self.lu_asid_i),
+                     vpn3_ok.eq(tags.vpn3 == self.vpn3),
+                     vpn2_ok.eq(tags.vpn2 == self.vpn2),
+                     vpn1_ok.eq(tags.vpn1 == self.vpn1),
+                     vpn0_ok.eq(tags.vpn0 == self.vpn0),
+                     vpn0_or_2M.eq(tags.is_2M | vpn0_ok)
+        ]
+
+
+        with m.If(asid_ok & tags.valid):
+            # first level, only vpn3 needs to match
+            with m.If (tags.is_512G & vpn3_ok):
+                m.d.comb += [ self.lu_content_o.eq(content),
+                              self.lu_is_512G_o.eq(1),
+                              self.lu_hit_o.eq(1),
+                            ]
+            # second level , second level vpn2 and vpn3 need to match
+            with m.Elif (tags.is_1G & vpn2_ok & vpn3_ok):
+                m.d.comb += [ self.lu_content_o.eq(content),
+                              self.lu_is_1G_o.eq(1),
+                              self.lu_hit_o.eq(1),
+                            ]
+            # not a giga page hit nor a tera page hit so check further
+            with m.Elif(vpn1_ok):
+                # this could be a 2 mega page hit or a 4 kB hit
+                # NOTE(review): vpn2/vpn3 are not compared on this path
+                with m.If(vpn0_or_2M):
+                    m.d.comb += [ self.lu_content_o.eq(content),
+                                  self.lu_is_2M_o.eq(tags.is_2M),
+                                  self.lu_hit_o.eq(1),
+                                ]
+        # ------------------
+        # Update or Flush
+        # ------------------
+
+        # temporaries
+        replace_valid = Signal(reset_less=True)
+        m.d.comb += replace_valid.eq(self.update_i.valid & self.replace_en_i)
+
+        # flush
+        with m.If (self.flush_i):
+            # invalidate (flush) conditions: all if zero or just this ASID
+            with m.If((self.lu_asid_i == Const(0, self.asid_width)) |
+                      (self.lu_asid_i == tags.asid)):
+                m.d.sync += tags.valid.eq(0)
+
+        # normal replacement
+        with m.Elif(replace_valid):
+            m.d.sync += [ # update tag array
+                          tags.asid.eq(self.update_i.asid),
+                          tags.vpn3.eq(self.update_i.vpn[27:36]),
+                          tags.vpn2.eq(self.update_i.vpn[18:27]),
+                          tags.vpn1.eq(self.update_i.vpn[9:18]),
+                          tags.vpn0.eq(self.update_i.vpn[0:9]),
+                          tags.is_512G.eq(self.update_i.is_512G),
+                          tags.is_1G.eq(self.update_i.is_1G),
+                          tags.is_2M.eq(self.update_i.is_2M),
+                          tags.valid.eq(1),
+                          # and content as well
+                          content.eq(self.update_i.content.flatten())
+                        ]
+        return m
+
+    def ports(self):
+        return [self.flush_i,
+                 self.lu_asid_i,
+                 self.lu_is_2M_o, self.lu_is_1G_o,self.lu_is_512G_o, self.lu_hit_o,
+                ] + self.update_i.content.ports() + self.update_i.ports()
diff --git a/src/soc/TLB/test/__init__.py b/src/soc/TLB/test/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/soc/TLB/test/test_LFSR2.py b/src/soc/TLB/test/test_LFSR2.py
new file mode 100644
index 00000000..c05f55b7
--- /dev/null
+++ b/src/soc/TLB/test/test_LFSR2.py
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# See Notices.txt for copyright information
+from TLB.LFSR import LFSR, LFSRPolynomial, LFSR_POLY_3
+
+from nmigen.back.pysim import Simulator, Delay, Tick
+import unittest
+
+
+class TestLFSR(unittest.TestCase):
+    def test_poly(self):
+        v = LFSRPolynomial()
+        self.assertEqual(repr(v), "LFSRPolynomial([0])")
+        self.assertEqual(str(v), "1")
+        v = LFSRPolynomial([1])
+        self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
+        self.assertEqual(str(v), "x + 1")
+        v = LFSRPolynomial([0, 1])
+        self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
+        self.assertEqual(str(v), "x + 1")
+        v = LFSRPolynomial([1, 2])
+        self.assertEqual(repr(v), "LFSRPolynomial([2, 1, 0])")
+        self.assertEqual(str(v), "x^2 + x + 1")
+        v = LFSRPolynomial([2])
+        self.assertEqual(repr(v), "LFSRPolynomial([2, 0])")
+        self.assertEqual(str(v), "x^2 + 1")
+        self.assertEqual(str(LFSR_POLY_3), "x^3 + x^2 + 1")
+
+    def test_lfsr_3(self):
+        module = LFSR(LFSR_POLY_3)
+        traces = [module.state, module.enable]
+        with Simulator(module,
+                       vcd_file=open("Waveforms/test_LFSR2.vcd", "w"),
+                       gtkw_file=open("Waveforms/test_LFSR2.gtkw", "w"),
+                       traces=traces) as sim:
+            sim.add_clock(1e-6, 0.25e-6)
+            delay = Delay(1e-7)
+
+            def async_process():
+                yield module.enable.eq(0)
+                yield Tick()
+                self.assertEqual((yield module.state), 0x1)
+                yield Tick()
+                self.assertEqual((yield module.state), 0x1)
+                yield module.enable.eq(1)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x2)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x5)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x3)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x7)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x6)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x4)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x1)
+                yield Tick()
+
+            sim.add_process(async_process)
+            sim.run()
+
diff --git a/src/soc/TLB/test/test_address_encoder.py b/src/soc/TLB/test/test_address_encoder.py
new file mode 100644
index 00000000..0aad35b4
--- /dev/null
+++ b/src/soc/TLB/test/test_address_encoder.py
@@ -0,0 +1,105 @@
+from nmigen.compat.sim import run_simulation
+from TLB.AddressEncoder import AddressEncoder
+from TestUtil.test_helper import assert_eq, assert_ne, assert_op
+
+
+# This function allows for the easy setting of values to the AddressEncoder
+# Arguments:
+#   dut: The AddressEncoder being tested
+#   i (Input): The array of single bits to be written
+def set_encoder(dut, i):
+    yield dut.i.eq(i)
+    yield
+
+# Checks the single match of the AddressEncoder
+# Arguments:
+#   dut: The AddressEncoder being tested
+#   sm (Single Match): The expected match result
+#   op (Operation): (0 => ==), (1 => !=)
+def check_single_match(dut, sm, op):
+    out_sm = yield dut.single_match
+    assert_op("Single Match", out_sm, sm, op)
+
+# Checks the multiple match of the AddressEncoder
+# Arguments:
+#   dut: The AddressEncoder being tested
+#   mm (Multiple Match): The expected match result
+#   op (Operation): (0 => ==), (1 => !=)
+def check_multiple_match(dut, mm, op):
+    out_mm = yield dut.multiple_match
+    assert_op("Multiple Match", out_mm, mm, op)
+
+# Checks the output of the AddressEncoder
+# Arguments:
+#   dut: The AddressEncoder being tested
+#   o (Output): The expected output
+#   op (Operation): (0 => ==), (1 => !=)
+def check_output(dut, o, op):
+    out_o = yield dut.o
+    assert_op("Output", out_o, o, op)
+
+# Checks the state of the AddressEncoder
+# Arguments:
+#   dut: The AddressEncoder being tested
+#   sm (Single Match): The expected match result
+#   mm (Multiple Match): The expected match result
+#   o (Output): The expected output
+#   sm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+#   mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+#   o_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+def check_all(dut, sm, mm, o, sm_op, mm_op, o_op):
+    yield from check_single_match(dut, sm, sm_op)
+    yield from check_multiple_match(dut, mm, mm_op)
+    yield from check_output(dut, o, o_op)
+
+def tbench(dut):
+    # Check invalid input
+    in_val = 0b000
+    single_match = 0
+    multiple_match = 0
+    output = 0
+    yield from set_encoder(dut, in_val)
+    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+    # Check single bit
+    in_val = 0b001
+    single_match = 1
+    multiple_match = 0
+    output = 0
+    yield from set_encoder(dut, in_val)
+    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+    # Check another single bit
+    in_val = 0b100
+    single_match = 1
+    multiple_match = 0
+    output = 2
+    yield from set_encoder(dut, in_val)
+    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+    # Check multiple match
+    # We expect the lowest bit to be returned which is address 0
+    in_val = 0b101
+    single_match = 0
+    multiple_match = 1
+    output = 0
+    yield from set_encoder(dut, in_val)
+    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+    # Check another multiple match
+    # We expect the lowest bit to be returned which is address 1
+    in_val = 0b110
+    single_match = 0
+    multiple_match = 1
+    output = 1
+    yield from set_encoder(dut, in_val)
+    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+def test_addr():
+    dut = AddressEncoder(4)
+    run_simulation(dut, tbench(dut), 
+                   vcd_name="Waveforms/test_address_encoder.vcd")
+    print("AddressEncoder Unit Test Success")
+
+if __name__ == "__main__":
+    test_addr()
diff --git a/src/soc/TLB/test/test_cam.py b/src/soc/TLB/test/test_cam.py
new file mode 100644
index 00000000..f11c48ad
--- /dev/null
+++ b/src/soc/TLB/test/test_cam.py
@@ -0,0 +1,206 @@
+from nmigen.compat.sim import run_simulation
+
+from TLB.Cam import Cam
+
+from TestUtil.test_helper import assert_eq, assert_ne, assert_op
+
+# This function allows for the easy setting of values to the Cam
+# Arguments:
+#   dut: The Cam being tested
+#   e (Enable): Whether the block is going to be enabled
+#   we (Write Enable): Whether the Cam will write on the next cycle
+#   a (Address): Where the data will be written if write enable is high
+#   d (Data): Either what we are looking for or will write to the address
+def set_cam(dut, e, we, a, d):
+    yield dut.enable.eq(e)
+    yield dut.write_enable.eq(we)
+    yield dut.address_in.eq(a)
+    yield dut.data_in.eq(d)
+    yield
+
+# Checks the multiple match of the Cam
+# Arguments:
+#   dut: The Cam being tested
+#   mm (Multiple Match): The expected match result
+#   op (Operation): (0 => ==), (1 => !=)
+def check_multiple_match(dut, mm, op):
+    out_mm = yield dut.multiple_match
+    assert_op("Multiple Match", out_mm, mm, op)
+
+# Checks the single match of the Cam
+# Arguments:
+#   dut: The Cam being tested
+#   sm (Single Match): The expected match result
+#   op (Operation): (0 => ==), (1 => !=)
+def check_single_match(dut, sm, op):
+    out_sm = yield dut.single_match
+    assert_op("Single Match", out_sm, sm, op)
+
+# Checks the address output of the Cam
+# Arguments:
+#   dut: The Cam being tested
+#   ma (Match Address): The expected match result
+#   op (Operation): (0 => ==), (1 => !=)
+def check_match_address(dut, ma, op):
+    out_ma = yield dut.match_address
+    assert_op("Match Address", out_ma, ma, op)
+
+# Checks the state of the Cam
+# Arguments:
+#   dut: The Cam being tested
+#   sm (Single Match): The expected match result
+#   mm (Multiple Match): The expected match result
+#   ma: (Match Address): The expected address output
+#   sm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+#   mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+#   ma_op (Operation): Operation for the address assertion (0 => ==), (1 => !=)
+def check_all(dut, mm, sm, ma, mm_op, sm_op, ma_op):
+    yield from check_multiple_match(dut, mm, mm_op)
+    yield from check_single_match(dut, sm, sm_op)
+    yield from check_match_address(dut, ma, ma_op)
+
+def tbench(dut):
+    # NA
+    enable = 0
+    write_enable = 0
+    address = 0
+    data = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Read Miss Multiple
+    # Note that the default starting entry data bits are all 0
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 0
+    multiple_match = 1
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_multiple_match(dut, multiple_match, 0)
+
+    # Read Miss
+    # Note that the default starting entry data bits are all 0
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 1
+    multiple_match = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Write Entry 0
+    enable = 1
+    write_enable = 1
+    address = 0
+    data = 4
+    multiple_match = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Read Hit Entry 0
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 4
+    multiple_match = 0
+    single_match = 1
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
+
+    # Search Hit
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 4
+    multiple_match = 0
+    single_match = 1
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
+
+    # Search Miss
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 5
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Multiple Match test
+    # Write Entry 1
+    enable = 1
+    write_enable = 1
+    address = 1
+    data = 5
+    multiple_match = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Write Entry 2
+    # Same data as Entry 1
+    enable = 1
+    write_enable = 1
+    address = 2
+    data = 5
+    multiple_match = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Read Hit Data 5
+    enable = 1
+    write_enable = 0
+    address = 1
+    data = 5
+    multiple_match = 1
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_all(dut, multiple_match, single_match, address,0,0,0)
+
+    # Verify read_warning is not caused
+    # Write Entry 0
+    enable = 1
+    write_enable = 1
+    address = 0
+    data = 7
+    multiple_match = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    # Note there is no yield we immediately attempt to read in the next cycle
+
+    # Read Hit Data 7
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 7
+    multiple_match = 0
+    single_match = 1
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    yield
+
+
+def test_cam():
+    dut = Cam(4, 4)
+    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam.vcd")
+    print("Cam Unit Test Success")
+
+if __name__ == "__main__":
+    test_cam()
diff --git a/src/soc/TLB/test/test_cam_entry.py b/src/soc/TLB/test/test_cam_entry.py
new file mode 100644
index 00000000..43b699d2
--- /dev/null
+++ b/src/soc/TLB/test/test_cam_entry.py
@@ -0,0 +1,110 @@
+from nmigen.compat.sim import run_simulation
+
+from TestUtil.test_helper import assert_eq, assert_ne, assert_op
+from TLB.CamEntry import CamEntry
+
+# This function allows for the easy setting of values to the Cam Entry
+# Arguments:
+#   dut: The CamEntry being tested
+#   c (command): NA (0), Read (1), Write (2), Reserve (3)
+#   d (data): The data to be set
+def set_cam_entry(dut, c, d):
+    # Write desired values
+    yield dut.command.eq(c)
+    yield dut.data_in.eq(d)
+    yield
+    # Reset all lines
+    yield dut.command.eq(0)
+    yield dut.data_in.eq(0)
+    yield
+
+# Checks the data state of the CAM entry
+# Arguments:
+#   dut: The CamEntry being tested
+#   d (Data): The expected data
+#   op (Operation): (0 => ==), (1 => !=)
+def check_data(dut, d, op):
+    out_d = yield dut.data
+    assert_op("Data", out_d, d, op)
+
+# Checks the match state of the CAM entry
+# Arguments:
+#   dut: The CamEntry being tested
+#   m (Match): The expected match
+#   op (Operation): (0 => ==), (1 => !=)
+def check_match(dut, m, op):
+    out_m = yield dut.match
+    assert_op("Match", out_m, m, op)
+
+# Checks the state of the CAM entry
+# Arguments:
+#   dut: The CamEntry being tested
+#   d (data): The expected data
+#   m (match): The expected match
+#   d_op (Operation): Operation for the data assertion (0 => ==), (1 => !=)
+#   m_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+def check_all(dut, d, m, d_op, m_op):
+    yield from check_data(dut, d, d_op)
+    yield from check_match(dut, m, m_op)
+
+# This tbench goes through the paces of testing the CamEntry module
+# It is done by writing and then reading various combinations of key/data pairs
+# and reading the results with varying keys to verify the resulting stored
+# data is correct.
+def tbench(dut):
+    # Check write
+    command = 2
+    data = 1
+    match = 0
+    yield from set_cam_entry(dut, command, data)
+    yield from check_all(dut, data, match, 0, 0)
+
+    # Check read miss
+    command = 1
+    data = 2
+    match = 0
+    yield from set_cam_entry(dut, command, data)
+    yield from check_all(dut, data, match, 1, 0)
+
+    # Check read hit
+    command = 1
+    data = 1
+    match = 1
+    yield from set_cam_entry(dut, command, data)
+    yield from check_all(dut, data, match, 0, 0)
+
+    # Check overwrite
+    command = 2
+    data = 5
+    match = 0
+    yield from set_cam_entry(dut, command, data)
+    yield
+    yield from check_all(dut, data, match, 0, 0)
+
+    # Check read hit
+    command = 1
+    data = 5
+    match = 1
+    yield from set_cam_entry(dut, command, data)
+    yield from check_all(dut, data, match, 0, 0)
+
+    # Check reset
+    command = 3
+    data = 0
+    match = 0
+    yield from set_cam_entry(dut, command, data)
+    yield from check_all(dut, data, match, 0, 0)
+
+    # Extra clock cycle for waveform
+    yield
+
+
+def test_camentry():
+    dut = CamEntry(4)
+    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam_entry.vcd")
+    print("CamEntry Unit Test Success")
+
+
+if __name__ == "__main__":
+    test_camentry()
+
diff --git a/src/soc/TLB/test/test_permission_validator.py b/src/soc/TLB/test/test_permission_validator.py
new file mode 100644
index 00000000..81873d79
--- /dev/null
+++ b/src/soc/TLB/test/test_permission_validator.py
@@ -0,0 +1,146 @@
+from nmigen.compat.sim import run_simulation
+
+from TLB.PermissionValidator import PermissionValidator
+
+from TestUtil.test_helper import assert_op
+
+
+def set_validator(dut, d, xwr, sm, sa, asid):
+    yield dut.data.eq(d)
+    yield dut.xwr.eq(xwr)
+    yield dut.super_mode.eq(sm)
+    yield dut.super_access.eq(sa)
+    yield dut.asid.eq(asid)
+    yield
+
+def check_valid(dut, v, op):
+    out_v = yield dut.valid
+    assert_op("Valid", out_v, v, op)
+
+def tbench(dut):
+    # 80 bits represented. Ignore the MSB as it will be truncated
+    # ASID is the first 4 hex digits (bits 64 - 78)
+
+    # Test user mode entry valid
+    # Global Bit matching ASID
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000031
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 0
+    super_access = 0
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test user mode entry valid
+    # Global Bit nonmatching ASID
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000031
+    # Ignore MSB it will be truncated
+    asid = 0x7FF6
+    super_mode = 0
+    super_access = 0
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test user mode entry invalid
+    # Global Bit nonmatching ASID
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000021
+    # Ignore MSB it will be truncated
+    asid = 0x7FF6
+    super_mode = 0
+    super_access = 0
+    xwr = 0
+    valid = 0
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test user mode entry valid
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000011
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 0
+    super_access = 0
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test user mode entry invalid
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000011
+    # Ignore MSB it will be truncated
+    asid = 0x7FF6
+    super_mode = 0
+    super_access = 0
+    xwr = 0
+    valid = 0
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test supervisor mode entry valid
+    # The entry is NOT in user mode
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000001
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 1
+    super_access = 0
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test supervisor mode entry invalid
+    # The entry is in user mode
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000011
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 1
+    super_access = 0
+    xwr = 0
+    valid = 0
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test supervisor mode entry valid
+    # The entry is NOT in user mode with access
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000001
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 1
+    super_access = 1
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test supervisor mode entry valid
+    # The entry is in user mode with access
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000011
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 1
+    super_access = 1
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+
+def test_permv():
+    dut = PermissionValidator(15, 64)
+    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_permission_validator.vcd")
+    print("PermissionValidator Unit Test Success")
+
+if __name__ == "__main__":
+    test_permv()
diff --git a/src/soc/TLB/test/test_pte_entry.py b/src/soc/TLB/test/test_pte_entry.py
new file mode 100644
index 00000000..5c0c34dc
--- /dev/null
+++ b/src/soc/TLB/test/test_pte_entry.py
@@ -0,0 +1,102 @@
+from nmigen.compat.sim import run_simulation
+
+from TLB.PteEntry import PteEntry
+
+from TestUtil.test_helper import assert_op
+
+def set_entry(dut, i):
+    yield dut.i.eq(i)
+    yield
+
+def check_dirty(dut, d, op):
+    out_d = yield dut.d
+    assert_op("Dirty", out_d, d, op)
+
+def check_accessed(dut, a, op):
+    out_a = yield dut.a
+    assert_op("Accessed", out_a, a, op)
+
+def check_global(dut, o, op):
+    out = yield dut.g
+    assert_op("Global", out, o, op)
+
+def check_user(dut, o, op):
+    out = yield dut.u
+    assert_op("User Mode", out, o, op)
+
+def check_xwr(dut, o, op):
+    out = yield dut.xwr
+    assert_op("XWR", out, o, op)
+
+def check_asid(dut, o, op):
+    out = yield dut.asid
+    assert_op("ASID", out, o, op)
+
+def check_pte(dut, o, op):
+    out = yield dut.pte  # sample the PTE output of the device under test
+    assert_op("PTE", out, o, op)  # label failures as PTE (was "ASID")
+
+def check_valid(dut, v, op):
+    out_v = yield dut.v
+    assert_op("Valid", out_v, v, op)
+
+def check_all(dut, d, a, g, u, xwr, v, asid, pte):
+    yield from check_dirty(dut, d, 0)
+    yield from check_accessed(dut, a, 0)
+    yield from check_global(dut, g, 0)
+    yield from check_user(dut, u, 0)
+    yield from check_xwr(dut, xwr, 0)
+    yield from check_asid(dut, asid, 0)
+    yield from check_pte(dut, pte, 0)
+    yield from check_valid(dut, v, 0)
+
+def tbench(dut):
+    # 80 bits are written out below; the MSB is ignored as it will be truncated
+    # The ASID is the top 4 hex digits (bits 64 - 78); the PTE is the low 64 bits
+
+    i = 0x7FFF0000000000000031
+    dirty = 0
+    access = 0
+    glob = 1
+    user = 1
+    xwr = 0
+    valid = 1
+    asid = 0x7FFF
+    pte = 0x0000000000000031
+    yield from set_entry(dut, i)
+    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
+
+    i = 0x0FFF00000000000000FF
+    dirty = 1
+    access = 1
+    glob = 1
+    user = 1
+    xwr = 7
+    valid = 1
+    asid = 0x0FFF
+    pte = 0x00000000000000FF
+    yield from set_entry(dut, i)
+    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
+
+    i = 0x0721000000001100001F
+    dirty = 0
+    access = 0
+    glob = 0
+    user = 1
+    xwr = 7
+    valid = 1
+    asid = 0x0721
+    pte = 0x000000001100001F
+    yield from set_entry(dut, i)
+    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
+
+    yield
+
+
+def test_pteentry():
+    dut = PteEntry(15, 64);
+    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_pte_entry.vcd")
+    print("PteEntry Unit Test Success")
+
+if __name__ == "__main__":
+    test_pteentry()
diff --git a/src/soc/TLB/test/test_set_associative_cache.py b/src/soc/TLB/test/test_set_associative_cache.py
new file mode 100644
index 00000000..0641b556
--- /dev/null
+++ b/src/soc/TLB/test/test_set_associative_cache.py
@@ -0,0 +1,38 @@
+from nmigen.compat.sim import run_simulation
+
+from TLB.SetAssociativeCache import SetAssociativeCache
+
+from TestUtil.test_helper import assert_eq, assert_ne, assert_op
+
+def set_sac(dut, e, c, s, t, d):
+    yield dut.enable.eq(e)
+    yield dut.command.eq(c)
+    yield dut.cset.eq(s)
+    yield dut.tag.eq(t)
+    yield dut.data_i.eq(d)
+    yield
+
+def tbench(dut):
+    enable = 1
+    command = 2
+    cset = 1
+    tag = 2
+    data = 3
+    yield from set_sac(dut, enable, command, cset, tag, data)
+    yield
+
+    enable = 1
+    command = 2
+    cset = 1
+    tag = 5
+    data = 8
+    yield from set_sac(dut, enable, command, cset, tag, data)
+    yield
+
+def test_assoc_cache():
+    dut = SetAssociativeCache(4, 4, 4, 4)
+    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_set_associative_cache.vcd")
+    print("Set Associative Cache Unit Test Success")
+
+if __name__ == "__main__":
+    test_assoc_cache()
diff --git a/src/soc/TLB/test/test_tlb.py b/src/soc/TLB/test/test_tlb.py
new file mode 100644
index 00000000..e9cc9d69
--- /dev/null
+++ b/src/soc/TLB/test/test_tlb.py
@@ -0,0 +1,80 @@
+#import tracemalloc
+#tracemalloc.start()
+
+from nmigen.compat.sim import run_simulation
+
+from TLB.TLB import TLB
+
+from TestUtil.test_helper import assert_op, assert_eq
+
+#self.supermode = Signal(1) # Supervisor Mode
+#self.super_access = Signal(1) # Supervisor Access
+#self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2
+#self.xwr = Signal(3) # Execute, Write, Read
+#self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64
+#self.address_L1 = Signal(max=L1_size)
+#self.asid = Signal(asid_size) # Address Space IDentifier (ASID)
+#self.vma = Signal(vma_size) # Virtual Memory Address (VMA)
+#self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE)
+#
+#self.hit = Signal(1) # Denotes if the VMA had a mapped PTE
+#self.perm_valid = Signal(1) # Denotes if the permissions are correct
+#self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA
+
+COMMAND_READ=1
+COMMAND_WRITE_L1=2
+
+# Samples the hit output of the TLB under test
+# Arguments:
+#   dut: The TLB being tested
+#   d (Data): The expected hit value
+#   NOTE: d is currently unused; the comparison below is commented out
+def check_hit(dut, d):
+    hit_d = yield dut.hit
+    #assert_eq("hit", hit_d, d)
+
+def test_command(dut,cmd,xwr,cycles):
+    yield dut.command.eq(cmd)
+    yield dut.xwr.eq(xwr)
+    for i in range(0,cycles):
+        yield
+
+def test_write_L1(dut,vma,address_L1,asid,pte_in):
+    yield dut.address_L1.eq(address_L1)
+    yield dut.asid.eq(asid)
+    yield dut.vma.eq(vma)
+    yield dut.pte_in.eq(pte_in)
+    yield from test_command(dut,COMMAND_WRITE_L1,7,2)
+
+def test_search(dut,vma,found):
+    yield dut.vma.eq(vma)
+    yield from test_command(dut,COMMAND_READ,7,1)
+    yield from check_hit(dut,found)
+
+def zero(dut):
+    yield dut.supermode.eq(0)
+    yield dut.super_access.eq(0)
+    yield dut.mode.eq(0)
+    yield dut.address_L1.eq(0)
+    yield dut.asid.eq(0)
+    yield dut.vma.eq(0)
+    yield dut.pte_in.eq(0)
+
+def tbench(dut):
+    yield from zero(dut)
+    yield dut.mode.eq(0xF) # enable TLB
+    #test hit
+    yield from test_write_L1(dut,0xFEEDFACE,0,0xFFFF,0xF0F0)
+    yield from test_search(dut,0xFEEDFACE,1)
+    yield from test_search(dut,0xFACEFEED,0)
+    
+
+    
+
+def test_tlb():
+    dut = TLB(15,36,64,8)
+    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_tlb.vcd")
+    print("TLB Unit Test Success")
+
+if __name__ == "__main__":
+    test_tlb()
diff --git a/src/soc/TestUtil/test_helper.py b/src/soc/TestUtil/test_helper.py
new file mode 100644
index 00000000..c42990d6
--- /dev/null
+++ b/src/soc/TestUtil/test_helper.py
@@ -0,0 +1,30 @@
+def assert_op(pre, o, e, op):
+    """ Verifies the given values using the selected comparison operation
+        Arguments:
+            pre (Prefix): Prepended to the assertion failure message
+            o (Output): The output result
+            e (Expected): The expected value
+            op (Operation): (0 => ==), (any other value => !=)
+    """
+    if op == 0:
+        assert_eq(pre, o, e)
+    else:
+        assert_ne(pre, o, e)    
+
+def assert_eq(p, o, e):
+    """ Asserts that the output equals the expected value
+        Arguments:
+           p (Prefix): Prepended to the assertion failure message
+           o (Output): The output result
+           e (Expected): The expected value
+    """
+    assert o == e, p + " Output " + str(o) + " Expected " + str(e)
+    
+def assert_ne(p, o, e):
+    """ Asserts that the output differs from the given value
+        Arguments:
+           p (Prefix): Prepended to the assertion failure message
+           o (Output): The output result
+           e (Expected): The value the output must NOT be equal to
+    """
+    assert o != e, p + " Output " + str(o) + " Not Expecting " + str(e) 
diff --git a/src/soc/decoder/.gitignore b/src/soc/decoder/.gitignore
new file mode 100644
index 00000000..afed0735
--- /dev/null
+++ b/src/soc/decoder/.gitignore
@@ -0,0 +1 @@
+*.csv
diff --git a/src/soc/decoder/power_decoder.py b/src/soc/decoder/power_decoder.py
new file mode 100644
index 00000000..5b5e7103
--- /dev/null
+++ b/src/soc/decoder/power_decoder.py
@@ -0,0 +1,275 @@
+"""Cascading Power ISA Decoder
+
+This module uses CSV tables in a hierarchical/peer cascading fashion,
+to create a multi-level instruction decoder by recognising appropriate
+patterns.  The output is a flattened (1-level) series of fields suitable
+for a simple RISC engine.
+
+This is based on Anton Blanchard's excellent microwatt work:
+https://github.com/antonblanchard/microwatt/blob/master/decode1.vhdl
+
+The basic principle is that the python code does the heavy lifting
+(reading the CSV files, constructing the hierarchy), creating the HDL
+AST with for-loops generating switch-case statements.
+
+PowerDecoder takes a *list* of CSV files with an associated bit-range
+that it is requested to match against the "opcode" row of the CSV file.
+This pattern can be either an integer, a binary number, *or* a wildcard
+nmigen Case pattern of the form "001--1-100".
+
+Subdecoders are *additional* cases with further decoding.  The "pattern"
+argument is specified as one of the Case statements (a peer of the opcode
+row in the CSV file), and thus further fields of the opcode may be decoded
+giving increasing levels of detail.
+
+Top Level:
+
+    [ (extra.csv: bit-fields entire 32-bit range
+        opcode                           -> matches
+        000000---------------01000000000 -> ILLEGAL instruction
+        01100000000000000000000000000000 -> SIM_CONFIG instruction
+        ................................ ->
+      ),
+      (major.csv: first 6 bits ONLY
+        opcode                           -> matches
+        001100                           -> ALU,OP_ADD (add)
+        001101                           -> ALU,OP_ADD (another type of add)
+        ......                           -> ...
+        ......                           -> ...
+        subdecoders:
+        010011 this must match *MAJOR*.CSV
+            [ (minor_19.csv: bits 21 through 30 inclusive:
+                opcode                  -> matches
+                0b0000000000            -> ALU,OP_MCRF
+                ............            -> ....
+              ),
+              (minor_19_00000.csv: bits 26 through 30 inclusive:
+                opcode                  -> matches
+                0b00010                 -> ALU,add_pcis
+              )
+            ]
+      ),
+    ]
+
+"""
+
+from nmigen import Module, Elaboratable, Signal
+from nmigen.cli import rtlil
+from power_enums import (Function, Form, InternalOp, In1Sel, In2Sel, In3Sel,
+                         OutSel, RC, LdstLen, CryIn, get_csv, single_bit_flags,
+                         get_signal_name, default_values)
+from collections import namedtuple
+from power_fields import DecodeFields
+from power_fieldsn import SigDecode, SignalBitRange
+
+Subdecoder = namedtuple("Subdecoder", ["pattern", "opcodes", "opint",
+                                       "bitsel", "suffix", "subdecoders"])
+
+
+class PowerOp:
+    """PowerOp: spec for execution.  op type (ADD etc.) reg specs etc.
+    """
+
+    def __init__(self):
+        self.function_unit = Signal(Function, reset_less=True)
+        self.internal_op = Signal(InternalOp, reset_less=True)
+        self.form = Signal(Form, reset_less=True)
+        self.in1_sel = Signal(In1Sel, reset_less=True)
+        self.in2_sel = Signal(In2Sel, reset_less=True)
+        self.in3_sel = Signal(In3Sel, reset_less=True)
+        self.out_sel = Signal(OutSel, reset_less=True)
+        self.ldst_len = Signal(LdstLen, reset_less=True)
+        self.rc_sel = Signal(RC, reset_less=True)
+        self.cry_in = Signal(CryIn, reset_less=True)
+        for bit in single_bit_flags:
+            name = get_signal_name(bit)
+            setattr(self, name, Signal(reset_less=True, name=name))
+
+    def _eq(self, row=None):
+        if row is None:
+            row = default_values
+        res = [self.function_unit.eq(Function[row['unit']]),
+               self.form.eq(Form[row['form']]),
+               self.internal_op.eq(InternalOp[row['internal op']]),
+               self.in1_sel.eq(In1Sel[row['in1']]),
+               self.in2_sel.eq(In2Sel[row['in2']]),
+               self.in3_sel.eq(In3Sel[row['in3']]),
+               self.out_sel.eq(OutSel[row['out']]),
+               self.ldst_len.eq(LdstLen[row['ldst len']]),
+               self.rc_sel.eq(RC[row['rc']]),
+               self.cry_in.eq(CryIn[row['cry in']]),
+               ]
+        for bit in single_bit_flags:
+            sig = getattr(self, get_signal_name(bit))
+            res.append(sig.eq(int(row.get(bit, 0))))
+        return res
+
+    def eq(self, otherop):
+        res = [self.function_unit.eq(otherop.function_unit),
+               self.form.eq(otherop.form),
+               self.internal_op.eq(otherop.internal_op),
+               self.in1_sel.eq(otherop.in1_sel),
+               self.in2_sel.eq(otherop.in2_sel),
+               self.in3_sel.eq(otherop.in3_sel),
+               self.out_sel.eq(otherop.out_sel),
+               self.rc_sel.eq(otherop.rc_sel),
+               self.ldst_len.eq(otherop.ldst_len),
+               self.cry_in.eq(otherop.cry_in)]
+        for bit in single_bit_flags:
+            sig = getattr(self, get_signal_name(bit))
+            res.append(sig.eq(getattr(otherop, get_signal_name(bit))))
+        return res
+
+    def ports(self):
+        regular = [self.function_unit,
+                   self.in1_sel,
+                   self.in2_sel,
+                   self.in3_sel,
+                   self.out_sel,
+                   self.ldst_len,
+                   self.rc_sel,
+                   self.internal_op,
+                   self.form]
+        single_bit_ports = [getattr(self, get_signal_name(x))
+                            for x in single_bit_flags]
+        return regular + single_bit_ports
+
+
+class PowerDecoder(Elaboratable):
+    """PowerDecoder - decodes an incoming opcode into the type of operation
+    """
+
+    def __init__(self, width, dec):
+        if not isinstance(dec, list):
+            dec = [dec]
+        # Subdecoder is an immutable namedtuple: build replacements rather
+        # than assigning to d.suffix (which raises AttributeError)
+        self.dec = [d._replace(suffix=None)
+                    if d.suffix is not None and d.suffix >= width else d
+                    for d in dec]
+        self.opcode_in = Signal(width, reset_less=True)
+        self.op = PowerOp()
+        self.width = width
+
+    def suffix_mask(self, d):
+        return ((1 << d.suffix) - 1)
+
+    def divide_opcodes(self, d):
+        divided = {}
+        mask = self.suffix_mask(d)
+        print("mask", hex(mask))
+        for row in d.opcodes:
+            opcode = row['opcode']
+            if d.opint and '-' not in opcode:
+                opcode = int(opcode, 0)
+            key = opcode & mask
+            opcode = opcode >> d.suffix
+            if key not in divided:
+                divided[key] = []
+            r = row.copy()
+            r['opcode'] = opcode
+            divided[key].append(r)
+        return divided
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # note: default opcode is "illegal" as this is a combinatorial block
+
+        # go through the list of CSV decoders first
+        for d in self.dec:
+            opcode_switch = Signal(d.bitsel[1] - d.bitsel[0],
+                                   reset_less=True)
+            comb += opcode_switch.eq(self.opcode_in[d.bitsel[0]:d.bitsel[1]])
+            if d.suffix:
+                opcodes = self.divide_opcodes(d)
+                opc_in = Signal(d.suffix, reset_less=True)
+                comb += opc_in.eq(opcode_switch[:d.suffix])
+                with m.Switch(opc_in):
+                    for key, row in opcodes.items():
+                        bitsel = (d.suffix+d.bitsel[0], d.bitsel[1])
+                        sd = Subdecoder(pattern=None, opcodes=row,
+                                        bitsel=bitsel, suffix=None,
+                                        opint=False, subdecoders=[])
+                        subdecoder = PowerDecoder(width=self.width, dec=sd)
+                        setattr(m.submodules, "dec_sub%d" % key, subdecoder)
+                        comb += subdecoder.opcode_in.eq(self.opcode_in)
+                        with m.Case(key):
+                            comb += self.op.eq(subdecoder.op)
+            else:
+                # TODO: arguments, here (all of them) need to be a list.
+                # a for-loop around the *list* of decoder args.
+                with m.Switch(opcode_switch):
+                    self.handle_subdecoders(m, d)
+                    for row in d.opcodes:
+                        opcode = row['opcode']
+                        if d.opint and '-' not in opcode:
+                            opcode = int(opcode, 0)
+                        if not row['unit']:
+                            continue
+                        with m.Case(opcode):
+                            comb += self.op._eq(row)
+        return m
+
+    def handle_subdecoders(self, m, d):
+        for dec in d.subdecoders:
+            subdecoder = PowerDecoder(self.width, dec)
+            if isinstance(dec, list): # XXX HACK: take first pattern
+                dec = dec[0]
+            setattr(m.submodules, "dec%d" % dec.pattern, subdecoder)
+            m.d.comb += subdecoder.opcode_in.eq(self.opcode_in)
+            with m.Case(dec.pattern):
+                m.d.comb += self.op.eq(subdecoder.op)
+
+    def ports(self):
+        return [self.opcode_in] + self.op.ports()
+
+
+class TopPowerDecoder(PowerDecoder, DecodeFields):
+
+    def __init__(self, width, dec):
+        PowerDecoder.__init__(self, width, dec)
+        DecodeFields.__init__(self, SignalBitRange, [self.opcode_in])
+        self.create_specs()
+
+
+def create_pdecode():
+
+    # minor 19 has extra patterns
+    m19 = []
+    m19.append(Subdecoder(pattern=19, opcodes=get_csv("minor_19.csv"),
+                   opint=True, bitsel=(1, 11), suffix=None, subdecoders=[]))
+    m19.append(Subdecoder(pattern=19, opcodes=get_csv("minor_19_00000.csv"),
+                   opint=True, bitsel=(1, 6), suffix=None, subdecoders=[]))
+
+    # minor opcodes.
+    pminor = [
+        m19,
+        Subdecoder(pattern=30, opcodes=get_csv("minor_30.csv"),
+                   opint=True, bitsel=(1, 6), suffix=None, subdecoders=[]),
+        Subdecoder(pattern=31, opcodes=get_csv("minor_31.csv"),
+                   opint=True, bitsel=(1, 11), suffix=0b00101, subdecoders=[]),
+        Subdecoder(pattern=58, opcodes=get_csv("minor_58.csv"),
+                   opint=True, bitsel=(0, 2), suffix=None, subdecoders=[]),
+        Subdecoder(pattern=62, opcodes=get_csv("minor_62.csv"),
+                   opint=True, bitsel=(0, 2), suffix=None, subdecoders=[]),
+    ]
+
+    # top level: extra merged with major
+    dec = []
+    opcodes = get_csv("major.csv")
+    dec.append(Subdecoder(pattern=None, opint=True, opcodes=opcodes,
+                     bitsel=(26, 32), suffix=None, subdecoders=pminor))
+    opcodes = get_csv("extra.csv")
+    dec.append(Subdecoder(pattern=None, opint=False, opcodes=opcodes,
+                     bitsel=(0, 32), suffix=None, subdecoders=[]))
+
+    return TopPowerDecoder(32, dec)
+
+
+if __name__ == '__main__':
+    pdecode = create_pdecode()
+    vl = rtlil.convert(pdecode, ports=pdecode.ports())
+    with open("decoder.il", "w") as f:
+        f.write(vl)
diff --git a/src/soc/decoder/power_decoder2.py b/src/soc/decoder/power_decoder2.py
new file mode 100644
index 00000000..1b7435a0
--- /dev/null
+++ b/src/soc/decoder/power_decoder2.py
@@ -0,0 +1,429 @@
+"""Power ISA Decoder second stage
+
+based on Anton Blanchard microwatt decode2.vhdl
+
+"""
+from nmigen import Module, Elaboratable, Signal, Mux, Const
+from nmigen.cli import rtlil
+
+from power_decoder import create_pdecode
+from power_enums import (InternalOp, CryIn, Function, LdstLen,
+                         In1Sel, In2Sel, In3Sel, OutSel, SPR, RC)
+
+
+class DecodeA(Elaboratable):
+    """DecodeA from instruction
+
+    decodes register RA, whether immediate-zero, implicit and
+    explicit SPRs
+    """
+
+    def __init__(self, dec):
+        self.dec = dec
+        self.sel_in = Signal(In1Sel, reset_less=True)
+        self.insn_in = Signal(32, reset_less=True)
+        self.reg_out = Data(5, name="reg_a")
+        self.immz_out = Signal(reset_less=True)
+        self.spr_out = Data(10, "spr_a")
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # select Register A field (test the RA field itself: no comb loop)
+        with m.If((self.sel_in == In1Sel.RA) |
+                  ((self.sel_in == In1Sel.RA_OR_ZERO) &
+                   (self.dec.RA[0:-1] != Const(0, 5)))):
+            comb += self.reg_out.data.eq(self.dec.RA[0:-1])
+            comb += self.reg_out.ok.eq(1)
+
+        # zero immediate requested: RA_OR_ZERO selected and RA field is zero
+        with m.If((self.sel_in == In1Sel.RA_OR_ZERO) &
+                   (self.dec.RA[0:-1] == Const(0, 5))):
+            comb += self.immz_out.eq(1)
+
+        # decode SPR1 based on instruction type
+        op = self.dec.op
+        # BC or BCREG: potential implicit register (CTR)
+        with m.If((op.internal_op == InternalOp.OP_BC) |
+                  (op.internal_op == InternalOp.OP_BCREG)):
+            with m.If(~self.dec.BO[2]): # 3.0B p38 BO2=0, use CTR reg
+                comb += self.spr_out.data.eq(SPR.CTR) # constant: CTR
+                comb += self.spr_out.ok.eq(1)
+        # MFSPR or MTSPR: move-from / move-to SPRs
+        with m.If((op.internal_op == InternalOp.OP_MFSPR) |
+                  (op.internal_op == InternalOp.OP_MTSPR)):
+            comb += self.spr_out.data.eq(self.dec.SPR[0:-1]) # SPR field, XFX
+            comb += self.spr_out.ok.eq(1)
+
+        return m
+
+class Data:
+
+    def __init__(self, width, name):
+
+        self.data = Signal(width, name=name, reset_less=True)
+        self.ok = Signal(name="%s_ok" % name, reset_less=True)
+
+    def eq(self, rhs):
+        return [self.data.eq(rhs.data),
+                self.ok.eq(rhs.ok)]
+
+    def ports(self):
+        return [self.data, self.ok]
+
+
+class DecodeB(Elaboratable):
+    """DecodeB from instruction
+
+    decodes register RB, different forms of immediate (signed, unsigned),
+    and implicit SPRs
+    """
+
+    def __init__(self, dec):
+        self.dec = dec
+        self.sel_in = Signal(In2Sel, reset_less=True)
+        self.insn_in = Signal(32, reset_less=True)
+        self.reg_out = Data(5, "reg_b")
+        self.imm_out = Data(64, "imm_b")
+        self.spr_out = Data(10, "spr_b")
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # select Register B field
+        with m.Switch(self.sel_in):
+            with m.Case(In2Sel.RB):
+                comb += self.reg_out.data.eq(self.dec.RB[0:-1])
+                comb += self.reg_out.ok.eq(1)
+            with m.Case(In2Sel.CONST_UI):
+                comb += self.imm_out.data.eq(self.dec.UI[0:-1])
+                comb += self.imm_out.ok.eq(1)
+            with m.Case(In2Sel.CONST_SI): # TODO: sign-extend here?
+                comb += self.imm_out.data.eq(self.dec.SI[0:-1])
+                comb += self.imm_out.ok.eq(1)
+            with m.Case(In2Sel.CONST_UI_HI): # UI || 0x0000: shift by 16 bits
+                comb += self.imm_out.data.eq(self.dec.UI[0:-1]<<16)
+                comb += self.imm_out.ok.eq(1)
+            with m.Case(In2Sel.CONST_SI_HI): # TODO: sign-extend here?
+                comb += self.imm_out.data.eq(self.dec.SI[0:-1]<<16)
+                comb += self.imm_out.ok.eq(1)
+            with m.Case(In2Sel.CONST_LI):
+                comb += self.imm_out.data.eq(self.dec.LI[0:-1]<<2)
+                comb += self.imm_out.ok.eq(1)
+            with m.Case(In2Sel.CONST_BD):
+                comb += self.imm_out.data.eq(self.dec.BD[0:-1]<<2)
+                comb += self.imm_out.ok.eq(1)
+            with m.Case(In2Sel.CONST_DS):
+                comb += self.imm_out.data.eq(self.dec.DS[0:-1]<<2)
+                comb += self.imm_out.ok.eq(1)
+            with m.Case(In2Sel.CONST_M1):
+                comb += self.imm_out.data.eq(~Const(0, 64)) # all 1s
+                comb += self.imm_out.ok.eq(1)
+            with m.Case(In2Sel.CONST_SH):
+                comb += self.imm_out.data.eq(self.dec.sh[0:-1])
+                comb += self.imm_out.ok.eq(1)
+            with m.Case(In2Sel.CONST_SH32):
+                comb += self.imm_out.data.eq(self.dec.SH32[0:-1])
+                comb += self.imm_out.ok.eq(1)
+
+        # decode SPR2 based on instruction type
+        op = self.dec.op
+        # BCREG implicitly uses CTR or LR for 2nd reg
+        with m.If(op.internal_op == InternalOp.OP_BCREG):
+            with m.If(self.dec.FormXL.XO[9]): # 3.0B p38 top bit of XO
+                comb += self.spr_out.data.eq(SPR.CTR)
+            with m.Else():
+                comb += self.spr_out.data.eq(SPR.LR)
+            comb += self.spr_out.ok.eq(1)
+
+        return m
+
+
+class DecodeC(Elaboratable):
+    """DecodeC from instruction
+
+    decodes register RC
+    """
+
+    def __init__(self, dec):
+        self.dec = dec
+        self.sel_in = Signal(In3Sel, reset_less=True)
+        self.insn_in = Signal(32, reset_less=True)
+        self.reg_out = Data(5, "reg_c")
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # select Register C field
+        with m.If(self.sel_in == In3Sel.RS):
+            comb += self.reg_out.data.eq(self.dec.RS[0:-1])
+            comb += self.reg_out.ok.eq(1)
+
+        return m
+
+
+class DecodeOut(Elaboratable):
+    """DecodeOut from instruction
+
+    decodes output register RA, RT or SPR
+    """
+
+    def __init__(self, dec):
+        self.dec = dec
+        self.sel_in = Signal(OutSel, reset_less=True)
+        self.insn_in = Signal(32, reset_less=True)
+        self.reg_out = Data(5, "reg_o")
+        self.spr_out = Data(10, "spr_o")
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # select Register out field
+        with m.Switch(self.sel_in):
+            with m.Case(OutSel.RT):
+                comb += self.reg_out.data.eq(self.dec.RT[0:-1])
+                comb += self.reg_out.ok.eq(1)
+            with m.Case(OutSel.RA):
+                comb += self.reg_out.data.eq(self.dec.RA[0:-1])
+                comb += self.reg_out.ok.eq(1)
+            with m.Case(OutSel.SPR):
+                comb += self.spr_out.data.eq(self.dec.SPR[0:-1]) # from XFX
+                comb += self.spr_out.ok.eq(1)
+
+        return m
+
+
+class DecodeRC(Elaboratable):
+    """DecodeRc from instruction
+
+    decodes Record bit Rc
+    """
+    def __init__(self, dec):
+        self.dec = dec
+        self.sel_in = Signal(RC, reset_less=True)
+        self.insn_in = Signal(32, reset_less=True)
+        self.rc_out = Data(1, "rc")
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # select Record bit out field
+        with m.Switch(self.sel_in):
+            with m.Case(RC.RC):
+                comb += self.rc_out.data.eq(self.dec.Rc[0:-1])
+                comb += self.rc_out.ok.eq(1)
+            with m.Case(RC.ONE):
+                comb += self.rc_out.data.eq(1)
+                comb += self.rc_out.ok.eq(1)
+            with m.Case(RC.NONE):
+                comb += self.rc_out.data.eq(0)
+                comb += self.rc_out.ok.eq(1)
+
+        return m
+
+
+class DecodeOE(Elaboratable):
+    """DecodeOE from instruction
+
+    decodes OE field: uses RC decode detection which might not be good
+
+    -- For now, use "rc" in the decode table to decide whether oe exists.
+    -- This is not entirely correct architecturally: For mulhd and
+    -- mulhdu, the OE field is reserved. It remains to be seen what an
+    -- actual POWER9 does if we set it on those instructions, for now we
+    -- test that further down when assigning to the multiplier oe input.
+    """
+    def __init__(self, dec):
+        self.dec = dec
+        self.sel_in = Signal(RC, reset_less=True)
+        self.insn_in = Signal(32, reset_less=True)
+        self.oe_out = Data(1, "oe")
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # select OE bit out field
+        with m.Switch(self.sel_in):
+            with m.Case(RC.RC):
+                comb += self.oe_out.data.eq(self.dec.OE[0:-1])
+                comb += self.oe_out.ok.eq(1)
+
+        return m
+
+
+class XerBits:
+    def __init__(self):
+        self.ca = Signal(reset_less=True)
+        self.ca32 = Signal(reset_less=True)
+        self.ov = Signal(reset_less=True)
+        self.ov32 = Signal(reset_less=True)
+        self.so = Signal(reset_less=True)
+
+    def ports(self):
+        return [self.ca, self.ca32, self.ov, self.ov32, self.so, ]
+
+
+class Decode2ToExecute1Type:
+
+    def __init__(self):
+
+        self.valid = Signal(reset_less=True)
+        self.insn_type = Signal(InternalOp, reset_less=True)
+        self.nia = Signal(64, reset_less=True)
+        self.write_reg = Data(5, name="rego")
+        self.read_reg1 = Data(5, name="reg1")
+        self.read_reg2 = Data(5, name="reg2")
+        self.read_reg3 = Data(5, name="reg3")
+        self.imm_data = Data(64, name="imm")
+        self.write_spr = Data(10, name="spro")
+        self.read_spr1 = Data(10, name="spr1")
+        self.read_spr2 = Data(10, name="spr2")
+        #self.read_data1 = Signal(64, reset_less=True)
+        #self.read_data2 = Signal(64, reset_less=True)
+        #self.read_data3 = Signal(64, reset_less=True)
+        #self.cr = Signal(32, reset_less=True) # NO: this is from the CR SPR
+        #self.xerc = XerBits() # NO: this is from the XER SPR
+        self.lk = Signal(reset_less=True)
+        self.rc = Data(1, "rc")
+        self.oe = Data(1, "oe")
+        self.invert_a = Signal(reset_less=True)
+        self.invert_out = Signal(reset_less=True)
+        self.input_carry = Signal(CryIn, reset_less=True)
+        self.output_carry = Signal(reset_less=True)
+        self.input_cr = Signal(reset_less=True)
+        self.output_cr = Signal(reset_less=True)
+        self.is_32bit = Signal(reset_less=True)
+        self.is_signed = Signal(reset_less=True)
+        self.insn = Signal(32, reset_less=True)
+        self.data_len = Signal(4, reset_less=True) # bytes
+        self.byte_reverse  = Signal(reset_less=True)
+        self.sign_extend  = Signal(reset_less=True)# do we need this?
+        self.update  = Signal(reset_less=True) # is this an update instruction?
+
+    def ports(self):
+        return [self.valid, self.insn_type, self.nia,
+                #self.read_data1, self.read_data2, self.read_data3,
+                #self.cr,
+                self.lk,
+                self.invert_a, self.invert_out,
+                self.input_carry, self.output_carry,
+                self.input_cr, self.output_cr,
+                self.is_32bit, self.is_signed,
+                self.insn,
+                self.data_len, self.byte_reverse , self.sign_extend ,
+                self.update] + \
+                self.oe.ports() + \
+                self.rc.ports() + \
+                self.write_spr.ports() + \
+                self.read_spr1.ports() + \
+                self.read_spr2.ports() + \
+                self.write_reg.ports() + \
+                self.read_reg1.ports() + \
+                self.read_reg2.ports() + \
+                self.read_reg3.ports() + \
+                self.imm_data.ports()
+                # + self.xerc.ports()
+
+class PowerDecode2(Elaboratable):
+
+    def __init__(self, dec):
+
+        self.dec = dec
+        self.e = Decode2ToExecute1Type()
+
+    def ports(self):
+        return self.dec.ports() + self.e.ports()
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # set up submodule decoders
+        m.submodules.dec = self.dec
+        m.submodules.dec_a = dec_a = DecodeA(self.dec)
+        m.submodules.dec_b = dec_b = DecodeB(self.dec)
+        m.submodules.dec_c = dec_c = DecodeC(self.dec)
+        m.submodules.dec_o = dec_o = DecodeOut(self.dec)
+        m.submodules.dec_rc = dec_rc = DecodeRC(self.dec)
+        m.submodules.dec_oe = dec_oe = DecodeOE(self.dec)
+
+        # copy instruction through...
+        for i in [self.e.insn, dec_a.insn_in, dec_b.insn_in,
+                  dec_c.insn_in, dec_o.insn_in, dec_rc.insn_in,
+                  dec_oe.insn_in]:
+            comb += i.eq(self.dec.opcode_in)
+
+        # ...and subdecoders' input fields
+        comb += dec_a.sel_in.eq(self.dec.op.in1_sel)
+        comb += dec_b.sel_in.eq(self.dec.op.in2_sel)
+        comb += dec_c.sel_in.eq(self.dec.op.in3_sel)
+        comb += dec_o.sel_in.eq(self.dec.op.out_sel)
+        comb += dec_rc.sel_in.eq(self.dec.op.rc_sel)
+        comb += dec_oe.sel_in.eq(self.dec.op.rc_sel) # XXX should be OE sel
+
+        # decode LD/ST length
+        with m.Switch(self.dec.op.ldst_len):
+            with m.Case(LdstLen.is1B):
+                comb += self.e.data_len.eq(1)
+            with m.Case(LdstLen.is2B):
+                comb += self.e.data_len.eq(2)
+            with m.Case(LdstLen.is4B):
+                comb += self.e.data_len.eq(4)
+            with m.Case(LdstLen.is8B):
+                comb += self.e.data_len.eq(8)
+
+        #comb += self.e.nia.eq(self.dec.nia) # XXX TODO
+        itype = Mux(self.dec.op.function_unit == Function.NONE,
+                    InternalOp.OP_ILLEGAL,
+                    self.dec.op.internal_op)
+        comb += self.e.insn_type.eq(itype)
+
+        # registers a, b, c and out
+        comb += self.e.read_reg1.eq(dec_a.reg_out)
+        comb += self.e.read_reg2.eq(dec_b.reg_out)
+        comb += self.e.read_reg3.eq(dec_c.reg_out)
+        comb += self.e.write_reg.eq(dec_o.reg_out)
+        comb += self.e.imm_data.eq(dec_b.imm_out)
+
+        # rc and oe out
+        comb += self.e.rc.eq(dec_rc.rc_out)
+        comb += self.e.oe.eq(dec_oe.oe_out)
+
+        # SPRs out
+        comb += self.e.read_spr1.eq(dec_a.spr_out)
+        comb += self.e.read_spr2.eq(dec_b.spr_out)
+        comb += self.e.write_spr.eq(dec_o.spr_out)
+
+        # decoded/selected instruction flags
+        comb += self.e.invert_a.eq(self.dec.op.inv_a)
+        comb += self.e.invert_out.eq(self.dec.op.inv_out)
+        comb += self.e.input_carry.eq(self.dec.op.cry_in)
+        comb += self.e.output_carry.eq(self.dec.op.cry_out)
+        comb += self.e.is_32bit.eq(self.dec.op.is_32b)
+        comb += self.e.is_signed.eq(self.dec.op.sgn)
+        with m.If(self.dec.op.lk):
+            comb += self.e.lk.eq(self.dec.LK[0:-1]) # XXX TODO: accessor
+
+        comb += self.e.byte_reverse.eq(self.dec.op.br)
+        comb += self.e.sign_extend.eq(self.dec.op.sgn_ext)
+        comb += self.e.update.eq(self.dec.op.upd)
+
+        comb += self.e.input_cr.eq(self.dec.op.cr_in)
+        comb += self.e.output_cr.eq(self.dec.op.cr_out)
+
+        return m
+
+
+if __name__ == '__main__':
+    pdecode = create_pdecode()
+    dec2 = PowerDecode2(pdecode)
+    vl = rtlil.convert(dec2, ports=dec2.ports() + pdecode.ports())
+    with open("dec2.il", "w") as f:
+        f.write(vl)
+
diff --git a/src/soc/decoder/power_enums.py b/src/soc/decoder/power_enums.py
new file mode 100644
index 00000000..dcf5cad2
--- /dev/null
+++ b/src/soc/decoder/power_enums.py
@@ -0,0 +1,229 @@
+from enum import Enum, unique
+import csv
+import os
+import requests
+
+
+def get_csv(name):
+    """Load an ISA table CSV, downloading and caching it if not present.
+
+    The file is looked up in the directory containing this module.  If it
+    is missing it is fetched from the libre-riscv.org isatables directory
+    and saved there before being parsed.
+
+    Arguments:
+    * name: file name of the table, e.g. "major.csv"
+
+    Returns a list with one mapping per CSV row, keyed by column header.
+
+    Raises requests.HTTPError if the download fails: without this check
+    the body of an error page would be cached on disk as if it were the
+    table, and every later call would silently parse that instead.
+    """
+    file_dir = os.path.dirname(os.path.realpath(__file__))
+    file_path = os.path.join(file_dir, name)
+    if not os.path.isfile(file_path):
+        url = 'https://libre-riscv.org/openpower/isatables/' + name
+        r = requests.get(url, allow_redirects=True)
+        # refuse to cache 4xx/5xx responses
+        r.raise_for_status()
+        with open(file_path, 'w') as outfile:
+            outfile.write(r.content.decode("utf-8"))
+    with open(file_path, 'r') as csvfile:
+        reader = csv.DictReader(csvfile)
+        return list(reader)
+
+
+# names of the fields in the tables that don't correspond to an enum.
+# these are CSV column names; get_signal_name() turns each into the
+# matching attribute name (e.g. '32b' -> 'is_32b', 'CR in' -> 'cr_in')
+single_bit_flags = ['CR in', 'CR out', 'inv A', 'inv out',
+                    'cry out', 'BR', 'sgn ext', 'upd', 'rsrv', '32b',
+                    'sgn', 'lk', 'sgl pipe']
+
+# default values for fields in the table: keys are CSV column names,
+# values are member names of the enum classes defined below
+default_values = {'unit': "NONE", 'internal op': "OP_ILLEGAL",
+                   'in1': "RA", 'in2': 'NONE', 'in3': 'NONE', 'out': 'NONE',
+                   'ldst len': 'NONE',
+                   'rc' : 'NONE', 'cry in' : 'ZERO', 'form': 'NONE'}
+
+def get_signal_name(name):
+    """Turn a table column name into an identifier-style signal name.
+
+    Names starting with a digit get an "is_" prefix; the result is
+    lower-cased with spaces replaced by underscores.
+    """
+    prefix = "is_" if name[0].isdigit() else ""
+    return (prefix + name).lower().replace(' ', '_')
+
+
+@unique
+class Function(Enum):
+    """Values for the 'unit' column of the tables (default: NONE)."""
+    NONE = 0
+    ALU = 1
+    LDST = 2
+
+
+@unique
+class Form(Enum):
+    """Values for the 'form' column of the tables (default: NONE).
+
+    NOTE(review): member names presumably follow the Power ISA
+    instruction Form names -- confirm against the spec.
+    """
+    NONE = 0
+    I = 1
+    B = 2
+    SC = 3
+    D = 4
+    DS = 5
+    DQ = 6
+    DX = 7
+    X = 8
+    XL = 9
+    XFX = 10
+    XFL = 11
+    XX1 = 12
+    XX2 = 13
+    XX3 = 14
+    XX4 = 15
+    XS = 16
+    XO = 17
+    A = 18
+    M = 19
+    MD = 20
+    MDS = 21
+    VA = 22
+    VC = 23
+    VX = 24
+    EVX = 25
+    EVS = 26
+    Z22 = 27
+    Z23 = 28
+
+
+
+@unique
+class InternalOp(Enum):
+    """Values for the 'internal op' column of the tables.
+
+    OP_ILLEGAL (0) is the default for rows that give no operation.
+    """
+    OP_ILLEGAL = 0
+    OP_NOP = 1
+    OP_ADD = 2
+    OP_ADDPCIS = 3
+    OP_AND = 4
+    OP_ATTN = 5
+    OP_B = 6
+    OP_BC = 7
+    OP_BCREG = 8
+    OP_BPERM = 9
+    OP_CMP = 10
+    OP_CMPB = 11
+    OP_CMPEQB = 12
+    OP_CMPRB = 13
+    OP_CNTZ = 14
+    OP_CRAND = 15
+    OP_CRANDC = 16
+    OP_CREQV = 17
+    OP_CRNAND = 18
+    OP_CRNOR = 19
+    OP_CROR = 20
+    OP_CRORC = 21
+    OP_CRXOR = 22
+    OP_DARN = 23
+    OP_DCBF = 24
+    OP_DCBST = 25
+    OP_DCBT = 26
+    OP_DCBTST = 27
+    OP_DCBZ = 28
+    OP_DIV = 29
+    OP_DIVE = 30
+    OP_EXTS = 31
+    OP_EXTSWSLI = 32
+    OP_ICBI = 33
+    OP_ICBT = 34
+    OP_ISEL = 35
+    OP_ISYNC = 36
+    OP_LOAD = 37
+    OP_STORE = 38
+    OP_MADDHD = 39
+    OP_MADDHDU = 40
+    OP_MADDLD = 41
+    OP_MCRF = 42
+    OP_MCRXR = 43
+    OP_MCRXRX = 44
+    OP_MFCR = 45
+    OP_MFSPR = 46
+    OP_MOD = 47
+    OP_MTCRF = 48
+    OP_MTSPR = 49
+    OP_MUL_L64 = 50
+    OP_MUL_H64 = 51
+    OP_MUL_H32 = 52
+    OP_OR = 53
+    OP_POPCNT = 54
+    OP_PRTY = 55
+    OP_RLC = 56
+    OP_RLCL = 57
+    OP_RLCR = 58
+    OP_SETB = 59
+    OP_SHL = 60
+    OP_SHR = 61
+    OP_SYNC = 62
+    OP_TD = 63
+    OP_TDI = 64
+    OP_TW = 65
+    OP_TWI = 66
+    OP_XOR = 67
+    OP_SIM_CONFIG = 68
+
+
+@unique
+class In1Sel(Enum):
+    """Values for the 'in1' column of the tables (default: RA)."""
+    RA = 0
+    RA_OR_ZERO = 1
+    NONE = 2
+    SPR = 3
+
+
+@unique
+class In2Sel(Enum):
+    """Values for the 'in2' column of the tables (default: NONE)."""
+    NONE = 0
+    RB = 1
+    CONST_UI = 2
+    CONST_SI = 3
+    CONST_UI_HI = 4
+    CONST_SI_HI = 5
+    CONST_LI = 6
+    CONST_BD = 7
+    CONST_DS = 8
+    CONST_M1 = 9
+    CONST_SH = 10
+    CONST_SH32 = 11
+    SPR = 12
+
+
+@unique
+class In3Sel(Enum):
+    """Values for the 'in3' column of the tables (default: NONE)."""
+    NONE = 0
+    RS = 1
+
+
+@unique
+class OutSel(Enum):
+    """Values for the 'out' column of the tables (default: NONE)."""
+    NONE = 0
+    RT = 1
+    RA = 2
+    SPR = 3
+
+
+@unique
+class LdstLen(Enum):
+    """Values for the 'ldst len' column of the tables (default: NONE).
+
+    NOTE(review): isNB presumably means an N-byte access -- confirm.
+    """
+    NONE = 0
+    is1B = 1
+    is2B = 2
+    is4B = 3
+    is8B = 4
+
+
+@unique
+class RC(Enum):
+    """Values for the 'rc' column of the tables (default: NONE)."""
+    NONE = 0
+    ONE = 1
+    RC = 2
+
+
+@unique
+class CryIn(Enum):
+    """Values for the 'cry in' column of the tables (default: ZERO)."""
+    ZERO = 0
+    ONE = 1
+    CA = 2
+
+@unique
+class SPR(Enum):
+    """Named SPR identifiers.
+
+    NOTE(review): the values are presumably the Power ISA SPR numbers
+    (they are not sequential) -- confirm against the spec.
+    """
+    XER    = 1
+    LR     = 8
+    CTR    = 9
+    TB     = 268
+    SRR0   = 26
+    SRR1   = 27
+    HSRR0  = 314
+    HSRR1  = 315
+    SPRG0  = 272
+    SPRG1  = 273
+    SPRG2  = 274
+    SPRG3  = 275
+    SPRG3U = 259
+    HSPRG0 = 304
+    HSPRG1 = 305
+
diff --git a/src/soc/decoder/power_fields.py b/src/soc/decoder/power_fields.py
new file mode 100644
index 00000000..3457331e
--- /dev/null
+++ b/src/soc/decoder/power_fields.py
@@ -0,0 +1,242 @@
+from collections import OrderedDict, namedtuple
+
+
+class BitRange(OrderedDict):
+    """BitRange: remaps from straight indices (0,1,2..) to bit numbers
+
+    Keys are the straight indices, values are the bit numbers.
+
+    * a plain (non-slice) subscript returns the bit number stored under
+      that key, raising KeyError if it is absent
+    * a slice subscript returns the list of bit numbers, in insertion
+      order, selected by that slice
+    """
+    def __getitem__(self, subscript):
+        if isinstance(subscript, slice):
+            # slices select from the bit numbers (values), not the indices
+            return list(self.values())[subscript]
+        # go straight to the base class: self[subscript] would re-enter
+        # this method and recurse until RecursionError
+        return OrderedDict.__getitem__(self, subscript)
+
+def decode_instructions(form):
+    """Group field definitions by the instruction formats that use them.
+
+    Lines are accumulated until one starting with "Formats" is seen.
+    The first accumulated line is then filed under every format named
+    (comma-separated) after the last ':' on that line, and accumulation
+    starts afresh.
+
+    Returns a dict mapping format name to a list of those first lines.
+    """
+    by_format = {}
+    pending = []
+    for raw in form:
+        text = raw.strip()
+        if not text.startswith("Formats"):
+            pending.append(text)
+            continue
+        # "Formats: A, B" -> ["A", "B"]
+        names = text.split(":")[-1].replace(" ", "").split(",")
+        for fmt in names:
+            by_format.setdefault(fmt, []).append(pending[0])
+        pending = []
+    return by_format
+
+def decode_form_header(hdr):
+    """Map column offsets in a '|'-separated header to bit numbers.
+
+    Each non-empty column advances the running offset by its length
+    plus one (for the separator).  A column whose first character is a
+    digit records offset -> the integer at the start of the column.
+    The split header is also printed.
+    """
+    columns = hdr.strip().split("|")
+    print (columns)
+    offsets = {}
+    pos = 0
+    for col in columns:
+        if col == "":
+            continue
+        if col[0].isdigit():
+            offsets[pos] = int(col.strip().split(' ')[0])
+        pos += len(col) + 1
+    return offsets
+
+def find_unique(d, key):
+    """Return key if it is not in d, else the first free "key_N" (N>=1)."""
+    if key not in d:
+        return key
+    suffix = 1
+    candidate = "%s_%d" % (key, suffix)
+    while candidate in d:
+        suffix += 1
+        candidate = "%s_%d" % (key, suffix)
+    return candidate
+
+
+def decode_line(header, line):
+    """Decode one row of a Form table into field bit ranges.
+
+    Arguments:
+    * header: dict from decode_form_header, mapping the character offset
+      of each column to its starting bit number
+    * line: the row text, with field names separated by '|'
+
+    Returns a dict mapping each field name to a (start, end) tuple, where
+    end is the starting bit of the following column (32 for a field that
+    runs to the end of the row).  Blank columns and columns starting
+    with '/' are not recorded, but still terminate the preceding field.
+    A row with no named fields gives an empty dict.
+
+    Raises KeyError if a column offset that is needed is not in header.
+    """
+    line = line.strip()
+    res = {}
+    count = 0
+    print ("line", line)
+    prev_fieldname = None
+    for f in line.split("|"):
+        if not f:
+            continue
+        end = count + len(f) + 1
+        fieldname = f.strip()
+        if not fieldname or fieldname.startswith('/'):
+            if prev_fieldname is not None:
+                res[prev_fieldname] = (res[prev_fieldname], header[count])
+                prev_fieldname = None
+            count = end
+            continue
+        bitstart = header[count]
+        if prev_fieldname is not None:
+            res[prev_fieldname] = (res[prev_fieldname], bitstart)
+        res[fieldname] = bitstart
+        count = end
+        prev_fieldname = fieldname
+    # only close a field that is still open: a row that is empty, or that
+    # ends in a blank/reserved column, would otherwise hit an unbound
+    # bitstart or add a bogus None key here
+    if prev_fieldname is not None:
+        res[prev_fieldname] = (res[prev_fieldname], 32)
+    return res
+
+
+def decode_form(form):
+    """Merge the decoded rows of one Form table into a single field dict.
+
+    form[0] is the header, the remaining entries are rows.  A field that
+    appears again with a different (start, end) is stored under
+    "name_N", with N counting up per name; a repeat of the original or
+    of the most recent alternate is ignored.
+    """
+    header = decode_form_header(form[0])
+    print ("header", header)
+    decoded = []
+    for line in form[1:]:
+        dec = decode_line(header, line)
+        if dec:
+            decoded.append(dec)
+    fields = {}
+    falternate = {}
+    for entry in decoded:
+        for name, (start, end) in entry.items():
+            span = (start, end)
+            if name not in fields:
+                fields[name] = span
+                continue
+            if fields[name] == span:
+                continue # already in and matching for this Form
+            if name in falternate:
+                latest = "%s_%d" % (name, falternate[name])
+                if fields[latest] == span:
+                    continue
+            fidx = falternate.get(name, 0) + 1
+            falternate[name] = fidx
+            fields["%s_%d" % (name, fidx)] = span
+    return fields
+
+
+class DecodeFields:
+    """Parses a fields text file into per-Form collections of bit ranges.
+
+    After create_specs() has run, every format found in the "Fields"
+    section is available as self.Form<name>, a namedtuple whose members
+    are bitkls instances (index -> bit number), and a set of commonly
+    used fields is available directly (self.RA, self.Rc, ...).
+
+    Arguments:
+    * bitkls: class used to hold each field's bits (default BitRange)
+    * bitargs: positional arguments passed to bitkls on construction
+    * fname: path of the text file to parse
+    """
+
+    def __init__(self, bitkls=BitRange, bitargs=(), fname="fields.txt"):
+        self.bitkls = bitkls
+        self.bitargs = bitargs
+        self.fname = fname
+
+    def create_specs(self):
+        """Parse the file and create the FormXX and shortcut attributes.
+
+        Raises AttributeError if a Form or field used by one of the
+        shortcuts below is not present in the parsed file.
+        """
+        self.forms, self.instrs = self.decode_fields()
+        self.form_names = forms = self.instrs.keys()
+        for form in forms:
+            fields = self.instrs[form]
+            fk = fields.keys()
+            Fields = namedtuple("Fields", fk)
+            instr = Fields(**fields)
+            setattr(self, "Form%s" % form, instr)
+        # now add in some commonly-used fields (should be done automatically)
+        # note that these should only be ones which are the same on all Forms
+        # note: these are from microwatt insn_helpers.vhdl
+        self.RS = self.FormX.RS
+        self.RT = self.FormX.RT
+        self.RA = self.FormX.RA
+        self.RB = self.FormX.RB
+        self.SI = self.FormD.SI
+        self.UI = self.FormD.UI
+        self.L = self.FormD.L
+        self.SH32 = self.FormM.SH
+        self.sh = self.FormMD.sh
+        self.MB32 = self.FormM.MB
+        self.ME32 = self.FormM.ME
+        self.LI = self.FormI.LI
+        self.LK = self.FormI.LK
+        self.AA = self.FormB.AA
+        self.Rc = self.FormX.Rc
+        # OE is its own XO-Form field: it was previously aliased to Rc
+        self.OE = self.FormXO.OE
+        self.BD = self.FormB.BD
+        self.BF = self.FormX.BF
+        self.CR = self.FormXL.XO # used by further mcrf decoding
+        self.BB = self.FormXL.BB
+        self.BA = self.FormXL.BA
+        self.BT = self.FormXL.BT
+        self.FXM = self.FormXFX.FXM
+        self.BO = self.FormXL.BO
+        self.BI = self.FormXL.BI
+        self.BH = self.FormXL.BH
+        self.D = self.FormD.D
+        self.DS = self.FormDS.DS
+        self.TO = self.FormX.TO
+        self.BC = self.FormA.BC
+        self.SH = self.FormX.SH
+        self.ME = self.FormM.ME
+        self.MB = self.FormM.MB
+        self.SPR = self.FormXFX.SPR
+
+    def decode_fields(self):
+        """Read self.fname and decode its "Fields" section.
+
+        The file is a sequence of sections, each started by a '#' line;
+        a section is named after the last space-separated word of that
+        line.  Blank lines are ignored.
+
+        Returns (res, inst): res is currently always an empty dict (Form
+        table decoding is disabled below); inst maps each format name to
+        the dict produced by decode_instruction_fields.
+        """
+        with open(self.fname) as f:
+            txt = f.readlines()
+        forms = {}
+        reading_data = False
+        for l in txt:
+            print ("line", l)
+            l = l.strip()
+            if len(l) == 0:
+                continue
+            if reading_data:
+                if l[0] == '#':
+                    # a new heading ends the current section: fall
+                    # through to the heading handling below
+                    reading_data = False
+                else:
+                    forms[heading].append(l)
+            if not reading_data:
+                assert l[0] == '#'
+                heading = l[1:].strip()
+                #if heading.startswith('1.6.28'): # skip instr fields for now
+                    #break
+                heading = heading.split(' ')[-1]
+                print ("heading", heading)
+                reading_data = True
+                forms[heading] = []
+
+        res = {}
+        inst = {}
+
+        for hdr, form in forms.items():
+            print ("heading", hdr)
+            # test the section being visited, not the leftover "heading"
+            # from the parse loop (which is always the file's last section)
+            if hdr == 'Fields':
+                i = decode_instructions(form)
+                for form_name, field in i.items():
+                    inst[form_name] = self.decode_instruction_fields(field)
+            #else:
+            #    res[hdr] = decode_form(form)
+        return res, inst
+
+    def decode_instruction_fields(self, fields):
+        """Convert "NAME (bits)" strings into a dict of bitkls objects.
+
+        bits is a comma-separated list of single bit numbers and
+        inclusive start:end ranges.  Each bit is stored in a new
+        bitkls(*bitargs) under consecutive indices starting from 0.
+        Commas in NAME become underscores, and a NAME seen more than
+        once is made unique with a _N suffix.
+        """
+        res = {}
+        for field in fields:
+            f, spec = field.strip().split(" ")
+            d = self.bitkls(*self.bitargs)
+            idx = 0
+            # spec[1:-1] drops the enclosing brackets
+            for s in spec[1:-1].split(","):
+                s = s.split(':')
+                if len(s) == 1:
+                    d[idx] = int(s[0])
+                    idx += 1
+                else:
+                    start = int(s[0])
+                    end = int(s[1])
+                    while start <= end:
+                        d[idx] = start
+                        idx += 1
+                        start += 1
+            f = f.replace(",", "_")
+            unique = find_unique(res, f)
+            res[unique] = d
+
+        return res
+
+if __name__ == '__main__':
+    # script entry point: parse the default fields file and dump the result
+    dec = DecodeFields()
+    dec.create_specs()
+    forms, instrs = dec.forms, dec.instrs
+    # decode_fields() returns its first result unfilled (Form table
+    # decoding is commented out there), so this loop has nothing to print
+    for hdr, form in forms.items():
+        print ()
+        print (hdr)
+        for k, v in form.items():
+            #print ("line", l)
+            #for k, v in l.items():
+            print ("%s: %d-%d" % (k, v[0], v[1]))
+    # every format, with the bit ranges of each of its fields
+    for form, field in instrs.items():
+        print ()
+        print (form)
+        for f, vals in field.items():
+            print ("    ", f, vals)
+    # NOTE(review): assumes the X Form has a field named "A" -- confirm
+    print (dec.FormX)
+    print (dec.FormX.A)
+    print (dir(dec.FormX))
+    print (dec.FormX._fields)
diff --git a/src/soc/decoder/power_fieldsn.py b/src/soc/decoder/power_fieldsn.py
new file mode 100644
index 00000000..e603bbd3
--- /dev/null
+++ b/src/soc/decoder/power_fieldsn.py
@@ -0,0 +1,74 @@
+from collections import OrderedDict
+from power_fields import DecodeFields, BitRange
+from nmigen import Module, Elaboratable, Signal, Cat
+from nmigen.cli import rtlil
+
+
+class SignalBitRange(BitRange):
+    """BitRange whose subscripts return bits of a signal.
+
+    A stored bit number k is translated to signal[width-k-1], where
+    width is the first element of signal.shape().
+
+    * a plain subscript returns that single bit of the signal
+    * a slice returns Cat() of the selected bits.  A stop of None is
+      treated as -1, and negative bounds count back from one past the
+      end, so [0:-1] selects every entry, [0:-2] all but the last
+    """
+    def __init__(self, signal):
+        BitRange.__init__(self)
+        self.signal = signal
+
+    def __getitem__(self, subs):
+        # *sigh* field numberings are bit-inverted.  PowerISA 3.0B section 1.3.2
+        width = self.signal.shape()[0]
+        print (dir(self))
+        print (self.items())
+        if isinstance(subs, slice):
+            res = []
+            print (subs)
+            start, stop, step = subs.start, subs.stop, subs.step
+            if step is None:
+                step = 1
+            if start is None:
+                start = 0
+            if stop is None:
+                stop = -1
+            # -1 maps to len(self): the previous "len - x - 1" form gave the
+            # same answer for -1 but ran past the end for any other value
+            if start < 0:
+                start = len(self) + start + 1
+            if stop < 0:
+                stop = len(self) + stop + 1
+            print ("range", start, stop, step)
+            for t in range(start, stop, step):
+                k = OrderedDict.__getitem__(self, t)
+                print ("t", t, k)
+                res.append(self.signal[width-k-1])
+            return Cat(*res)
+        k = OrderedDict.__getitem__(self, subs)
+        return self.signal[width-k-1]
+
+
+class SigDecode(Elaboratable):
+    """Example decoder: extracts three fields from opcode_in.
+
+    The fields come from a DecodeFields built with SignalBitRange, so
+    subscripting a field yields bits of opcode_in directly.
+    """
+
+    def __init__(self, width):
+        # width: number of bits in opcode_in
+        self.opcode_in = Signal(width, reset_less=False)
+        self.df = DecodeFields(SignalBitRange, [self.opcode_in])
+        self.df.create_specs()
+        # outputs are sized from the number of bits in each field
+        self.x_s = Signal(len(self.df.FormX.S), reset_less=True)
+        self.x_sh = Signal(len(self.df.FormX.SH), reset_less=True)
+        self.dq_xs_s = Signal(len(self.df.FormDQ.SX_S), reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        # [0] picks a single bit; [0:-1] picks the whole field
+        comb += self.x_s.eq(self.df.FormX.S[0])
+        comb += self.x_sh.eq(self.df.FormX.SH[0:-1])
+        comb += self.dq_xs_s.eq(self.df.FormDQ.SX_S[0:-1])
+        return m
+
+    def ports(self):
+        # NOTE(review): dq_xs_s is not listed here -- confirm whether
+        # that is intentional
+        return [self.opcode_in, self.x_s, self.x_sh]
+
+def create_sigdecode():
+    """Return a SigDecode for a 32-bit opcode."""
+    return SigDecode(32)
+
+if __name__ == '__main__':
+    # script entry point: write the example decoder out as RTLIL
+    sigdecode = create_sigdecode()
+    vl = rtlil.convert(sigdecode, ports=sigdecode.ports())
+    with open("decoder.il", "w") as f:
+        f.write(vl)
+
diff --git a/src/soc/decoder/test/test_power_decoder.py b/src/soc/decoder/test/test_power_decoder.py
new file mode 100644
index 00000000..f64f4b96
--- /dev/null
+++ b/src/soc/decoder/test/test_power_decoder.py
@@ -0,0 +1,130 @@
+from nmigen import Module, Signal
+from nmigen.back.pysim import Simulator, Delay
+from nmigen.test.utils import FHDLTestCase
+from nmigen.cli import rtlil
+import sys
+import os
+import unittest
+sys.path.append("../")
+from power_decoder import (PowerDecoder, pdecode)
+from power_enums import (Function, InternalOp, In1Sel, In2Sel, In3Sel,
+                         OutSel, RC, LdstLen, CryIn, single_bit_flags,
+                         get_signal_name, get_csv)
+
+
+class DecoderTestCase(FHDLTestCase):
+    """Checks the decoder outputs against the rows of the CSV tables."""
+
+    def run_tst(self, bitsel, csvname, minor=None, suffix=None, opint=True):
+        """Simulate the decoder for every row of one CSV table.
+
+        Arguments:
+        * bitsel: (start, end) slice of the opcode that receives the
+          row's 'opcode' value
+        * csvname: name of the table, loaded with get_csv
+        * minor: optional (value, (start, end)): value is written to
+          that slice of the opcode as well
+        * suffix: not used by the live code (only by the commented-out
+          PowerDecoder construction below)
+        * opint: if False the row's opcode is treated as a bit pattern,
+          with '-' replaced by '0'
+        """
+        m = Module()
+        comb = m.d.comb
+        opcode = Signal(32)
+        function_unit = Signal(Function)
+        internal_op = Signal(InternalOp)
+        in1_sel = Signal(In1Sel)
+        in2_sel = Signal(In2Sel)
+        in3_sel = Signal(In3Sel)
+        out_sel = Signal(OutSel)
+        rc_sel = Signal(RC)
+        ldst_len = Signal(LdstLen)
+        cry_in = Signal(CryIn)
+
+        # opcodes = get_csv(csvname)
+        # m.submodules.dut = dut = PowerDecoder(32, opcodes, bitsel=bitsel,
+        #                                       opint=opint, suffix=suffix)
+        # the imported pdecode object is the device under test
+        m.submodules.dut = dut = pdecode
+        comb += [dut.opcode_in.eq(opcode),
+                 function_unit.eq(dut.op.function_unit),
+                 in1_sel.eq(dut.op.in1_sel),
+                 in2_sel.eq(dut.op.in2_sel),
+                 in3_sel.eq(dut.op.in3_sel),
+                 out_sel.eq(dut.op.out_sel),
+                 rc_sel.eq(dut.op.rc_sel),
+                 ldst_len.eq(dut.op.ldst_len),
+                 cry_in.eq(dut.op.cry_in),
+                 internal_op.eq(dut.op.internal_op)]
+
+        sim = Simulator(m)
+        opcodes = get_csv(csvname)
+
+        def process():
+            for row in opcodes:
+                # rows with an empty 'unit' column are not checked
+                if not row['unit']:
+                    continue
+                op = row['opcode']
+                if not opint: # HACK: convert 001---10 to 0b00100010
+                    op = "0b" + op.replace('-', '0')
+                print ("opint", opint, row['opcode'], op)
+                print(row)
+                # clear the opcode, then fill in the selected bits
+                yield opcode.eq(0)
+                yield opcode[bitsel[0]:bitsel[1]].eq(int(op, 0))
+                if minor:
+                    print(minor)
+                    minorbits = minor[1]
+                    yield opcode[minorbits[0]:minorbits[1]].eq(minor[0])
+                yield Delay(1e-6)
+                # (signal, enum class, CSV column) for each enum output
+                signals = [(function_unit, Function, 'unit'),
+                           (internal_op, InternalOp, 'internal op'),
+                           (in1_sel, In1Sel, 'in1'),
+                           (in2_sel, In2Sel, 'in2'),
+                           (in3_sel, In3Sel, 'in3'),
+                           (out_sel, OutSel, 'out'),
+                           (rc_sel, RC, 'rc'),
+                           (cry_in, CryIn, 'cry in'),
+                           (ldst_len, LdstLen, 'ldst len')]
+                for sig, enm, name in signals:
+                    result = yield sig
+                    # the CSV holds the enum member's name
+                    expected = enm[row[name]]
+                    msg = f"{sig.name} == {enm(result)}, expected: {expected}"
+                    self.assertEqual(enm(result), expected, msg)
+                # the single-bit flags are integers in the CSV
+                for bit in single_bit_flags:
+                    sig = getattr(dut.op, get_signal_name(bit))
+                    result = yield sig
+                    expected = int(row[bit])
+                    msg = f"{sig.name} == {result}, expected: {expected}"
+                    self.assertEqual(expected, result, msg)
+        sim.add_process(process)
+        # trace files are named after the CSV file, minus its extension
+        prefix = os.path.splitext(csvname)[0]
+        with sim.write_vcd("%s.vcd" % prefix, "%s.gtkw" % prefix, traces=[
+                opcode, function_unit, internal_op,
+                in1_sel, in2_sel]):
+            sim.run()
+
+    def generate_ilang(self):
+        """Write the decoder out as RTLIL to decoder.il."""
+        vl = rtlil.convert(pdecode, ports=pdecode.ports())
+        with open("decoder.il", "w") as f:
+            f.write(vl)
+
+    def test_major(self):
+        self.run_tst((26, 32), "major.csv")
+        self.generate_ilang()
+
+    def test_minor_19(self):
+        self.run_tst((1, 11), "minor_19.csv", minor=(19, (26, 32)),
+                     suffix=(0, 5))
+
+    # def test_minor_19_00000(self):
+    #     self.run_tst((1, 11), "minor_19_00000.csv")
+
+    def test_minor_30(self):
+        self.run_tst((1, 5), "minor_30.csv", minor=(30, (26, 32)))
+
+    def test_minor_31(self):
+        self.run_tst((1, 11), "minor_31.csv", minor=(31, (26, 32)))
+
+    def test_minor_58(self):
+        self.run_tst((0, 2), "minor_58.csv", minor=(58, (26, 32)))
+
+    def test_minor_62(self):
+        self.run_tst((0, 2), "minor_62.csv", minor=(62, (26, 32)))
+
+
+    # #def test_minor_31_prefix(self):
+    # #    self.run_tst(10, "minor_31.csv", suffix=(5, 10))
+
+    # def test_extra(self):
+    #     self.run_tst(32, "extra.csv", opint=False)
+    #     self.generate_ilang(32, "extra.csv", opint=False)
+
+
+if __name__ == "__main__":
+    # run every test in this file when executed as a script
+    unittest.main()
diff --git a/src/soc/experiment/alu_hier.py b/src/soc/experiment/alu_hier.py
new file mode 100644
index 00000000..9659059c
--- /dev/null
+++ b/src/soc/experiment/alu_hier.py
@@ -0,0 +1,239 @@
+from nmigen import Elaboratable, Signal, Module, Const, Mux
+from nmigen.cli import main
+from nmigen.cli import verilog, rtlil
+
+import operator
+
+
+class Adder(Elaboratable):
+    """Combinatorial adder: o = a + b, on `width`-bit signals."""
+    def __init__(self, width):
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        comb += self.o.eq(self.a + self.b)
+        return m
+
+
+class Subtractor(Elaboratable):
+    """Combinatorial subtractor: o = a - b, on `width`-bit signals."""
+    def __init__(self, width):
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        comb += self.o.eq(self.a - self.b)
+        return m
+
+
+class Multiplier(Elaboratable):
+    """Combinatorial multiplier: o = a * b, on `width`-bit signals."""
+    def __init__(self, width):
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        comb += self.o.eq(self.a * self.b)
+        return m
+
+
+class Shifter(Elaboratable):
+    """Combinatorial right-shifter: o = a >> (b masked to `width` bits)."""
+    def __init__(self, width):
+        self.width = width
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width)
+
+    def elaborate(self, platform):
+        m = Module()
+        # shift amount: b ANDed with a mask of `width` one-bits
+        btrunc = Signal(self.width)
+        m.d.comb += [
+            btrunc.eq(self.b & Const((1<<self.width)-1)),
+            self.o.eq(self.a >> btrunc),
+        ]
+        return m
+
+
+class ALU(Elaboratable):
+    """Demo ALU with a fake, counter-based multi-cycle pipeline.
+
+    op selects the operation: 0=add, 1=sub, 2=mul, 3=shift.  All four
+    results are computed combinatorially; the selected one is captured
+    into o when a valid input is first accepted, and n_valid_o is
+    raised once the counter has run down to 1.
+    """
+    def __init__(self, width):
+        # handshake: p_* face the producer, n_* face the consumer
+        self.p_valid_i = Signal()
+        self.p_ready_o = Signal()
+        self.n_ready_i = Signal()
+        self.n_valid_o = Signal()
+        # cycles remaining before the output is flagged as valid
+        self.counter   = Signal(4)
+        self.op  = Signal(2)
+        self.a   = Signal(width)
+        self.b   = Signal(width)
+        self.o   = Signal(width)
+        self.width = width
+
+    def elaborate(self, platform):
+        m = Module()
+        add = Adder(self.width)
+        sub = Subtractor(self.width)
+        mul = Multiplier(self.width)
+        shf = Shifter(self.width)
+
+        m.submodules.add = add
+        m.submodules.sub = sub
+        m.submodules.mul = mul
+        m.submodules.shf = shf
+        # every sub-unit sees the same operands
+        for mod in [add, sub, mul, shf]:
+            m.d.comb += [
+                mod.a.eq(self.a),
+                mod.b.eq(self.b),
+            ]
+        go_now = Signal(reset_less=True) # testing no-delay ALU
+
+        with m.If(self.p_valid_i):
+            # input is valid. next check, if we already said "ready" or not
+            with m.If(~self.p_ready_o):
+                # we didn't say "ready" yet, so say so and initialise
+                m.d.sync += self.p_ready_o.eq(1)
+
+                # as this is a "fake" pipeline, just grab the output right now
+                # (case i selects the i-th unit: add, sub, mul, shf)
+                with m.Switch(self.op):
+                    for i, mod in enumerate([add, sub, mul, shf]):
+                        with m.Case(i):
+                            m.d.sync += self.o.eq(mod.o)
+                with m.If(self.op == 2): # MUL, to take 5 instructions
+                    m.d.sync += self.counter.eq(5)
+                with m.Elif(self.op == 3): # SHIFT to take 7
+                    m.d.sync += self.counter.eq(7)
+                with m.Elif(self.op == 1): # SUB to take 1, straight away
+                    m.d.sync += self.counter.eq(1)
+                    m.d.comb += go_now.eq(1)
+                with m.Else(): # ADD to take 2
+                    m.d.sync += self.counter.eq(2)
+        with m.Else():
+            # input says no longer valid, so drop ready as well.
+            # a "proper" ALU would have had to sync in the opcode and a/b ops
+            m.d.sync += self.p_ready_o.eq(0)
+
+        # ok so the counter's running: when it gets to 1, fire the output
+        with m.If((self.counter == 1) | go_now):
+            # set the output as valid if the recipient is ready for it
+            m.d.sync += self.n_valid_o.eq(1)
+        with m.If(self.n_ready_i & self.n_valid_o):
+            m.d.sync += self.n_valid_o.eq(0)
+            # recipient said it was ready: reset back to known-good.
+            m.d.sync += self.counter.eq(0) # reset the counter
+            m.d.sync += self.o.eq(0) # clear the output for tidiness sake
+
+        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
+        with m.If(self.counter > 1):
+            m.d.sync += self.counter.eq(self.counter - 1)
+
+        return m
+
+    def __iter__(self):
+        # only op, a, b and o are yielded (the handshake signals are not)
+        yield self.op
+        yield self.a
+        yield self.b
+        yield self.o
+
+    def ports(self):
+        return list(self)
+
+
+class BranchOp(Elaboratable):
+    """Combinatorial comparison: o = 1 if op(a, b) else 0.
+
+    op is a two-argument callable applied to the a and b signals.
+    """
+    def __init__(self, width, op):
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width)
+        self.op = op
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        comb += self.o.eq(Mux(self.op(self.a, self.b), 1, 0))
+        return m
+
+
+class BranchALU(Elaboratable):
+    """Demo branch-comparison ALU with a fake 5-cycle pipeline.
+
+    op selects the comparison: 0=gt, 1=lt, 2=eq, 3=ne.  The selected
+    result is captured into o when a valid input is first accepted, and
+    n_valid_o is raised once the counter has run down to 1.
+    """
+    def __init__(self, width):
+        # handshake: p_* face the producer, n_* face the consumer
+        self.p_valid_i = Signal()
+        self.p_ready_o = Signal()
+        self.n_ready_i = Signal()
+        self.n_valid_o = Signal()
+        # cycles remaining before the output is flagged as valid
+        self.counter   = Signal(4)
+        self.op  = Signal(2)
+        self.a   = Signal(width)
+        self.b   = Signal(width)
+        self.o   = Signal(width)
+        self.width = width
+
+    def elaborate(self, platform):
+        m = Module()
+        bgt = BranchOp(self.width, operator.gt)
+        blt = BranchOp(self.width, operator.lt)
+        beq = BranchOp(self.width, operator.eq)
+        bne = BranchOp(self.width, operator.ne)
+
+        m.submodules.bgt = bgt
+        m.submodules.blt = blt
+        m.submodules.beq = beq
+        m.submodules.bne = bne
+        # every comparison unit sees the same operands
+        for mod in [bgt, blt, beq, bne]:
+            m.d.comb += [
+                mod.a.eq(self.a),
+                mod.b.eq(self.b),
+            ]
+
+        # never driven here (the assignment below is commented out)
+        go_now = Signal(reset_less=True) # testing no-delay ALU
+        with m.If(self.p_valid_i):
+            # input is valid. next check, if we already said "ready" or not
+            with m.If(~self.p_ready_o):
+                # we didn't say "ready" yet, so say so and initialise
+                m.d.sync += self.p_ready_o.eq(1)
+
+                # as this is a "fake" pipeline, just grab the output right now
+                # (case i selects the i-th unit: bgt, blt, beq, bne)
+                with m.Switch(self.op):
+                    for i, mod in enumerate([bgt, blt, beq, bne]):
+                        with m.Case(i):
+                            m.d.sync += self.o.eq(mod.o)
+                m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake)
+                #m.d.comb += go_now.eq(1)
+        with m.Else():
+            # input says no longer valid, so drop ready as well.
+            # a "proper" ALU would have had to sync in the opcode and a/b ops
+            m.d.sync += self.p_ready_o.eq(0)
+
+        # ok so the counter's running: when it gets to 1, fire the output
+        with m.If((self.counter == 1) | go_now):
+            # set the output as valid if the recipient is ready for it
+            m.d.sync += self.n_valid_o.eq(1)
+        with m.If(self.n_ready_i & self.n_valid_o):
+            m.d.sync += self.n_valid_o.eq(0)
+            # recipient said it was ready: reset back to known-good.
+            m.d.sync += self.counter.eq(0) # reset the counter
+            m.d.sync += self.o.eq(0) # clear the output for tidiness sake
+
+        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
+        with m.If(self.counter > 1):
+            m.d.sync += self.counter.eq(self.counter - 1)
+
+        return m
+
+    def __iter__(self):
+        # only op, a, b and o are yielded (the handshake signals are not)
+        yield self.op
+        yield self.a
+        yield self.b
+        yield self.o
+
+    def ports(self):
+        return list(self)
+
+
+if __name__ == "__main__":
+    # script entry point: write 16-bit versions of both ALUs out as RTLIL
+    alu = ALU(width=16)
+    vl = rtlil.convert(alu, ports=alu.ports())
+    with open("test_alu.il", "w") as f:
+        f.write(vl)
+
+    alu = BranchALU(width=16)
+    vl = rtlil.convert(alu, ports=alu.ports())
+    with open("test_branch_alu.il", "w") as f:
+        f.write(vl)
+
diff --git a/src/soc/experiment/compalu.py b/src/soc/experiment/compalu.py
new file mode 100644
index 00000000..7da6b5cf
--- /dev/null
+++ b/src/soc/experiment/compalu.py
@@ -0,0 +1,207 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Mux, Elaboratable
+
+from nmutil.latch import SRLatch, latchregister
+
+""" Computation Unit (aka "ALU Manager").
+
+    This module runs a "revolving door" set of three latches, based on
+    * Issue
+    * Go_Read
+    * Go_Write
+    where one of them cannot be set on any given cycle.
+    (Note however that opc_l has been inverted (and qn used), due to SRLatch
+     default reset state being "0" rather than "1")
+
+    * When issue is first raised, a busy signal is sent out.
+      The src1 and src2 registers and the operand can be latched in
+      at this point
+
+    * Read request is set, which is acknowledged through the Scoreboard
+      to the priority picker, which generates (one and only one) Go_Read
+      at a time.  One of those will (eventually) be this Computation Unit.
+
+    * Once Go_Read is set, the src1/src2/operand latch door shuts (locking
+      src1/src2/operand in place), and the ALU is told to proceed.
+
+    * As this is currently a "demo" unit, a countdown timer is activated
+      to simulate an ALU "pipeline", which activates "write request release",
+      and the ALU's output is captured into a temporary register.
+
+    * Write request release will go through a similar process as Read request,
+      resulting (eventually) in Go_Write being asserted.
+
+    * When Go_Write is asserted, two things happen: (1) the data in the temp
+      register is placed combinatorially onto the output, and (2) the
+      req_l latch is cleared, busy is dropped, and the Comp Unit is back
+      through its revolving door to do another task.
+
+    Notes on oper_i:
+
+    * bits[0:2] are for the ALU, add=0, sub=1, shift=2, mul=3
+    * bit[2] is the immediate flag (bit[2]=1 == immediate mode)
+"""
+
+class ComputationUnitNoDelay(Elaboratable):
+    def __init__(self, rwid, opwid, alu):
+        self.opwid = opwid
+        self.rwid = rwid
+        self.alu = alu
+
+        self.counter = Signal(4)
+        self.go_rd_i = Signal(reset_less=True) # go read in
+        self.go_wr_i = Signal(reset_less=True) # go write in
+        self.issue_i = Signal(reset_less=True) # fn issue in
+        self.shadown_i = Signal(reset=1) # shadow function, defaults to ON
+        self.go_die_i = Signal() # go die (reset)
+
+        self.oper_i = Signal(opwid, reset_less=True) # opcode in
+        self.imm_i = Signal(rwid, reset_less=True) # immediate in
+        self.src1_i = Signal(rwid, reset_less=True) # oper1 in
+        self.src2_i = Signal(rwid, reset_less=True) # oper2 in
+
+        self.busy_o = Signal(reset_less=True) # fn busy out
+        self.data_o = Signal(rwid, reset_less=True) # Dest out
+        self.rd_rel_o = Signal(reset_less=True) # release src1/src2 request
+        self.req_rel_o = Signal(reset_less=True) # release request out (valid_o)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.alu = self.alu
+        m.submodules.src_l = src_l = SRLatch(sync=False)
+        m.submodules.opc_l = opc_l = SRLatch(sync=False)
+        m.submodules.req_l = req_l = SRLatch(sync=False)
+
+        # shadow/go_die
+        reset_w = Signal(reset_less=True)
+        reset_r = Signal(reset_less=True)
+        m.d.comb += reset_w.eq(self.go_wr_i | self.go_die_i)
+        m.d.comb += reset_r.eq(self.go_rd_i | self.go_die_i)
+
+        # This is fascinating and very important to observe that this
+        # is in effect a "3-way revolving door".  At no time may all 3
+        # latches be set at the same time.
+
+        # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
+        m.d.sync += opc_l.s.eq(self.issue_i) # XXX NOTE: INVERTED FROM book!
+        m.d.sync += opc_l.r.eq(reset_w)      # XXX NOTE: INVERTED FROM book!
+
+        # src operand latch (not using go_wr_i)
+        m.d.sync += src_l.s.eq(self.issue_i)
+        m.d.sync += src_l.r.eq(reset_r)
+
+        # dest operand latch (not using issue_i)
+        m.d.sync += req_l.s.eq(self.go_rd_i)
+        m.d.sync += req_l.r.eq(reset_w)
+
+
+        # create a latch/register for the operand
+        oper_r = Signal(self.opwid+1, reset_less=True) # opcode reg
+        latchregister(m, self.oper_i, oper_r, self.issue_i)
+
+        # and one for the output from the ALU
+        data_r = Signal(self.rwid, reset_less=True) # Dest register
+        latchregister(m, self.alu.o, data_r, req_l.q)
+
+        # get the bottom 2 bits (bits 0 and 1) for the ALU
+        m.d.comb += self.alu.op.eq(oper_r[0:2])
+
+        # 3rd bit is whether this is an immediate or not
+        op_is_imm = Signal(reset_less=True)
+        m.d.comb += op_is_imm.eq(oper_r[2])
+
+        # select immediate if opcode says so.  however also change the latch
+        # to trigger *from* the opcode latch instead.
+        src2_or_imm = Signal(self.rwid, reset_less=True)
+        src_sel = Signal(reset_less=True)
+        m.d.comb += src_sel.eq(Mux(op_is_imm, opc_l.qn, src_l.q))
+        m.d.comb += src2_or_imm.eq(Mux(op_is_imm, self.imm_i, self.src2_i))
+
+        # create a latch/register for src1/src2
+        latchregister(m, self.src1_i, self.alu.a, src_l.q)
+        latchregister(m, src2_or_imm, self.alu.b, src_sel)
+
+        # -----
+        # outputs
+        # -----
+
+        # all request signals gated by busy_o.  prevents picker problems
+        busy_o = self.busy_o
+        m.d.comb += busy_o.eq(opc_l.q) # busy out
+        m.d.comb += self.rd_rel_o.eq(src_l.q & busy_o) # src1/src2 req rel
+
+        # on a go_read, tell the ALU we're accepting data.
+        # NOTE: this spells TROUBLE if the ALU isn't ready!
+        # go_read is only valid for one clock!
+        with m.If(self.go_rd_i):                     # src operands ready, GO!
+            with m.If(~self.alu.p_ready_o):          # no ACK yet
+                m.d.comb += self.alu.p_valid_i.eq(1) # so indicate valid
+
+        # only proceed if ALU says its output is valid
+        with m.If(self.alu.n_valid_o):
+            # when ALU ready, write req release out. waits for shadow
+            m.d.comb += self.req_rel_o.eq(req_l.q & busy_o & self.shadown_i)
+            # when output latch is ready, and ALU says ready, accept ALU output
+            with m.If(self.req_rel_o):
+                m.d.comb += self.alu.n_ready_i.eq(1) # tells ALU "thanks got it"
+
+        # output the data from the latch on go_write
+        with m.If(self.go_wr_i):
+            m.d.comb += self.data_o.eq(data_r)
+
+        return m
+
+    def __iter__(self):
+        yield self.go_rd_i
+        yield self.go_wr_i
+        yield self.issue_i
+        yield self.shadown_i
+        yield self.go_die_i
+        yield self.oper_i
+        yield self.imm_i
+        yield self.src1_i
+        yield self.src2_i
+        yield self.busy_o
+        yield self.rd_rel_o
+        yield self.req_rel_o
+        yield self.data_o
+
+    def ports(self):
+        return list(self)
+
+
+def scoreboard_sim(dut):
+    """ issue, re-issue with src1=1, then pulse go_rd_i and go_wr_i """
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.src1_i.eq(1)
+    yield dut.issue_i.eq(1)
+    yield
+    yield
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1) # go-read: port is named go_rd_i on the unit
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1) # go-write: port is named go_wr_i on the unit
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+def test_scoreboard():
+    from alu_hier import ALU
+    alu = ALU(16)
+    dut = ComputationUnitNoDelay(16, 8, alu)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_compalu.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, scoreboard_sim(dut), vcd_name='test_compalu.vcd')
+
+if __name__ == '__main__':
+    test_scoreboard()
diff --git a/src/soc/experiment/compldst.py b/src/soc/experiment/compldst.py
new file mode 100644
index 00000000..77ad39dd
--- /dev/null
+++ b/src/soc/experiment/compldst.py
@@ -0,0 +1,288 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Mux, Cat, Elaboratable
+
+from nmutil.latch import SRLatch, latchregister
+
+""" LOAD / STORE Computation Unit.  Also capable of doing ADD and ADD immediate
+
+    This module runs a "revolving door" set of four latches, based on
+    * Issue
+    * Go_Read
+    * Go_Addr
+    * Go_Write *OR* Go_Store
+
+    (Note that opc_l has been inverted (and qn used), due to SRLatch
+     default reset state being "0" rather than "1")
+"""
+
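+# internal opcodes.  hypothetically this could do more combinations.
+# meanings (as decoded from the latched opcode in LDSTCompUnit.elaborate):
+# * bit 0: 0 = ADD , 1 = SUB
+# * bit 1: 0 = src2, 1 = IMM
+# * bit 2: 1 = LD, bit 3: 1 = ST
+# NOTE(review): ADDI/ADD, SUBI/SUB and ST/LD names look swapped vs these bits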
+LDST_OP_ADDI = 0b0000 # plain ADD (src1 + src2)
+LDST_OP_SUBI = 0b0001 # plain SUB (src1 - src2)
+LDST_OP_ADD  = 0b0010 # immed ADD (imm + src1)
+LDST_OP_SUB  = 0b0011 # immed SUB (imm - src1)
+LDST_OP_ST   = 0b0110 # immed ADD plus LD op.  ADD result is address
+LDST_OP_LD   = 0b1010 # immed ADD plus ST op.  ADD result is address
+
+
+class LDSTCompUnit(Elaboratable):
+    """ LOAD / STORE / ADD / SUB Computation Unit
+
+        Inputs
+        ------
+
+        * :rwid:   register width
+        * :alu:    an ALU module
+        * :mem:    a Memory Module (read-write capable)
+
+        Control Signals (In)
+        --------------------
+
+        * :issue_i:    LD/ST is being "issued".
+        * :isalu_i:    ADD/SUB is being "issued" (aka issue_alu_i)
+        * :shadown_i:  Inverted-shadow is being held (stops STORE *and* WRITE)
+        * :go_rd_i:    read is being actioned (latches in src regs)
+        * :go_ad_i:    address is being actioned (triggers actual mem LD)
+        * :go_st_i:    store is being actioned (triggers actual mem STORE)
+        * :go_die_i:   resets the unit back to "wait for issue"
+    """
+    def __init__(self, rwid, opwid, alu, mem):
+        self.opwid = opwid
+        self.rwid = rwid
+        self.alu = alu
+        self.mem = mem
+
+        self.counter = Signal(4)
+        self.go_rd_i = Signal(reset_less=True) # go read in
+        self.go_ad_i = Signal(reset_less=True) # go address in
+        self.go_wr_i = Signal(reset_less=True) # go write in
+        self.go_st_i = Signal(reset_less=True) # go store in
+        self.issue_i = Signal(reset_less=True) # fn issue in
+        self.isalu_i = Signal(reset_less=True) # fn issue as ALU in
+        self.shadown_i = Signal(reset=1) # shadow function, defaults to ON
+        self.go_die_i = Signal() # go die (reset)
+
+        self.oper_i = Signal(opwid, reset_less=True) # opcode in
+        self.imm_i = Signal(rwid, reset_less=True) # immediate in
+        self.src1_i = Signal(rwid, reset_less=True) # oper1 in
+        self.src2_i = Signal(rwid, reset_less=True) # oper2 in
+
+        self.busy_o = Signal(reset_less=True)       # fn busy out
+        self.rd_rel_o = Signal(reset_less=True) # request src1/src2
+        self.adr_rel_o = Signal(reset_less=True) # request address (from mem)
+        self.sto_rel_o = Signal(reset_less=True) # request store (to mem)
+        self.req_rel_o = Signal(reset_less=True) # request write (result)
+        self.data_o = Signal(rwid, reset_less=True) # Dest out (LD or ALU)
+        self.addr_o = Signal(rwid, reset_less=True) # Address out (LD or ST)
+
+        # hmm... TODO... move these to outside of LDSTCompUnit
+        self.load_mem_o = Signal(reset_less=True) # activate memory LOAD
+        self.stwd_mem_o = Signal(reset_less=True) # activate memory STORE
+        self.ld_o = Signal(reset_less=True) # operation is a LD
+        self.st_o = Signal(reset_less=True) # operation is a ST
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
+
+        m.submodules.alu = self.alu
+        m.submodules.src_l = src_l = SRLatch(sync=False)
+        m.submodules.opc_l = opc_l = SRLatch(sync=False)
+        m.submodules.adr_l = adr_l = SRLatch(sync=False)
+        m.submodules.req_l = req_l = SRLatch(sync=False)
+        m.submodules.sto_l = sto_l = SRLatch(sync=False)
+
+        # shadow/go_die
+        reset_b = Signal(reset_less=True)
+        reset_w = Signal(reset_less=True)
+        reset_a = Signal(reset_less=True)
+        reset_s = Signal(reset_less=True)
+        reset_r = Signal(reset_less=True)
+        comb += reset_b.eq(self.go_st_i | self.go_wr_i | self.go_die_i)
+        comb += reset_w.eq(self.go_wr_i | self.go_die_i)
+        comb += reset_s.eq(self.go_st_i | self.go_die_i)
+        comb += reset_r.eq(self.go_rd_i | self.go_die_i)
+        # this one is slightly different: isalu_i selects go_wr_i, else go_ad_i
+        a_sel = Mux(self.isalu_i, self.go_wr_i, self.go_ad_i)
+        comb += reset_a.eq(a_sel| self.go_die_i)
+
+        # opcode decode
+        op_alu = Signal(reset_less=True)
+        op_is_ld = Signal(reset_less=True)
+        op_is_st = Signal(reset_less=True)
+        op_ldst = Signal(reset_less=True)
+        op_is_imm = Signal(reset_less=True)
+
+        # select immediate or src2 reg to add
+        src2_or_imm = Signal(self.rwid, reset_less=True)
+        src_sel = Signal(reset_less=True)
+
+        # issue can be either issue_i or issue_alu_i (isalu_i)
+        issue_i = Signal(reset_less=True)
+        comb += issue_i.eq(self.issue_i | self.isalu_i)
+
+        # Ripple-down the latches, each one set cancels the previous.
+        # NOTE: use sync to stop combinatorial loops.
+
+        # opcode latch - inverted so that busy resets to 0
+        sync += opc_l.s.eq(issue_i) # XXX NOTE: INVERTED FROM book!
+        sync += opc_l.r.eq(reset_b) # XXX NOTE: INVERTED FROM book!
+
+        # src operand latch
+        sync += src_l.s.eq(issue_i)
+        sync += src_l.r.eq(reset_r)
+
+        # addr latch
+        sync += adr_l.s.eq(self.go_rd_i)
+        sync += adr_l.r.eq(reset_a)
+
+        # dest operand latch
+        sync += req_l.s.eq(self.go_ad_i)
+        sync += req_l.r.eq(reset_w)
+
+        # store latch
+        sync += sto_l.s.eq(self.go_ad_i)
+        sync += sto_l.r.eq(reset_s)
+
+        # outputs: busy and release signals
+        busy_o = self.busy_o
+        comb += self.busy_o.eq(opc_l.q) # busy out
+        comb += self.rd_rel_o.eq(src_l.q & busy_o) # src1/src2 req rel
+        comb += self.sto_rel_o.eq(sto_l.q & busy_o & self.shadown_i & op_is_st)
+
+        # request release enabled based on if op is a LD/ST or a plain ALU
+        # if op is an ADD/SUB or a LD, req_rel activates.
+        wr_q = Signal(reset_less=True)
+        comb += wr_q.eq(req_l.q & (~op_ldst | op_is_ld))
+
+        alulatch = Signal(reset_less=True)
+        comb += alulatch.eq((op_ldst & self.adr_rel_o) | \
+                            (~op_ldst & self.req_rel_o))
+
+        # only proceed if ALU says its output is valid
+        with m.If(self.alu.n_valid_o):
+
+            # write req release out.  waits until shadow is dropped.
+            comb += self.req_rel_o.eq(wr_q & busy_o & self.shadown_i)
+            # address release only happens on LD/ST, and is shadowed.
+            comb += self.adr_rel_o.eq(adr_l.q & op_ldst & busy_o & \
+                                      self.shadown_i)
+            # when output latch is ready, and ALU says ready, accept ALU output
+            with m.If(self.req_rel_o):
+                m.d.comb += self.alu.n_ready_i.eq(1) # tells ALU "thanks got it"
+
+        # select immediate if opcode says so.  however also change the latch
+        # to trigger *from* the opcode latch instead.
+        comb += src_sel.eq(Mux(op_is_imm, opc_l.qn, src_l.q))
+        comb += src2_or_imm.eq(Mux(op_is_imm, self.imm_i, self.src2_i))
+
+        # create a latch/register for src1/src2 (include immediate select)
+        latchregister(m, self.src1_i, self.alu.a, src_l.q)
+        latchregister(m, src2_or_imm, self.alu.b, src_sel)
+
+        # create a latch/register for the operand
+        oper_r = Signal(self.opwid, reset_less=True) # opcode reg
+        latchregister(m, self.oper_i, oper_r, self.issue_i)
+        alu_op = Cat(op_alu, 0, op_is_imm) # using alu_hier, here.
+        comb += self.alu.op.eq(alu_op)
+
+        # and one for the output from the ALU
+        data_r = Signal(self.rwid, reset_less=True) # Dest register
+        latchregister(m, self.alu.o, data_r, alulatch)
+
+        # decode bits of operand (latched)
+        comb += op_alu.eq(oper_r[0])
+        comb += op_is_imm.eq(oper_r[1])
+        comb += op_is_ld.eq(oper_r[2])
+        comb += op_is_st.eq(oper_r[3])
+        comb += op_ldst.eq(op_is_ld | op_is_st)
+        comb += self.load_mem_o.eq(op_is_ld & self.go_ad_i)
+        comb += self.stwd_mem_o.eq(op_is_st & self.go_st_i)
+        comb += self.ld_o.eq(op_is_ld)
+        comb += self.st_o.eq(op_is_st)
+
+        # on a go_read, tell the ALU we're accepting data.
+        # NOTE: this spells TROUBLE if the ALU isn't ready!
+        # go_read is only valid for one clock!
+        with m.If(self.go_rd_i):                     # src operands ready, GO!
+            with m.If(~self.alu.p_ready_o):          # no ACK yet
+                m.d.comb += self.alu.p_valid_i.eq(1) # so indicate valid
+
+        # put the register directly onto the output bus on a go_write
+        with m.If(self.go_wr_i):
+            comb += self.data_o.eq(data_r)
+
+        # put the register directly onto the address bus
+        with m.If(self.go_ad_i):
+            comb += self.addr_o.eq(data_r)
+
+        return m
+
+    def __iter__(self):
+        yield self.go_rd_i
+        yield self.go_ad_i
+        yield self.go_wr_i
+        yield self.go_st_i
+        yield self.issue_i
+        yield self.isalu_i
+        yield self.shadown_i
+        yield self.go_die_i
+        yield self.oper_i
+        yield self.imm_i
+        yield self.src1_i
+        yield self.src2_i
+        yield self.busy_o
+        yield self.rd_rel_o
+        yield self.adr_rel_o
+        yield self.sto_rel_o
+        yield self.req_rel_o
+        yield self.data_o
+        yield self.load_mem_o
+        yield self.stwd_mem_o
+
+    def ports(self):
+        return list(self)
+
+
+def scoreboard_sim(dut):
+    """ issue, re-issue with src1=1, then pulse go_rd_i and go_wr_i """
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.src1_i.eq(1)
+    yield dut.issue_i.eq(1)
+    yield
+    yield
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1) # go-read: port is named go_rd_i on the unit
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1) # go-write: port is named go_wr_i on the unit
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+
+def test_scoreboard():
+    from alu_hier import ALU
+    alu = ALU(16)
+    mem = alu # fake
+    dut = LDSTCompUnit(16, 4, alu, mem)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_ldst_comp.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, scoreboard_sim(dut), vcd_name='test_ldst_comp.vcd')
+
+if __name__ == '__main__':
+    test_scoreboard()
diff --git a/src/soc/experiment/cscore.py b/src/soc/experiment/cscore.py
new file mode 100644
index 00000000..18b71c80
--- /dev/null
+++ b/src/soc/experiment/cscore.py
@@ -0,0 +1,435 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
+
+from regfile.regfile import RegFileArray, treereduce
+from scoreboard.fn_unit import IntFnUnit, FPFnUnit, LDFnUnit, STFnUnit
+from scoreboard.fu_fu_matrix import FUFUDepMatrix
+from scoreboard.fu_reg_matrix import FURegDepMatrix
+from scoreboard.global_pending import GlobalPending
+from scoreboard.group_picker import GroupPicker
+from scoreboard.issue_unit import IntFPIssueUnit, RegDecode
+
+from compalu import ComputationUnitNoDelay
+
+from alu_hier import ALU
+from nmutil.latch import SRLatch
+
+from random import randint
+
+
+class Scoreboard(Elaboratable):
+    def __init__(self, rwid, n_regs):
+        """ Inputs:
+
+            * :rwid:   bit width of register file(s) - both FP and INT
+            * :n_regs: depth of register file(s) - number of FP and INT regs
+        """
+        self.rwid = rwid
+        self.n_regs = n_regs
+
+        # Register Files
+        self.intregs = RegFileArray(rwid, n_regs)
+        self.fpregs = RegFileArray(rwid, n_regs)
+
+        # inputs
+        self.int_store_i = Signal(reset_less=True) # instruction is a store
+        self.int_dest_i = Signal(max=n_regs, reset_less=True) # Dest R# in
+        self.int_src1_i = Signal(max=n_regs, reset_less=True) # oper1 R# in
+        self.int_src2_i = Signal(max=n_regs, reset_less=True) # oper2 R# in
+
+        self.issue_o = Signal(reset_less=True) # instruction was accepted
+
+    def elaborate(self, platform):
+        m = Module()
+
+        m.submodules.intregs = self.intregs
+        m.submodules.fpregs = self.fpregs
+
+        # register ports
+        int_dest = self.intregs.write_port("dest")
+        int_src1 = self.intregs.read_port("src1")
+        int_src2 = self.intregs.read_port("src2")
+
+        fp_dest = self.fpregs.write_port("dest")
+        fp_src1 = self.fpregs.read_port("src1")
+        fp_src2 = self.fpregs.read_port("src2")
+
+        # Int ALUs
+        add = ALU(self.rwid)
+        sub = ALU(self.rwid)
+        m.submodules.comp1 = comp1 = ComputationUnitNoDelay(self.rwid, 1, add)
+        m.submodules.comp2 = comp2 = ComputationUnitNoDelay(self.rwid, 1, sub)
+        int_alus = [comp1, comp2]
+
+        m.d.comb += comp1.oper_i.eq(Const(0)) # temporary/experiment: op=add
+        m.d.comb += comp2.oper_i.eq(Const(1)) # temporary/experiment: op=sub
+
+        # Int FUs
+        if_l = []
+        int_src1_pend_v = []
+        int_src2_pend_v = []
+        int_rd_pend_v = []
+        int_wr_pend_v = []
+        for i, a in enumerate(int_alus):
+            # set up Integer Function Unit, add to module (and python list)
+            fu = IntFnUnit(self.n_regs, shadow_wid=0)
+            setattr(m.submodules, "intfu%d" % i, fu)
+            if_l.append(fu)
+            # collate the read/write pending vectors (to go into global pending)
+            int_src1_pend_v.append(fu.src1_pend_o)
+            int_src2_pend_v.append(fu.src2_pend_o)
+            int_rd_pend_v.append(fu.int_rd_pend_o)
+            int_wr_pend_v.append(fu.int_wr_pend_o)
+        int_fus = Array(if_l)
+
+        # Count of number of FUs
+        n_int_fus = len(if_l)
+        n_fp_fus = 0 # for now
+
+        n_fus = n_int_fus + n_fp_fus # plus FP FUs
+
+        # XXX replaced by array of FUs? *FnUnit
+        # # Integer FU-FU Dep Matrix
+        # m.submodules.intfudeps = FUFUDepMatrix(n_int_fus, n_int_fus)
+        # Integer FU-Reg Dep Matrix
+        # intregdeps = FURegDepMatrix(self.n_regs, n_int_fus)
+        # m.submodules.intregdeps = intregdeps
+
+        # Integer Priority Picker 1: Adder + Subtractor
+        intpick1 = GroupPicker(2) # picks between add and sub
+        m.submodules.intpick1 = intpick1
+
+        # Global Pending Vectors (INT and FP)
+        # NOTE: number of vectors is NOT same as number of FUs.
+        g_int_src1_pend_v = GlobalPending(self.n_regs, int_src1_pend_v)
+        g_int_src2_pend_v = GlobalPending(self.n_regs, int_src2_pend_v)
+        g_int_rd_pend_v = GlobalPending(self.n_regs, int_rd_pend_v, True)
+        g_int_wr_pend_v = GlobalPending(self.n_regs, int_wr_pend_v, True)
+        m.submodules.g_int_src1_pend_v = g_int_src1_pend_v
+        m.submodules.g_int_src2_pend_v = g_int_src2_pend_v
+        m.submodules.g_int_rd_pend_v = g_int_rd_pend_v
+        m.submodules.g_int_wr_pend_v = g_int_wr_pend_v
+
+        # INT/FP Issue Unit
+        regdecode = RegDecode(self.n_regs)
+        m.submodules.regdecode = regdecode
+        issueunit = IntFPIssueUnit(self.n_regs, n_int_fus, n_fp_fus)
+        m.submodules.issueunit = issueunit
+
+        # FU-FU Dependency Matrices
+        intfudeps = FUFUDepMatrix(n_int_fus, n_int_fus)
+        m.submodules.intfudeps = intfudeps
+
+        #---------
+        # ok start wiring things together...
+        # "now hear de word of de looord... dem bones dem bones dem dryy bones"
+        # https://www.youtube.com/watch?v=pYb8Wm6-QfA
+        #---------
+
+        #---------
+        # Issue Unit is where it starts.  set up some in/outs for this module
+        #---------
+        m.d.comb += [issueunit.i.store_i.eq(self.int_store_i),
+                     regdecode.dest_i.eq(self.int_dest_i),
+                     regdecode.src1_i.eq(self.int_src1_i),
+                     regdecode.src2_i.eq(self.int_src2_i),
+                     regdecode.enable_i.eq(1),
+                     self.issue_o.eq(issueunit.issue_o),
+                    issueunit.i.dest_i.eq(regdecode.dest_o),
+                    ]
+        self.int_insn_i = issueunit.i.insn_i # enabled by instruction decode
+
+        # connect global rd/wr pending vectors
+        m.d.comb += issueunit.i.g_wr_pend_i.eq(g_int_wr_pend_v.g_pend_o)
+        # TODO: issueunit.f (FP)
+
+        # and int function issue / busy arrays, and dest/src1/src2
+        fn_issue_l = []
+        fn_busy_l = []
+        for i, fu in enumerate(if_l):
+            fn_issue_l.append(fu.issue_i)
+            fn_busy_l.append(fu.busy_o)
+            m.d.sync += fu.issue_i.eq(issueunit.i.fn_issue_o[i])
+            m.d.sync += fu.dest_i.eq(self.int_dest_i)
+            m.d.sync += fu.src1_i.eq(self.int_src1_i)
+            m.d.sync += fu.src2_i.eq(self.int_src2_i)
+            # XXX sync, so as to stop a simulation infinite loop
+            m.d.comb += issueunit.i.busy_i[i].eq(fu.busy_o)
+
+        #---------
+        # connect Function Units
+        #---------
+
+        # Group Picker... done manually for now.  TODO: cat array of pick sigs
+        m.d.comb += if_l[0].go_rd_i.eq(intpick1.go_rd_o[0]) # add rd
+        m.d.comb += if_l[0].go_wr_i.eq(intpick1.go_wr_o[0]) # add wr
+
+        m.d.comb += if_l[1].go_rd_i.eq(intpick1.go_rd_o[1]) # subtract rd
+        m.d.comb += if_l[1].go_wr_i.eq(intpick1.go_wr_o[1]) # subtract wr
+
+        # create read-pending FU-FU vectors
+        intfu_rd_pend_v = Signal(n_int_fus, reset_less = True)
+        intfu_wr_pend_v = Signal(n_int_fus, reset_less = True)
+        for i in range(n_int_fus):
+            #m.d.comb += intfu_rd_pend_v[i].eq(if_l[i].int_rd_pend_o.bool())
+            #m.d.comb += intfu_wr_pend_v[i].eq(if_l[i].int_wr_pend_o.bool())
+            m.d.comb += intfu_rd_pend_v[i].eq(if_l[i].int_readable_o)
+            m.d.comb += intfu_wr_pend_v[i].eq(if_l[i].int_writable_o)
+
+        # Connect INT Fn Unit global wr/rd pending
+        for fu in if_l:
+            m.d.comb += fu.g_int_wr_pend_i.eq(g_int_wr_pend_v.g_pend_o)
+            m.d.comb += fu.g_int_rd_pend_i.eq(g_int_rd_pend_v.g_pend_o)
+
+        # Connect FU-FU Matrix, NOTE: FN Units readable/writable considered
+        # to be unit "read-pending / write-pending"
+        m.d.comb += intfudeps.rd_pend_i.eq(intfu_rd_pend_v)
+        m.d.comb += intfudeps.wr_pend_i.eq(intfu_wr_pend_v)
+        m.d.comb += intfudeps.issue_i.eq(issueunit.i.fn_issue_o)
+        for i in range(n_int_fus):
+            m.d.comb += intfudeps.go_rd_i[i].eq(intpick1.go_rd_o[i])
+            m.d.comb += intfudeps.go_wr_i[i].eq(intpick1.go_wr_o[i])
+
+        # Connect Picker (note connection to FU-FU)
+        #---------
+        readable_o = intfudeps.readable_o
+        writable_o = intfudeps.writable_o
+        m.d.comb += intpick1.rd_rel_i[0].eq(int_alus[0].rd_rel_o)
+        m.d.comb += intpick1.rd_rel_i[1].eq(int_alus[1].rd_rel_o)
+        m.d.comb += intpick1.req_rel_i[0].eq(int_alus[0].req_rel_o)
+        m.d.comb += intpick1.req_rel_i[1].eq(int_alus[1].req_rel_o)
+        m.d.comb += intpick1.readable_i[0].eq(readable_o[0]) # add rd
+        m.d.comb += intpick1.writable_i[0].eq(writable_o[0]) # add wr
+        m.d.comb += intpick1.readable_i[1].eq(readable_o[1]) # sub rd
+        m.d.comb += intpick1.writable_i[1].eq(writable_o[1]) # sub wr
+
+        #---------
+        # Connect Register File(s)
+        #---------
+        #with m.If(if_l[0].go_wr_i | if_l[1].go_wr_i):
+        m.d.sync += int_dest.wen.eq(g_int_wr_pend_v.g_pend_o)
+        #with m.If(intpick1.go_rd_o):
+        #with m.If(if_l[0].go_rd_i | if_l[1].go_rd_i):
+        m.d.sync += int_src1.ren.eq(g_int_src1_pend_v.g_pend_o)
+        m.d.sync += int_src2.ren.eq(g_int_src2_pend_v.g_pend_o)
+
+        # merge (OR) all integer FU / ALU outputs to a single value
+        # bit of a hack: treereduce needs a list with an item named "dest_o"
+        dest_o = treereduce(int_alus)
+        m.d.sync += int_dest.data_i.eq(dest_o)
+
+        # connect ALUs
+        for i, alu in enumerate(int_alus):
+            m.d.comb += alu.go_rd_i.eq(intpick1.go_rd_o[i])
+            m.d.comb += alu.go_wr_i.eq(intpick1.go_wr_o[i])
+            m.d.comb += alu.issue_i.eq(fn_issue_l[i])
+            #m.d.comb += fn_busy_l[i].eq(alu.busy_o)  # XXX ignore, use fnissue
+            m.d.comb += alu.src1_i.eq(int_src1.data_o)
+            m.d.comb += alu.src2_i.eq(int_src2.data_o)
+            m.d.comb += if_l[i].req_rel_i.eq(alu.req_rel_o) # pipe out ready
+
+        return m
+
+
+    def __iter__(self):
+        yield from self.intregs
+        yield from self.fpregs
+        yield self.int_store_i
+        yield self.int_dest_i
+        yield self.int_src1_i
+        yield self.int_src2_i
+        yield self.issue_o
+        #yield from self.int_src1
+        #yield from self.int_dest
+        #yield from self.int_src1
+        #yield from self.int_src2
+        #yield from self.fp_dest
+        #yield from self.fp_src1
+        #yield from self.fp_src2
+
+    def ports(self):
+        return list(self)
+
+IADD = 0
+ISUB = 1
+
+class RegSim: # software model of the integer regfile, used to check the DUT
+    def __init__(self, rwidth, nregs):
+        self.rwidth = rwidth     # register width in bits (results wrap to it)
+        self.regs = [0] * nregs  # expected register contents, all start at 0
+
+    def op(self, op, src1, src2, dest):
+        """ apply IADD/ISUB to regs[src1], regs[src2]; store in regs[dest] """
+        if op not in (IADD, ISUB): # would otherwise leave val unassigned
+            raise ValueError("unsupported opcode %r" % (op,))
+        src1 = self.regs[src1]
+        src2 = self.regs[src2]
+        val = (src1 + src2) if op == IADD else (src1 - src2)
+        self.regs[dest] = val & ((1<<(self.rwidth))-1) # wrap to rwidth bits
+
+    def setval(self, dest, val): # overwrite one expected register value
+        self.regs[dest] = val
+
+    def dump(self, dut): # sim generator: print expected vs DUT value per reg
+        for i, val in enumerate(self.regs):
+            reg = yield dut.intregs.regs[i].reg
+            okstr = "OK" if reg == val else "!ok"
+            print("reg %d expected %x received %x %s" % (i, val, reg, okstr))
+
+    def check(self, dut): # sim generator: fail on first reg/model mismatch
+        for i, val in enumerate(self.regs):
+            reg = yield dut.intregs.regs[i].reg
+            if reg != val:
+                print("reg %d expected %x received %x\n" % (i, val, reg))
+                yield from self.dump(dut)
+                assert False
+
+def int_instr(dut, alusim, op, src1, src2, dest):
+    for i in range(len(dut.int_insn_i)):
+        yield dut.int_insn_i[i].eq(0)
+    yield dut.int_dest_i.eq(dest)
+    yield dut.int_src1_i.eq(src1)
+    yield dut.int_src2_i.eq(src2)
+    yield dut.int_insn_i[op].eq(1)
+    alusim.op(op, src1, src2, dest)
+
+
+def print_reg(dut, rnums):
+    rs = []
+    for rnum in rnums:
+        reg = yield dut.intregs.regs[rnum].reg
+        rs.append("%x" % reg)
+    rnums = map(str, rnums)
+    print ("reg %s: %s" % (','.join(rnums), ','.join(rs)))
+
+
+def scoreboard_sim(dut, alusim):
+    yield dut.int_store_i.eq(0)
+
+    for i in range(1, dut.n_regs):
+        yield dut.intregs.regs[i].reg.eq(i)
+        alusim.setval(i, i)
+
+    if False:
+        yield from int_instr(dut, alusim, IADD, 4, 3, 5)
+        yield from print_reg(dut, [3,4,5])
+        yield
+        yield from int_instr(dut, alusim, IADD, 5, 2, 5)
+        yield from print_reg(dut, [3,4,5])
+        yield
+        yield from int_instr(dut, alusim, ISUB, 5, 1, 3)
+        yield from print_reg(dut, [3,4,5])
+        yield
+        for i in range(len(dut.int_insn_i)):
+            yield dut.int_insn_i[i].eq(0)
+        yield from print_reg(dut, [3,4,5])
+        yield
+        yield from print_reg(dut, [3,4,5])
+        yield
+        yield from print_reg(dut, [3,4,5])
+        yield
+
+        yield from alusim.check(dut)
+
+    for i in range(2):
+        src1 = randint(1, dut.n_regs-1)
+        src2 = randint(1, dut.n_regs-1)
+        while True:
+            dest = randint(1, dut.n_regs-1)
+            break
+            if dest not in [src1, src2]:
+                break
+        op = randint(0, 1)
+        if False:
+            if i % 2 == 0:
+                src1 = 6
+                src2 = 6
+                dest = 1
+            else:
+                src1 = 1
+                src2 = 7
+                dest = 2
+            #src1 = 2
+            #src2 = 3
+            #dest = 2
+
+            op = i
+
+        if True:
+            if i == 0:
+                src1 = 2
+                src2 = 3
+                dest = 3
+            else:
+                src1 = 5
+                src2 = 3
+                dest = 4
+
+            #op = (i+1) % 2
+            op = i
+
+        print ("random %d: %d %d %d %d\n" % (i, op, src1, src2, dest))
+        yield from int_instr(dut, alusim, op, src1, src2, dest)
+        yield from print_reg(dut, [3,4,5])
+        while True:
+            yield
+            issue_o = yield dut.issue_o
+            if issue_o:
+                yield from print_reg(dut, [3,4,5])
+                for i in range(len(dut.int_insn_i)):
+                    yield dut.int_insn_i[i].eq(0)
+                break
+            print ("busy",)
+            yield from print_reg(dut, [3,4,5])
+        yield
+        yield
+        yield
+
+
+    yield
+    yield from print_reg(dut, [3,4,5])
+    yield
+    yield from print_reg(dut, [3,4,5])
+    yield
+    yield from print_reg(dut, [3,4,5])
+    yield
+    yield from print_reg(dut, [3,4,5])
+    yield
+    yield
+    yield
+    yield
+    yield
+    yield
+    yield
+    yield
+    yield
+    yield from alusim.check(dut)
+    yield from alusim.dump(dut)
+
+
+def explore_groups(dut):
+    from nmigen.hdl.ir import Fragment
+    from nmigen.hdl.xfrm import LHSGroupAnalyzer
+
+    fragment = dut.elaborate(platform=None)
+    fr = Fragment.get(fragment, platform=None)
+
+    groups = LHSGroupAnalyzer()(fragment._statements)
+
+    print (groups)
+
+
+def test_scoreboard():
+    dut = Scoreboard(16, 8)
+    alusim = RegSim(16, 8)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_scoreboard.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, scoreboard_sim(dut, alusim),
+                        vcd_name='test_scoreboard.vcd')
+
+
+if __name__ == '__main__':
+    test_scoreboard()
diff --git a/src/soc/experiment/score6600.py b/src/soc/experiment/score6600.py
new file mode 100644
index 00000000..209bc99c
--- /dev/null
+++ b/src/soc/experiment/score6600.py
@@ -0,0 +1,1296 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen.hdl.ast import unsigned
+from nmigen import Module, Const, Signal, Array, Cat, Elaboratable, Memory
+
+from regfile.regfile import RegFileArray, treereduce
+from scoreboard.fu_fu_matrix import FUFUDepMatrix
+from scoreboard.fu_reg_matrix import FURegDepMatrix
+from scoreboard.global_pending import GlobalPending
+from scoreboard.group_picker import GroupPicker
+from scoreboard.issue_unit import IssueUnitGroup, IssueUnitArray, RegDecode
+from scoreboard.shadow import ShadowMatrix, BranchSpeculationRecord
+from scoreboard.instruction_q import Instruction, InstructionQ
+from scoreboard.memfu import MemFunctionUnits
+
+from compalu import ComputationUnitNoDelay
+from compldst import LDSTCompUnit
+
+from alu_hier import ALU, BranchALU
+from nmutil.latch import SRLatch
+from nmutil.nmoperator import eq
+
+from random import randint, seed
+from copy import deepcopy
+from math import log
+
+
+class TestMemory(Elaboratable):
+    def __init__(self, regwid, addrw):
+        self.ddepth = 1 # regwid //8
+        depth = (1<<addrw) // self.ddepth
+        self.mem   = Memory(width=regwid, depth=depth, init=range(0, depth))
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.rdport = self.rdport = self.mem.read_port()
+        m.submodules.wrport = self.wrport = self.mem.write_port()
+        return m
+
+
+class MemSim:
+    def __init__(self, regwid, addrw):
+        self.regwid = regwid
+        self.ddepth = 1 # regwid//8
+        depth = (1<<addrw) // self.ddepth # ddepth is a divisor here...
+        self.mem = list(range(0, depth)) # cell N initially holds N
+
+    def ld(self, addr):
+        return self.mem[addr>>self.ddepth] # ...but a shift here: TODO confirm
+
+    def st(self, addr, data):
+        self.mem[addr>>self.ddepth] = data & ((1<<self.regwid)-1)
+
+
+class CompUnitsBase(Elaboratable):
+    """ Computation Unit Base class.
+
+        Amazingly, this class works recursively.  It's supposed to just
+        look after some ALUs (that can handle the same operations),
+        grouping them together, however it turns out that the same code
+        can also group *groups* of Computation Units together as well.
+
+        Basically it was intended just to concatenate the ALU's issue,
+        go_rd etc. signals together, which start out as bits and become
+        sequences.  Turns out that the same trick works just as well
+        on Computation Units!
+
+        So this class may be used recursively to present a top-level
+        sequential concatenation of all the signals in and out of
+        ALUs, whilst at the same time making it convenient to group
+        ALUs together.
+
+        At the lower level, the intent is that groups of (identical)
+        ALUs may be passed the same operation.  Even beyond that,
+        the intent is that that group of (identical) ALUs actually
+        share the *same pipeline* and as such become a "Concurrent
+        Computation Unit" as defined by Mitch Alsup (see section
+        11.4.9.3)
+    """
+    def __init__(self, rwid, units, ldstmode=False):
+        """ Inputs:
+
+            * :rwid:   bit width of register file(s) - both FP and INT
+            * :units: sequence of ALUs (or CompUnitsBase derivatives)
+            * :ldstmode: True to also create and wire the LD/ST signals
+        """
+        self.units = units
+        self.ldstmode = ldstmode
+        self.rwid = rwid
+        if units and isinstance(units[0], CompUnitsBase):
+            self.n_units = 0
+            for u in self.units:
+                self.n_units += u.n_units
+        else:
+            self.n_units = len(units)
+
+        n_units = self.n_units
+
+        # inputs
+        self.issue_i = Signal(n_units, reset_less=True)
+        self.go_rd_i = Signal(n_units, reset_less=True)
+        self.go_wr_i = Signal(n_units, reset_less=True)
+        self.shadown_i = Signal(n_units, reset_less=True)
+        self.go_die_i = Signal(n_units, reset_less=True)
+        if ldstmode:
+            self.go_ad_i = Signal(n_units, reset_less=True)
+            self.go_st_i = Signal(n_units, reset_less=True)
+
+        # outputs
+        self.busy_o = Signal(n_units, reset_less=True)
+        self.rd_rel_o = Signal(n_units, reset_less=True)
+        self.req_rel_o = Signal(n_units, reset_less=True)
+        if ldstmode:
+            # outputs that only exist in LD/ST mode
+            self.ld_o = Signal(n_units, reset_less=True) # op is LD
+            self.st_o = Signal(n_units, reset_less=True) # op is ST
+            self.adr_rel_o = Signal(n_units, reset_less=True)
+            self.sto_rel_o = Signal(n_units, reset_less=True)
+            self.load_mem_o = Signal(n_units, reset_less=True)
+            self.stwd_mem_o = Signal(n_units, reset_less=True)
+            self.addr_o = Signal(rwid, reset_less=True)
+
+        # in/out register data (note: not register#, actual data)
+        self.data_o = Signal(rwid, reset_less=True)
+        self.src1_i = Signal(rwid, reset_less=True)
+        self.src2_i = Signal(rwid, reset_less=True)
+        # input operand
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        for i, alu in enumerate(self.units):
+            setattr(m.submodules, "comp%d" % i, alu)
+
+        go_rd_l = []
+        go_wr_l = []
+        issue_l = []
+        busy_l = []
+        req_rel_l = []
+        rd_rel_l = []
+        shadow_l = []
+        godie_l = []
+        for alu in self.units:
+            req_rel_l.append(alu.req_rel_o)
+            rd_rel_l.append(alu.rd_rel_o)
+            shadow_l.append(alu.shadown_i)
+            godie_l.append(alu.go_die_i)
+            go_wr_l.append(alu.go_wr_i)
+            go_rd_l.append(alu.go_rd_i)
+            issue_l.append(alu.issue_i)
+            busy_l.append(alu.busy_o)
+        comb += self.rd_rel_o.eq(Cat(*rd_rel_l))
+        comb += self.req_rel_o.eq(Cat(*req_rel_l))
+        comb += self.busy_o.eq(Cat(*busy_l))
+        comb += Cat(*godie_l).eq(self.go_die_i)
+        comb += Cat(*shadow_l).eq(self.shadown_i)
+        comb += Cat(*go_wr_l).eq(self.go_wr_i)
+        comb += Cat(*go_rd_l).eq(self.go_rd_i)
+        comb += Cat(*issue_l).eq(self.issue_i)
+
+        # connect data register input/output
+
+        # merge (OR) all integer FU / ALU outputs to a single value
+        if self.units:
+            data_o = treereduce(self.units, "data_o")
+            comb += self.data_o.eq(data_o)
+            if self.ldstmode:
+                addr_o = treereduce(self.units, "addr_o")
+                comb += self.addr_o.eq(addr_o)
+
+        for alu in self.units:
+            comb += alu.src1_i.eq(self.src1_i)
+            comb += alu.src2_i.eq(self.src2_i)
+
+        if not self.ldstmode:
+            return m
+
+        ldmem_l = []
+        stmem_l = []
+        go_ad_l = []
+        go_st_l = []
+        ld_l = []
+        st_l = []
+        adr_rel_l = []
+        sto_rel_l = []
+        for alu in self.units:
+            ld_l.append(alu.ld_o)
+            st_l.append(alu.st_o)
+            adr_rel_l.append(alu.adr_rel_o)
+            sto_rel_l.append(alu.sto_rel_o)
+            ldmem_l.append(alu.load_mem_o)
+            stmem_l.append(alu.stwd_mem_o)
+            go_ad_l.append(alu.go_ad_i)
+            go_st_l.append(alu.go_st_i)
+        comb += self.ld_o.eq(Cat(*ld_l))
+        comb += self.st_o.eq(Cat(*st_l))
+        comb += self.adr_rel_o.eq(Cat(*adr_rel_l))
+        comb += self.sto_rel_o.eq(Cat(*sto_rel_l))
+        comb += self.load_mem_o.eq(Cat(*ldmem_l))
+        comb += self.stwd_mem_o.eq(Cat(*stmem_l))
+        comb += Cat(*go_ad_l).eq(self.go_ad_i)
+        comb += Cat(*go_st_l).eq(self.go_st_i)
+
+        return m
+
+
+class CompUnitLDSTs(CompUnitsBase):
+
+    def __init__(self, rwid, opwid, n_ldsts, mem):
+        """ Inputs:
+
+            * :rwid:   bit width of register file(s) - both FP and INT
+            * :opwid:  operand bit width
+        """
+        self.opwid = opwid
+
+        # inputs
+        self.oper_i = Signal(opwid, reset_less=True)
+        self.imm_i = Signal(rwid, reset_less=True)
+
+        # Int ALUs
+        self.alus = []
+        for i in range(n_ldsts):
+            self.alus.append(ALU(rwid))
+
+        units = []
+        for alu in self.alus:
+            aluopwid = 4 # see compldst.py for "internal" opcode
+            units.append(LDSTCompUnit(rwid, aluopwid, alu, mem))
+
+        CompUnitsBase.__init__(self, rwid, units, ldstmode=True)
+
+    def elaborate(self, platform):
+        m = CompUnitsBase.elaborate(self, platform)
+        comb = m.d.comb
+
+        # hand the same operation to all units, 4 lower bits though
+        for alu in self.units:
+            comb += alu.oper_i[0:4].eq(self.oper_i)
+            comb += alu.imm_i.eq(self.imm_i)
+            comb += alu.isalu_i.eq(0)
+
+        return m
+
+
+class CompUnitALUs(CompUnitsBase):
+
+    def __init__(self, rwid, opwid, n_alus):
+        """ Inputs:
+
+            * :rwid:   bit width of register file(s) - both FP and INT
+            * :opwid:  operand bit width
+        """
+        self.opwid = opwid
+
+        # inputs
+        self.oper_i = Signal(opwid, reset_less=True)
+        self.imm_i = Signal(rwid, reset_less=True)
+
+        # Int ALUs
+        alus = []
+        for i in range(n_alus):
+            alus.append(ALU(rwid))
+
+        units = []
+        for alu in alus:
+            aluopwid = 3 # extra bit for immediate mode
+            units.append(ComputationUnitNoDelay(rwid, aluopwid, alu))
+
+        CompUnitsBase.__init__(self, rwid, units)
+
+    def elaborate(self, platform):
+        m = CompUnitsBase.elaborate(self, platform)
+        comb = m.d.comb
+
+        # hand the same operation to all units, only lower 3 bits though
+        for alu in self.units:
+            comb += alu.oper_i[0:3].eq(self.oper_i)
+            comb += alu.imm_i.eq(self.imm_i)
+
+        return m
+
+
+class CompUnitBR(CompUnitsBase):
+
+    def __init__(self, rwid, opwid):
+        """ Inputs:
+
+            * :rwid:   bit width of register file(s) - both FP and INT
+            * :opwid:  operand bit width
+
+            Note: bgt unit is returned so that a shadow unit can be created
+            for it
+        """
+        self.opwid = opwid
+
+        # inputs
+        self.oper_i = Signal(opwid, reset_less=True)
+        self.imm_i = Signal(rwid, reset_less=True)
+
+        # Branch ALU and CU
+        self.bgt = BranchALU(rwid)
+        aluopwid = 3 # extra bit for immediate mode
+        self.br1 = ComputationUnitNoDelay(rwid, aluopwid, self.bgt)
+        CompUnitsBase.__init__(self, rwid, [self.br1])
+
+    def elaborate(self, platform):
+        m = CompUnitsBase.elaborate(self, platform)
+        comb = m.d.comb
+
+        # hand the same operation to all units
+        for alu in self.units:
+            comb += alu.oper_i.eq(self.oper_i)
+            comb += alu.imm_i.eq(self.imm_i)
+
+        return m
+
+
+class FunctionUnits(Elaboratable):
+
+    def __init__(self, n_regs, n_int_alus):
+        self.n_regs = n_regs
+        self.n_int_alus = n_int_alus
+
+        self.dest_i = Signal(n_regs, reset_less=True) # Dest R# in
+        self.src1_i = Signal(n_regs, reset_less=True) # oper1 R# in
+        self.src2_i = Signal(n_regs, reset_less=True) # oper2 R# in
+
+        self.g_int_rd_pend_o = Signal(n_regs, reset_less=True)
+        self.g_int_wr_pend_o = Signal(n_regs, reset_less=True)
+
+        self.dest_rsel_o = Signal(n_regs, reset_less=True) # dest reg (bot)
+        self.src1_rsel_o = Signal(n_regs, reset_less=True) # src1 reg (bot)
+        self.src2_rsel_o = Signal(n_regs, reset_less=True) # src2 reg (bot)
+
+        self.readable_o = Signal(n_int_alus, reset_less=True)
+        self.writable_o = Signal(n_int_alus, reset_less=True)
+
+        self.go_rd_i = Signal(n_int_alus, reset_less=True)
+        self.go_wr_i = Signal(n_int_alus, reset_less=True)
+        self.go_die_i = Signal(n_int_alus, reset_less=True)
+        self.fn_issue_i = Signal(n_int_alus, reset_less=True)
+
+        # Note: FURegs wr_pend_o is also outputted from here, for use in WaWGrid
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
+
+        n_intfus = self.n_int_alus
+
+        # Integer FU-FU Dep Matrix
+        intfudeps = FUFUDepMatrix(n_intfus, n_intfus)
+        m.submodules.intfudeps = intfudeps
+        # Integer FU-Reg Dep Matrix
+        intregdeps = FURegDepMatrix(n_intfus, self.n_regs, 2)
+        m.submodules.intregdeps = intregdeps
+
+        comb += self.g_int_rd_pend_o.eq(intregdeps.v_rd_rsel_o)
+        comb += self.g_int_wr_pend_o.eq(intregdeps.v_wr_rsel_o)
+
+        comb += intregdeps.rd_pend_i.eq(intregdeps.v_rd_rsel_o)
+        comb += intregdeps.wr_pend_i.eq(intregdeps.v_wr_rsel_o)
+
+        comb += intfudeps.rd_pend_i.eq(intregdeps.rd_pend_o)
+        comb += intfudeps.wr_pend_i.eq(intregdeps.wr_pend_o)
+        self.wr_pend_o = intregdeps.wr_pend_o # also output for use in WaWGrid
+
+        comb += intfudeps.issue_i.eq(self.fn_issue_i)
+        comb += intfudeps.go_rd_i.eq(self.go_rd_i)
+        comb += intfudeps.go_wr_i.eq(self.go_wr_i)
+        comb += intfudeps.go_die_i.eq(self.go_die_i)
+        comb += self.readable_o.eq(intfudeps.readable_o)
+        comb += self.writable_o.eq(intfudeps.writable_o)
+
+        # Connect function issue / arrays, and dest/src1/src2
+        comb += intregdeps.dest_i.eq(self.dest_i)
+        comb += intregdeps.src_i[0].eq(self.src1_i)
+        comb += intregdeps.src_i[1].eq(self.src2_i)
+
+        comb += intregdeps.go_rd_i.eq(self.go_rd_i)
+        comb += intregdeps.go_wr_i.eq(self.go_wr_i)
+        comb += intregdeps.go_die_i.eq(self.go_die_i)
+        comb += intregdeps.issue_i.eq(self.fn_issue_i)
+
+        comb += self.dest_rsel_o.eq(intregdeps.dest_rsel_o)
+        comb += self.src1_rsel_o.eq(intregdeps.src_rsel_o[0])
+        comb += self.src2_rsel_o.eq(intregdeps.src_rsel_o[1])
+
+        return m
+
+
+class Scoreboard(Elaboratable):
+    def __init__(self, rwid, n_regs):
+        """ Inputs:
+
+            * :rwid:   bit width of register file(s) - both FP and INT
+            * :n_regs: depth of register file(s) - number of FP and INT regs
+        """
+        self.rwid = rwid
+        self.n_regs = n_regs
+
+        # Register Files
+        self.intregs = RegFileArray(rwid, n_regs)
+        self.fpregs = RegFileArray(rwid, n_regs)
+
+        # Memory (test for now)
+        self.mem = TestMemory(self.rwid, 8) # not too big, takes too long
+
+        # issue q needs to get at these
+        self.aluissue = IssueUnitGroup(2)
+        self.lsissue = IssueUnitGroup(2)
+        self.brissue = IssueUnitGroup(1)
+        # and these
+        self.alu_oper_i = Signal(4, reset_less=True)
+        self.alu_imm_i = Signal(rwid, reset_less=True)
+        self.br_oper_i = Signal(4, reset_less=True)
+        self.br_imm_i = Signal(rwid, reset_less=True)
+        self.ls_oper_i = Signal(4, reset_less=True)
+        self.ls_imm_i = Signal(rwid, reset_less=True)
+
+        # inputs
+        self.int_dest_i = Signal(range(n_regs), reset_less=True) # Dest R# in
+        self.int_src1_i = Signal(range(n_regs), reset_less=True) # oper1 R# in
+        self.int_src2_i = Signal(range(n_regs), reset_less=True) # oper2 R# in
+        self.reg_enable_i = Signal(reset_less=True) # enable reg decode
+
+        # outputs
+        self.issue_o = Signal(reset_less=True) # instruction was accepted
+        self.busy_o = Signal(reset_less=True) # at least one CU is busy
+
+        # for branch speculation experiment.  branch_direction = 0 if
+        # the branch hasn't been met yet.  1 indicates "success", 2 is "fail"
+        # branch_succ and branch_fail are requests to have the current
+        # instruction be dependent on the branch unit "shadow" capability.
+        self.branch_succ_i = Signal(reset_less=True)
+        self.branch_fail_i = Signal(reset_less=True)
+        self.branch_direction_o = Signal(2, reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
+
+        m.submodules.intregs = self.intregs
+        m.submodules.fpregs = self.fpregs
+        m.submodules.mem = mem = self.mem
+
+        # register ports
+        int_dest = self.intregs.write_port("dest")
+        int_src1 = self.intregs.read_port("src1")
+        int_src2 = self.intregs.read_port("src2")
+
+        fp_dest = self.fpregs.write_port("dest")
+        fp_src1 = self.fpregs.read_port("src1")
+        fp_src2 = self.fpregs.read_port("src2")
+
+        # Int ALUs and BR ALUs
+        n_int_alus = 5
+        cua = CompUnitALUs(self.rwid, 3, n_alus=self.aluissue.n_insns)
+        cub = CompUnitBR(self.rwid, 3) # 1 BR ALUs
+
+        # LDST Comp Units
+        n_ldsts = 2
+        cul = CompUnitLDSTs(self.rwid, 4, self.lsissue.n_insns, None)
+
+        # Comp Units
+        m.submodules.cu = cu = CompUnitsBase(self.rwid, [cua, cul, cub])
+        bgt = cub.bgt # get at the branch computation unit
+        br1 = cub.br1
+
+        # Int FUs
+        m.submodules.intfus = intfus = FunctionUnits(self.n_regs, n_int_alus)
+
+        # Memory FUs
+        m.submodules.memfus = memfus = MemFunctionUnits(n_ldsts, 5)
+
+        # Memory Priority Picker 1: one gateway per memory port
+        mempick1 = GroupPicker(n_ldsts) # picks 1 reader and 1 writer to intreg
+        m.submodules.mempick1 = mempick1
+
+        # Count of number of FUs
+        n_intfus = n_int_alus
+        n_fp_fus = 0 # for now
+
+        # Integer Priority Picker 1: Adder + Subtractor (and LD/ST)
+        intpick1 = GroupPicker(n_intfus) # picks 1 reader and 1 writer to intreg
+        m.submodules.intpick1 = intpick1
+
+        # INT/FP Issue Unit
+        regdecode = RegDecode(self.n_regs)
+        m.submodules.regdecode = regdecode
+        issueunit = IssueUnitArray([self.aluissue, self.lsissue, self.brissue])
+        m.submodules.issueunit = issueunit
+
+        # Shadow Matrix.  currently n_intfus shadows, to be used for
+        # write-after-write hazards.  NOTE: branches do not widen this
+        # matrix: they use the separate "bshadow" ShadowMatrix below
+        m.submodules.shadows = shadows = ShadowMatrix(n_intfus, n_intfus, True)
+        m.submodules.bshadow = bshadow = ShadowMatrix(n_intfus, 1, False)
+
+        # record previous instruction to cast shadow on current instruction
+        prev_shadow = Signal(n_intfus)
+
+        # Branch Speculation recorder.  tracks the success/fail state as
+        # each instruction is issued, so that when the branch occurs the
+        # allow/cancel can be issued as appropriate.
+        m.submodules.specrec = bspec = BranchSpeculationRecord(n_intfus)
+
+        #---------
+        # ok start wiring things together...
+        # "now hear de word of de looord... dem bones dem bones dem dryy bones"
+        # https://www.youtube.com/watch?v=pYb8Wm6-QfA
+        #---------
+
+        #---------
+        # Issue Unit is where it starts.  set up some in/outs for this module
+        #---------
+        comb += [    regdecode.dest_i.eq(self.int_dest_i),
+                     regdecode.src1_i.eq(self.int_src1_i),
+                     regdecode.src2_i.eq(self.int_src2_i),
+                     regdecode.enable_i.eq(self.reg_enable_i),
+                     self.issue_o.eq(issueunit.issue_o)
+                    ]
+
+        # take these to outside (issue needs them)
+        comb += cua.oper_i.eq(self.alu_oper_i)
+        comb += cua.imm_i.eq(self.alu_imm_i)
+        comb += cub.oper_i.eq(self.br_oper_i)
+        comb += cub.imm_i.eq(self.br_imm_i)
+        comb += cul.oper_i.eq(self.ls_oper_i)
+        comb += cul.imm_i.eq(self.ls_imm_i)
+
+        # TODO: issueunit.f (FP)
+
+        # and int function issue / busy arrays, and dest/src1/src2
+        comb += intfus.dest_i.eq(regdecode.dest_o)
+        comb += intfus.src1_i.eq(regdecode.src1_o)
+        comb += intfus.src2_i.eq(regdecode.src2_o)
+
+        fn_issue_o = issueunit.fn_issue_o
+
+        comb += intfus.fn_issue_i.eq(fn_issue_o)
+        comb += issueunit.busy_i.eq(cu.busy_o)
+        comb += self.busy_o.eq(cu.busy_o.bool())
+
+        #---------
+        # Memory Function Unit
+        #---------
+        reset_b = Signal(cul.n_units, reset_less=True)
+        sync += reset_b.eq(cul.go_st_i | cul.go_wr_i | cul.go_die_i)
+
+        comb += memfus.fn_issue_i.eq(cul.issue_i) # Comp Unit Issue -> Mem FUs
+        comb += memfus.addr_en_i.eq(cul.adr_rel_o) # Match enable on adr rel
+        comb += memfus.addr_rs_i.eq(reset_b) # reset same as LDSTCompUnit
+
+        # LD/STs have to accumulate prior LD/STs (TODO: multi-issue as well,
+        # in a transitive fashion).  This cycle activates based on LDSTCompUnit
+        # issue_i.  multi-issue gets a bit more complex but not a lot.
+        prior_ldsts = Signal(cul.n_units, reset_less=True)
+        sync += prior_ldsts.eq(memfus.g_int_ld_pend_o | memfus.g_int_st_pend_o)
+        with m.If(self.ls_oper_i[2]): # LD bit of operand
+            comb += memfus.ld_i.eq(cul.issue_i | prior_ldsts)
+        with m.If(self.ls_oper_i[3]): # ST bit of operand
+            comb += memfus.st_i.eq(cul.issue_i | prior_ldsts)
+
+        # TODO: adr_rel_o needs to go into L1 Cache.  for now,
+        # just immediately activate go_adr
+        comb += cul.go_ad_i.eq(cul.adr_rel_o)
+
+        # connect up address data
+        comb += memfus.addrs_i[0].eq(cul.units[0].addr_o)
+        comb += memfus.addrs_i[1].eq(cul.units[1].addr_o)
+
+        # connect loadable / storable to go_ld/go_st.
+        # XXX should only be done when the memory ld/st has actually happened!
+        go_st_i = Signal(cul.n_units, reset_less=True)
+        go_ld_i = Signal(cul.n_units, reset_less=True)
+        comb += go_ld_i.eq(memfus.loadable_o & memfus.addr_nomatch_o &\
+                                  cul.req_rel_o & cul.ld_o)
+        comb += go_st_i.eq(memfus.storable_o & memfus.addr_nomatch_o &\
+                                  cul.sto_rel_o & cul.st_o)
+        comb += memfus.go_ld_i.eq(go_ld_i)
+        comb += memfus.go_st_i.eq(go_st_i)
+        #comb += cul.go_wr_i.eq(go_ld_i)
+        comb += cul.go_st_i.eq(go_st_i)
+
+        #comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
+        #comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus])
+        #comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
+
+        #---------
+        # merge shadow matrices outputs
+        #---------
+
+        # these are explained in ShadowMatrix docstring, and are to be
+        # connected to the FUReg and FUFU Matrices, to get them to reset
+        anydie = Signal(n_intfus, reset_less=True)
+        allshadown = Signal(n_intfus, reset_less=True)
+        shreset = Signal(n_intfus, reset_less=True)
+        comb += allshadown.eq(shadows.shadown_o & bshadow.shadown_o)
+        comb += anydie.eq(shadows.go_die_o | bshadow.go_die_o)
+        comb += shreset.eq(bspec.match_g_o | bspec.match_f_o)
+
+        #---------
+        # connect fu-fu matrix
+        #---------
+
+        # Group Picker... done manually for now.
+        go_rd_o = intpick1.go_rd_o
+        go_wr_o = intpick1.go_wr_o
+        go_rd_i = intfus.go_rd_i
+        go_wr_i = intfus.go_wr_i
+        go_die_i = intfus.go_die_i
+        # NOTE: connect to the shadowed versions so that they can "die" (reset)
+        comb += go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus]) # rd
+        comb += go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus]) # wr
+        comb += go_die_i[0:n_intfus].eq(anydie[0:n_intfus]) # die
+
+        # Connect Picker
+        #---------
+        comb += intpick1.rd_rel_i[0:n_intfus].eq(cu.rd_rel_o[0:n_intfus])
+        comb += intpick1.req_rel_i[0:n_intfus].eq(cu.req_rel_o[0:n_intfus])
+        int_rd_o = intfus.readable_o
+        int_wr_o = intfus.writable_o
+        comb += intpick1.readable_i[0:n_intfus].eq(int_rd_o[0:n_intfus])
+        comb += intpick1.writable_i[0:n_intfus].eq(int_wr_o[0:n_intfus])
+
+        #---------
+        # Shadow Matrix
+        #---------
+
+        comb += shadows.issue_i.eq(fn_issue_o)
+        #comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
+        comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
+        #---------
+        # NOTE; this setup is for the instruction order preservation...
+
+        # connect shadows / go_dies to Computation Units
+        comb += cu.shadown_i[0:n_intfus].eq(allshadown)
+        comb += cu.go_die_i[0:n_intfus].eq(anydie)
+
+        # ok connect first n_int_fu shadows to busy lines, to create an
+        # instruction-order linked-list-like arrangement, using a bit-matrix
+        # (instead of e.g. a ring buffer).
+
+        # when written, the shadow can be cancelled (and was good)
+        for i in range(n_intfus):
+            comb += shadows.s_good_i[i][0:n_intfus].eq(go_wr_o[0:n_intfus])
+
+        # *previous* instruction shadows *current* instruction, and, obviously,
+        # if the previous is completed (!busy) don't cast the shadow!
+        comb += prev_shadow.eq(~fn_issue_o & cu.busy_o)
+        for i in range(n_intfus):
+            comb += shadows.shadow_i[i][0:n_intfus].eq(prev_shadow)
+
+        #---------
+        # ... and this is for branch speculation.  it uses bit 0 of the
+        # separate "bshadow" ShadowMatrix (one entry per FU), and
+        # only needs to set shadow_i, s_fail_i and s_good_i
+
+        # issue captures shadow_i (if enabled)
+        comb += bshadow.reset_i[0:n_intfus].eq(shreset[0:n_intfus])
+
+        bactive = Signal(reset_less=True)
+        comb += bactive.eq((bspec.active_i | br1.issue_i) & ~br1.go_wr_i)
+
+        # instruction being issued (fn_issue_o) has a shadow cast by the branch
+        with m.If(bactive & (self.branch_succ_i | self.branch_fail_i)):
+            comb += bshadow.issue_i.eq(fn_issue_o)
+            for i in range(n_intfus):
+                with m.If(fn_issue_o & (Const(1<<i))):
+                    comb += bshadow.shadow_i[i][0].eq(1)
+
+        # finally, we need an indicator to the test infrastructure as to
+        # whether the branch succeeded or failed, plus, link up to the
+        # "recorder" of whether the instruction was under shadow or not
+
+        with m.If(br1.issue_i):
+            sync += bspec.active_i.eq(1)
+        with m.If(self.branch_succ_i):
+            comb += bspec.good_i.eq(fn_issue_o & 0x1f) # XXX MAGIC CONSTANT
+        with m.If(self.branch_fail_i):
+            comb += bspec.fail_i.eq(fn_issue_o & 0x1f) # XXX MAGIC CONSTANT
+
+        # branch is active (TODO: a better signal: this is over-using the
+        # go_write signal - actually the branch should not be "writing")
+        with m.If(br1.go_wr_i):
+            sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+            sync += bspec.active_i.eq(0)
+            comb += bspec.br_i.eq(1)
+            # branch occurs if data == 1, failed if data == 0
+            comb += bspec.br_ok_i.eq(br1.data_o == 1)
+            for i in range(n_intfus):
+                # *expected* direction of the branch matched against *actual*
+                comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
+                # ... or it didn't
+                comb += bshadow.s_fail_i[i][0].eq(bspec.match_f_o[i])
+
+        #---------
+        # Connect Register File(s)
+        #---------
+        comb += int_dest.wen.eq(intfus.dest_rsel_o)
+        comb += int_src1.ren.eq(intfus.src1_rsel_o)
+        comb += int_src2.ren.eq(intfus.src2_rsel_o)
+
+        # connect ALUs to regfile
+        comb += int_dest.data_i.eq(cu.data_o)
+        comb += cu.src1_i.eq(int_src1.data_o)
+        comb += cu.src2_i.eq(int_src2.data_o)
+
+        # connect ALU Computation Units
+        comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
+        comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus])
+        comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
+
+        return m
+
+    def __iter__(self):
+        yield from self.intregs
+        yield from self.fpregs
+        yield self.int_dest_i
+        yield self.int_src1_i
+        yield self.int_src2_i
+        yield self.issue_o
+        yield self.branch_succ_i
+        yield self.branch_fail_i
+        yield self.branch_direction_o
+
+    def ports(self):
+        return list(self)
+
+
+class IssueToScoreboard(Elaboratable):
+    """ Issues instructions from an InstructionQ to a Scoreboard """
+    def __init__(self, qlen, n_in, n_out, rwid, opwid, n_regs):
+        self.qlen = qlen
+        self.n_in = n_in
+        self.n_out = n_out
+        self.rwid = rwid
+        self.opw = opwid
+        self.n_regs = n_regs
+
+        mqbits = unsigned(qlen.bit_length()+1) # == int(log2(qlen))+2
+        self.p_add_i = Signal(mqbits) # instructions to add (from data_i)
+        self.p_ready_o = Signal() # instructions were added
+        self.data_i = Instruction.nq(n_in, "data_i", rwid, opwid)
+
+        self.busy_o = Signal(reset_less=True) # at least one CU is busy
+        self.qlen_o = Signal(mqbits, reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
+
+        iq = InstructionQ(self.rwid, self.opw, self.qlen, self.n_in, self.n_out)
+        sc = Scoreboard(self.rwid, self.n_regs)
+        m.submodules.iq = iq
+        m.submodules.sc = sc
+
+        # get at the regfile for testing
+        self.intregs = sc.intregs
+
+        # and the "busy" signal and instruction queue length
+        comb += self.busy_o.eq(sc.busy_o)
+        comb += self.qlen_o.eq(iq.qlen_o)
+
+        # link up instruction queue
+        comb += iq.p_add_i.eq(self.p_add_i)
+        comb += self.p_ready_o.eq(iq.p_ready_o)
+        for i in range(self.n_in):
+            comb += eq(iq.data_i[i], self.data_i[i])
+
+        # take instruction and process it.  note that it's possible to
+        # "inspect" the queue contents *without* actually removing the
+        # items.  items are only removed once a unit has accepted them.
+
+        # in "waiting" state
+        wait_issue_br = Signal()
+        wait_issue_alu = Signal()
+        wait_issue_ls = Signal()
+
+        with m.If(wait_issue_br | wait_issue_alu | wait_issue_ls):
+            # set instruction pop length to 1 if the unit accepted
+            with m.If(wait_issue_ls & (sc.lsissue.fn_issue_o != 0)):
+                with m.If(iq.qlen_o != 0):
+                    comb += iq.n_sub_i.eq(1)
+            with m.If(wait_issue_br & (sc.brissue.fn_issue_o != 0)):
+                with m.If(iq.qlen_o != 0):
+                    comb += iq.n_sub_i.eq(1)
+            with m.If(wait_issue_alu & (sc.aluissue.fn_issue_o != 0)):
+                with m.If(iq.qlen_o != 0):
+                    comb += iq.n_sub_i.eq(1)
+
+        # see if some instruction(s) are here.  note that this is
+        # "inspecting" the in-place queue.  note also that on the
+        # cycle following "waiting" for fn_issue_o to be set, the
+        # "resetting" done above (insn_i=0) could be re-ASSERTed.
+        with m.If(iq.qlen_o != 0):
+            # get the operands and operation
+            imm = iq.data_o[0].imm_i
+            dest = iq.data_o[0].dest_i
+            src1 = iq.data_o[0].src1_i
+            src2 = iq.data_o[0].src2_i
+            op = iq.data_o[0].oper_i
+            opi = iq.data_o[0].opim_i # immediate set
+
+            # set the src/dest regs
+            comb += sc.int_dest_i.eq(dest)
+            comb += sc.int_src1_i.eq(src1)
+            comb += sc.int_src2_i.eq(src2)
+            comb += sc.reg_enable_i.eq(1) # enable the regfile
+
+            # choose a Function-Unit-Group
+            with m.If((op & (0x3<<2)) != 0): # branch
+                comb += sc.br_oper_i.eq(Cat(op[0:2], opi))
+                comb += sc.br_imm_i.eq(imm)
+                comb += sc.brissue.insn_i.eq(1)
+                comb += wait_issue_br.eq(1)
+            with m.Elif((op & (0x3<<4)) != 0): # ld/st
+                # see compldst.py
+                # bit 0: ADD/SUB
+                # bit 1: immed
+                # bit 4: LD
+                # bit 5: ST
+                comb += sc.ls_oper_i.eq(Cat(op[0], opi[0], op[4:6]))
+                comb += sc.ls_imm_i.eq(imm)
+                comb += sc.lsissue.insn_i.eq(1)
+                comb += wait_issue_ls.eq(1)
+            with m.Else(): # alu
+                comb += sc.alu_oper_i.eq(Cat(op[0:2], opi))
+                comb += sc.alu_imm_i.eq(imm)
+                comb += sc.aluissue.insn_i.eq(1)
+                comb += wait_issue_alu.eq(1)
+
+            # XXX TODO
+            # these indicate that the instruction is to be made
+            # shadow-dependent on
+            # (either) branch success or branch fail
+            #yield sc.branch_fail_i.eq(branch_fail)
+            #yield sc.branch_succ_i.eq(branch_success)
+
+        return m
+
+    def __iter__(self):
+        yield self.p_ready_o
+        for o in self.data_i:
+            yield from list(o)
+        yield self.p_add_i
+
+    def ports(self):
+        return list(self)
+
+
+IADD = 0
+ISUB = 1
+IMUL = 2
+ISHF = 3
+IBGT = 4
+IBLT = 5
+IBEQ = 6
+IBNE = 7
+
+
+class RegSim:
+    """Reference model of the integer registers, compared against the dut."""
+    def __init__(self, rwidth, nregs):
+        self.rwidth = rwidth
+        self.regs = [0] * nregs
+
+    def op(self, op, op_imm, imm, src1, src2, dest):
+        """Run one operation on the model and return the masked result.
+
+        imm replaces the second operand when op_imm is set.  opcodes not
+        listed below (LD/ST, TODO) return 0 and change nothing.
+        """
+        mask = (1 << self.rwidth) - 1
+        lhs = self.regs[src1] & mask
+        rhs = imm if op_imm else self.regs[src2] & mask
+        compute = {
+            IADD: lambda: lhs + rhs,
+            ISUB: lambda: lhs - rhs,
+            IMUL: lambda: lhs * rhs,
+            ISHF: lambda: lhs >> (rhs & mask),
+            IBGT: lambda: int(lhs > rhs),
+            IBLT: lambda: int(lhs < rhs),
+            IBEQ: lambda: int(lhs == rhs),
+            IBNE: lambda: int(lhs != rhs),
+        }.get(op)
+        if compute is None:
+            return 0 # LD/ST TODO
+        result = compute() & mask
+        self.setval(dest, result)
+        return result
+
+    def setval(self, dest, val):
+        """Store val in register dest, printing the write."""
+        print ("sim setval", dest, hex(val))
+        self.regs[dest] = val
+
+    def dump(self, dut):
+        """Print each modelled register beside the value read from the dut."""
+        for i, want in enumerate(self.regs):
+            got = yield dut.intregs.regs[i].reg
+            mark = "!ok" if got != want else "OK"
+            print("reg %d expected %x received %x %s" % (i, want, got, mark))
+
+    def check(self, dut):
+        """Assert the dut registers match the model, dumping on mismatch."""
+        for i, want in enumerate(self.regs):
+            got = yield dut.intregs.regs[i].reg
+            if got != want:
+                print("reg %d expected %x received %x\n" % (i, want, got))
+                yield from self.dump(dut)
+                assert False
+
+def instr_q(dut, op, op_imm, imm, src1, src2, dest,
+            branch_success, branch_fail):
+    """Queue a single instruction on the dut and wait for p_ready_o.
+
+    branch_success and branch_fail are accepted but not used here.
+    """
+    instrs = [dict(oper_i=op, dest_i=dest, imm_i=imm, opim_i=op_imm,
+                   src1_i=src1, src2_i=src2)]
+    sendlen = len(instrs)
+    for idx, insn in enumerate(instrs):
+        yield from eq(dut.data_i[idx], insn)
+        di = yield dut.data_i[idx]
+        print ("senddata %d %x" % (idx, di))
+    yield dut.p_add_i.eq(sendlen)
+    yield
+    while not (yield dut.p_ready_o):
+        yield
+    yield dut.p_add_i.eq(0)
+
+
+def int_instr(dut, op, imm, src1, src2, dest, branch_success, branch_fail):
+    """Drive one instruction onto the branch or ALU issue unit and wait
+    for it to issue.  ops with bit 2 or 3 set go to the branch unit.
+    """
+    yield from disable_issue(dut)
+    yield dut.int_dest_i.eq(dest)
+    yield dut.int_src1_i.eq(src1)
+    yield dut.int_src2_i.eq(src2)
+    if (op & (0x3<<2)) != 0: # branch
+        dut_issue, oper_i, imm_i = dut.brissue, dut.br_oper_i, dut.br_imm_i
+    else:
+        dut_issue, oper_i, imm_i = dut.aluissue, dut.alu_oper_i, dut.alu_imm_i
+    yield dut_issue.insn_i.eq(1)
+    yield oper_i.eq(Const(op & 0x3, 2))
+    yield imm_i.eq(imm)
+    yield dut.reg_enable_i.eq(1)
+
+    # these indicate that the instruction is to be made shadow-dependent on
+    # (either) branch success or branch fail
+    yield dut.branch_fail_i.eq(branch_fail)
+    yield dut.branch_succ_i.eq(branch_success)
+
+    yield
+    yield from wait_for_issue(dut, dut_issue)
+
+
+def print_reg(dut, rnums):
+    """Print the current value (in hex) of each dut register in rnums."""
+    values = []
+    for num in rnums:
+        values.append("%x" % (yield dut.intregs.regs[num].reg))
+    labels = ','.join(map(str, rnums))
+    print ("reg %s: %s" % (labels, ','.join(values)))
+
+
+def create_random_ops(dut, n_ops, shadowing=False, max_opnums=3):
+    insts = []
+    for i in range(n_ops):
+        src1 = randint(1, dut.n_regs-1)
+        src2 = randint(1, dut.n_regs-1)
+        imm = randint(1, (1<<dut.rwid)-1)
+        dest = randint(1, dut.n_regs-1)
+        op = randint(0, max_opnums)
+        opi = 0 if randint(0, 2) else 1 # set only when random is zero (1 in 3)
+
+        if shadowing:
+            insts.append((src1, src2, dest, op, opi, imm, (0, 0)))
+        else:
+            insts.append((src1, src2, dest, op, opi, imm))
+    return insts
+
+
+def wait_for_busy_clear(dut):
+    """Step the simulation until dut.busy_o reads as zero, printing
+    "busy" on every cycle spent waiting.
+    """
+    while (yield dut.busy_o):
+        print ("busy",)
+        yield
+
+def disable_issue(dut):
+    """Deassert insn_i on the ALU, branch and LD/ST issue units."""
+    for unit in (dut.aluissue, dut.brissue, dut.lsissue):
+        yield unit.insn_i.eq(0)
+
+
+def wait_for_issue(dut, dut_issue):
+    """Wait for dut_issue.fn_issue_o, then clear insn_i on every issue
+    unit and deassert reg_enable_i.
+    """
+    while not (yield dut_issue.fn_issue_o):
+        print ("busy",)
+        #yield from print_reg(dut, [1,2,3])
+        yield
+    yield from disable_issue(dut)
+    yield dut.reg_enable_i.eq(0)
+    #yield from print_reg(dut, [1,2,3])
+
+def scoreboard_branch_sim(dut, alusim):
+    """Issue a small branch-shadow test program and check the registers."""
+    iseed = 3
+
+    for i in range(1):
+
+        print ("rseed", iseed)
+        seed(iseed)
+        iseed += 1
+
+        yield dut.branch_direction_o.eq(0)
+
+        # set random values in the registers
+        for i in range(1, dut.n_regs):
+            #val = 31+i*3
+            val = randint(0, (1<<alusim.rwidth)-1)
+            yield dut.intregs.regs[i].reg.eq(val)
+            alusim.setval(i, val)
+
+        if False:
+            # create some instructions: branches create a tree
+            insts = create_random_ops(dut, 1, True, 1)
+            #insts.append((6, 6, 1, 2, (0, 0)))
+            #insts.append((4, 3, 3, 0, (0, 0)))
+
+            src1 = randint(1, dut.n_regs-1)
+            src2 = randint(1, dut.n_regs-1)
+            #op = randint(4, 7)
+            op = 4 # only BGT at the moment
+
+            branch_ok = create_random_ops(dut, 1, True, 1)
+            branch_fail = create_random_ops(dut, 1, True, 1)
+
+            insts.append((src1, src2, (branch_ok, branch_fail), op, (0, 0)))
+
+        if True:
+            insts = []
+            insts.append( (3, 5, 2, 0, (0, 0)) )
+            branch_ok = []
+            branch_fail = []
+            #branch_ok.append  ( (5, 7, 5, 1, (1, 0)) )
+            branch_ok.append( None )
+            branch_fail.append( (1, 1, 2, 0, (0, 1)) )
+            #branch_fail.append( None )
+            insts.append( (6, 4, (branch_ok, branch_fail), 4, (0, 0)) )
+
+        siminsts = deepcopy(insts)
+
+        # issue instruction(s)
+        i = -1
+        instrs = insts
+        branch_direction = 0
+        while instrs:
+            yield
+            yield
+            i += 1
+            branch_direction = yield dut.branch_direction_o # way branch went
+            (src1, src2, dest, op, (shadow_on, shadow_off)) = insts.pop(0)
+            if branch_direction == 1 and shadow_on:
+                print ("skip", i, src1, src2, dest, op, shadow_on, shadow_off)
+                continue # branch was "success" and this is a "failed"... skip
+            if branch_direction == 2 and shadow_off:
+                print ("skip", i, src1, src2, dest, op, shadow_on, shadow_off)
+                continue # branch was "fail" and this is a "success"... skip
+            if branch_direction != 0:
+                shadow_on = 0
+                shadow_off = 0
+            is_branch = op >= 4
+            if is_branch:
+                branch_ok, branch_fail = dest
+                dest = src2
+                # ok zip up the branch success / fail instructions and
+                # drop them into the queue, one marked "to have branch success"
+                # the other to be marked shadow branch "fail".
+                # one out of each of these will be cancelled
+                for ok, fl in zip(branch_ok, branch_fail):
+                    if ok:
+                        instrs.append((ok[0], ok[1], ok[2], ok[3], (1, 0)))
+                    if fl:
+                        instrs.append((fl[0], fl[1], fl[2], fl[3], (0, 1)))
+            print ("instr %d: (%d, %d, %d, %d, (%d, %d))" % \
+                            (i, src1, src2, dest, op, shadow_on, shadow_off))
+            yield from int_instr(dut, op, 0, src1, src2, dest, # imm=0
+                                 shadow_on, shadow_off)
+
+        # wait for all instructions to stop before checking
+        yield
+        yield from wait_for_busy_clear(dut)
+
+        i = -1
+        while siminsts:
+            instr = siminsts.pop(0)
+            if instr is None:
+                continue
+            (src1, src2, dest, op, (shadow_on, shadow_off)) = instr
+            i += 1
+            is_branch = op >= 4
+            if is_branch:
+                branch_ok, branch_fail = dest
+                dest = src2
+            print ("sim %d: (%d, %d, %d, %d, (%d, %d))" % \
+                            (i, src1, src2, dest, op, shadow_on, shadow_off))
+            branch_res = alusim.op(op, 0, 0, src1, src2, dest) # no immediate
+            if is_branch:
+                if branch_res:
+                    siminsts += branch_ok
+                else:
+                    siminsts += branch_fail
+
+        # check status
+        yield from alusim.check(dut)
+        yield from alusim.dump(dut)
+
+
+def scoreboard_sim(dut, alusim):
+
+    seed(0)
+
+    for i in range(1):
+
+        # set random values in the registers
+        for i in range(1, dut.n_regs):
+            val = randint(0, (1<<alusim.rwidth)-1)
+            #val = 31+i*3
+            #val = i
+            yield dut.intregs.regs[i].reg.eq(val)
+            alusim.setval(i, val)
+
+        # create some instructions (some random, some regression tests)
+        instrs = []
+        if False:
+            instrs = create_random_ops(dut, 15, True, 4)
+
+        if False: # LD/ST test (with immediate)
+            instrs.append( (1, 2, 0, 0x10, 1, 1, (0, 0)) )
+            #instrs.append( (1, 2, 0, 0x10, 1, 1, (0, 0)) )
+
+        if True:
+            instrs.append( (1, 2, 2, 1, 1, 20, (0, 0)) )
+
+        if True:
+            instrs.append( (7, 3, 2, 4, 0, 0, (0, 0)) )
+            instrs.append( (7, 6, 6, 2, 0, 0, (0, 0)) )
+            instrs.append( (1, 7, 2, 2, 0, 0, (0, 0)) )
+
+        if True:
+            instrs.append((2, 3, 3, 0, 0, 0, (0, 0)))
+            instrs.append((5, 3, 3, 1, 0, 0, (0, 0)))
+            instrs.append((3, 5, 5, 2, 0, 0, (0, 0)))
+            instrs.append((5, 3, 3, 3, 0, 0, (0, 0)))
+            instrs.append((3, 5, 5, 0, 0, 0, (0, 0)))
+
+        if False:
+            instrs.append( (3, 3, 4, 0, 0, 13979, (0, 0)))
+            instrs.append( (6, 4, 1, 2, 0, 40976, (0, 0)))
+            instrs.append( (1, 4, 7, 4, 1, 23652, (0, 0)))
+
+        if False:
+            instrs.append((5, 6, 2, 1))
+            instrs.append((2, 2, 4, 0))
+            #instrs.append((2, 2, 3, 1))
+
+        if False:
+            instrs.append((2, 1, 2, 3))
+
+        if False:
+            instrs.append((2, 6, 2, 1))
+            instrs.append((2, 1, 2, 0))
+
+        if False:
+            instrs.append((1, 2, 7, 2))
+            instrs.append((7, 1, 5, 0))
+            instrs.append((4, 4, 1, 1))
+
+        if False:
+            instrs.append((5, 6, 2, 2))
+            instrs.append((1, 1, 4, 1))
+            instrs.append((6, 5, 3, 0))
+
+        if False:
+            # Write-after-Write Hazard
+            instrs.append( (3, 6, 7, 2) )
+            instrs.append( (4, 4, 7, 1) )
+
+        if False:
+            # self-read/write-after-write followed by Read-after-Write
+            instrs.append((1, 1, 1, 1))
+            instrs.append((1, 5, 3, 0))
+
+        if False:
+            # Read-after-Write followed by self-read-after-write
+            instrs.append((5, 6, 1, 2))
+            instrs.append((1, 1, 1, 1))
+
+        if False:
+            # self-read-write sandwich
+            instrs.append((5, 6, 1, 2))
+            instrs.append((1, 1, 1, 1))
+            instrs.append((1, 5, 3, 0))
+
+        if False:
+            # very weird failure
+            instrs.append( (5, 2, 5, 2) )
+            instrs.append( (2, 6, 3, 0) )
+            instrs.append( (4, 2, 2, 1) )
+
+        if False:
+            v1 = 4
+            yield dut.intregs.regs[5].reg.eq(v1)
+            alusim.setval(5, v1)
+            yield dut.intregs.regs[3].reg.eq(5)
+            alusim.setval(3, 5)
+            instrs.append((5, 3, 3, 4, (0, 0)))
+            instrs.append((4, 2, 1, 2, (0, 1)))
+
+        if False:
+            v1 = 6
+            yield dut.intregs.regs[5].reg.eq(v1)
+            alusim.setval(5, v1)
+            yield dut.intregs.regs[3].reg.eq(5)
+            alusim.setval(3, 5)
+            instrs.append((5, 3, 3, 4, (0, 0)))
+            instrs.append((4, 2, 1, 2, (1, 0)))
+
+        if False:
+            instrs.append( (4, 3, 5, 1, 0, (0, 0)) )
+            instrs.append( (5, 2, 3, 1, 0, (0, 0)) )
+            instrs.append( (7, 1, 5, 2, 0, (0, 0)) )
+            instrs.append( (5, 6, 6, 4, 0, (0, 0)) )
+            instrs.append( (7, 5, 2, 2, 0, (1, 0)) )
+            instrs.append( (1, 7, 5, 0, 0, (0, 1)) )
+            instrs.append( (1, 6, 1, 2, 0, (1, 0)) )
+            instrs.append( (1, 6, 7, 3, 0, (0, 0)) )
+            instrs.append( (6, 7, 7, 0, 0, (0, 0)) )
+
+        # issue instruction(s), wait for issue to be free before proceeding
+        for i, instr in enumerate(instrs):
+            src1, src2, dest, op, opi, imm, (br_ok, br_fail) = instr
+
+            print ("instr %d: (%d, %d, %d, %d, %d, %d)" % \
+                    (i, src1, src2, dest, op, opi, imm))
+            alusim.op(op, opi, imm, src1, src2, dest)
+            yield from instr_q(dut, op, opi, imm, src1, src2, dest,
+                               br_ok, br_fail)
+
+        # wait for all instructions to stop before checking
+        while True:
+            iqlen = yield dut.qlen_o
+            if iqlen == 0:
+                break
+            yield
+        yield
+        yield
+        yield
+        yield
+        yield from wait_for_busy_clear(dut)
+
+        # check status
+        yield from alusim.check(dut)
+        yield from alusim.dump(dut)
+
+
+def test_scoreboard():
+    dut = IssueToScoreboard(2, 1, 1, 16, 8, 8)
+    alusim = RegSim(16, 8)
+    memsim = MemSim(16, 16)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_scoreboard6600.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, scoreboard_sim(dut, alusim),
+                        vcd_name='test_scoreboard6600.vcd')
+
+    #run_simulation(dut, scoreboard_branch_sim(dut, alusim),
+    #                    vcd_name='test_scoreboard6600.vcd')
+
+
+if __name__ == '__main__':
+    test_scoreboard()
diff --git a/src/soc/iommu/axi_rab/axi4_ar_buffer.py b/src/soc/iommu/axi_rab/axi4_ar_buffer.py
new file mode 100644
index 00000000..1f3a5ff3
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi4_ar_buffer.py
@@ -0,0 +1,135 @@
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+# http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+# module axi4_ar_buffer
+#  #(
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_arid,
+#    input  logic               [31:0] s_axi4_araddr,
+#    input  logic                      s_axi4_arvalid,
+#    output logic                      s_axi4_arready,
+#    input  logic                [7:0] s_axi4_arlen,
+#    input  logic                [2:0] s_axi4_arsize,
+#    input  logic                [1:0] s_axi4_arburst,
+#    input  logic                      s_axi4_arlock,
+#    input  logic                [2:0] s_axi4_arprot,
+#    input  logic                [3:0] s_axi4_arcache,
+#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_arid,
+#    output logic               [31:0] m_axi4_araddr,
+#    output logic                      m_axi4_arvalid,
+#    input  logic                      m_axi4_arready,
+#    output logic                [7:0] m_axi4_arlen,
+#    output logic                [2:0] m_axi4_arsize,
+#    output logic                [1:0] m_axi4_arburst,
+#    output logic                      m_axi4_arlock,
+#    output logic                [2:0] m_axi4_arprot,
+#    output logic                [3:0] m_axi4_arcache,
+#    output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
+#  );
+
+
+class axi4_ar_buffer(Elaboratable):
+    """AXI4 AR-channel buffer (stub); widths default to the SV parameters."""
+    def __init__(self, AXI_ID_WIDTH=4, AXI_USER_WIDTH=4):
+        # self.axi4_aclk = Signal() # input
+        # self.axi4_arstn = Signal() # input
+        self.s_axi4_arid = Signal(AXI_ID_WIDTH)  # input
+        self.s_axi4_araddr = Signal(32)  # input
+        self.s_axi4_arvalid = Signal()  # input
+        self.s_axi4_arready = Signal()  # output
+        self.s_axi4_arlen = Signal(8)  # input
+        self.s_axi4_arsize = Signal(3)  # input
+        self.s_axi4_arburst = Signal(2)  # input
+        self.s_axi4_arlock = Signal()  # input
+        self.s_axi4_arprot = Signal(3)  # input
+        self.s_axi4_arcache = Signal(4)  # input
+        self.s_axi4_aruser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_arid = Signal(AXI_ID_WIDTH)  # output
+        self.m_axi4_araddr = Signal(32)  # output
+        self.m_axi4_arvalid = Signal()  # output
+        self.m_axi4_arready = Signal()  # input
+        self.m_axi4_arlen = Signal(8)  # output
+        self.m_axi4_arsize = Signal(3)  # output
+        self.m_axi4_arburst = Signal(2)  # output
+        self.m_axi4_arlock = Signal()  # output
+        self.m_axi4_arprot = Signal(3)  # output
+        self.m_axi4_arcache = Signal(4)  # output
+        self.m_axi4_aruser = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        #  #TODO use record types here
+        #  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_in;
+        #  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_out;
+
+        # assign data_in                                           [3:0] = s_axi4_arcache;
+        # assign data_in                                           [6:4] = s_axi4_arprot;
+        # assign data_in                                             [7] = s_axi4_arlock;
+        # assign data_in                                           [9:8] = s_axi4_arburst;
+        # assign data_in                                         [12:10] = s_axi4_arsize;
+        # assign data_in                                         [20:13] = s_axi4_arlen;
+        # assign data_in                                         [52:21] = s_axi4_araddr;
+        # assign data_in                            [52+AXI_ID_WIDTH:53] = s_axi4_arid;
+        # assign data_in[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH] = s_axi4_aruser;
+        #
+        # assign m_axi4_arcache = data_out[3:0];
+        # assign m_axi4_arprot  = data_out[6:4];
+        # assign m_axi4_arlock  = data_out[7];
+        # assign m_axi4_arburst = data_out[9:8];
+        # assign m_axi4_arsize  = data_out[12:10];
+        # assign m_axi4_arlen   = data_out[20:13];
+        # assign m_axi4_araddr  = data_out[52:21];
+        # assign m_axi4_arid    = data_out[52+AXI_ID_WIDTH:53];
+        # assign m_axi4_aruser  = data_out[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH];
+
+        # m.d.comb += self.m_axi4_arcache.eq(..)
+        # m.d.comb += self.m_axi4_arprot.eq(..)
+        # m.d.comb += self.m_axi4_arlock.eq(..)
+        # m.d.comb += self.m_axi4_arburst.eq(..)
+        # m.d.comb += self.m_axi4_arsize.eq(..)
+        # m.d.comb += self.m_axi4_arlen.eq(..)
+        # m.d.comb += self.m_axi4_araddr.eq(..)
+        # m.d.comb += self.m_axi4_arid.eq(..)
+        # m.d.comb += self.m_axi4_aruser.eq(..)
+        return m
+
+# TODO convert axi_buffer_rab.sv
+#
+#  axi_buffer_rab
+#    #(
+#      .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+53  ),
+#      .BUFFER_DEPTH ( 4                               )
+#      )
+#    u_buffer
+#    (
+#      .clk       ( axi4_aclk      ),
+#      .rstn      ( axi4_arstn     ),
+#      .valid_out ( m_axi4_arvalid ),
+#      .data_out  ( data_out       ),
+#      .ready_in  ( m_axi4_arready ),
+#      .valid_in  ( s_axi4_arvalid ),
+#      .data_in   ( data_in        ),
+#      .ready_out ( s_axi4_arready )
+#    );
+#
+
+# endmodule
diff --git a/src/soc/iommu/axi_rab/axi4_ar_sender.py b/src/soc/iommu/axi_rab/axi4_ar_sender.py
new file mode 100644
index 00000000..4cbd97d5
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi4_ar_sender.py
@@ -0,0 +1,232 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_ar_sender(Elaboratable):
+    """AXI4 AR sender; only the ENABLE_L2TLB == 0 variant is implemented."""
+    def __init__(self, AXI_ADDR_WIDTH=40, AXI_ID_WIDTH=4, AXI_USER_WIDTH=4):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.l1_done_o = Signal()  # output
+        self.l1_accept_i = Signal()  # input
+        self.l1_drop_i = Signal()  # input
+        self.l1_save_i = Signal()  # input
+        self.l2_done_o = Signal()  # output
+        self.l2_accept_i = Signal()  # input
+        self.l2_drop_i = Signal()  # input
+        self.l2_sending_o = Signal()  # output
+        self.l1_araddr_i = Signal(AXI_ADDR_WIDTH)  # input
+        self.l2_araddr_i = Signal(AXI_ADDR_WIDTH)  # input
+        self.s_axi4_arid = Signal(AXI_ID_WIDTH)  # input
+        self.s_axi4_arvalid = Signal()  # input
+        self.s_axi4_arready = Signal()  # output
+        self.s_axi4_arlen = Signal(8)  # input
+        self.s_axi4_arsize = Signal(3)  # input
+        self.s_axi4_arburst = Signal(2)  # input
+        self.s_axi4_arlock = Signal()  # input
+        self.s_axi4_arprot = Signal(3)  # input
+        self.s_axi4_arcache = Signal(4)  # input
+        self.s_axi4_aruser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_arid = Signal(AXI_ID_WIDTH)  # output
+        self.m_axi4_araddr = Signal(AXI_ADDR_WIDTH)  # output
+        self.m_axi4_arvalid = Signal()  # output
+        self.m_axi4_arready = Signal()  # input
+        self.m_axi4_arlen = Signal(8)  # output
+        self.m_axi4_arsize = Signal(3)  # output
+        self.m_axi4_arburst = Signal(2)  # output
+        self.m_axi4_arlock = Signal()  # output
+        self.m_axi4_arprot = Signal(3)  # output
+        self.m_axi4_arcache = Signal(4)  # output
+        self.m_axi4_aruser = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        comb = m.d.comb
+        # internal wires (ENABLE_L2TLB == 0 variant of the SV source below)
+        l1_save = Signal()
+        l2_available_q = Signal()
+        comb += l1_save.eq(self.l1_save_i & l2_available_q)
+        comb += self.l1_done_o.eq(self.s_axi4_arvalid & self.s_axi4_arready)
+        # forward a transaction accepted by L1 (or one being sent by L2)
+        comb += self.m_axi4_arvalid.eq((self.s_axi4_arvalid &
+                                        self.l1_accept_i) |
+                                       self.l2_sending_o)
+        # ready when forwarded downstream, or when the request is dropped/saved
+        comb += self.s_axi4_arready.eq((self.m_axi4_arvalid &
+                                        self.m_axi4_arready &
+                                        ~self.l2_sending_o) |
+                                       (self.s_axi4_arvalid &
+                                        (self.l1_drop_i | l1_save)))
+        comb += self.m_axi4_aruser.eq(self.s_axi4_aruser)
+        comb += self.m_axi4_arcache.eq(self.s_axi4_arcache)
+        comb += self.m_axi4_arprot.eq(self.s_axi4_arprot)
+        comb += self.m_axi4_arlock.eq(self.s_axi4_arlock)
+        comb += self.m_axi4_arburst.eq(self.s_axi4_arburst)
+        comb += self.m_axi4_arsize.eq(self.s_axi4_arsize)
+        comb += self.m_axi4_arlen.eq(self.s_axi4_arlen)
+        comb += self.m_axi4_araddr.eq(self.l1_araddr_i)
+        comb += self.m_axi4_arid.eq(self.s_axi4_arid)
+        comb += self.l2_sending_o.eq(0)
+        comb += l2_available_q.eq(0)
+        comb += self.l2_done_o.eq(0)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_ar_sender
+#  #(
+#    parameter AXI_ADDR_WIDTH = 40,
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4,
+#    parameter ENABLE_L2TLB   = 0
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    output logic                      l1_done_o,
+#    input  logic                      l1_accept_i,
+#    input  logic                      l1_drop_i,
+#    input  logic                      l1_save_i,
+#
+#    output logic                      l2_done_o,
+#    input  logic                      l2_accept_i,
+#    input  logic                      l2_drop_i,
+#    output logic                      l2_sending_o,
+#
+#    input  logic [AXI_ADDR_WIDTH-1:0] l1_araddr_i,
+#    input  logic [AXI_ADDR_WIDTH-1:0] l2_araddr_i,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_arid,
+#    input  logic                      s_axi4_arvalid,
+#    output logic                      s_axi4_arready,
+#    input  logic                [7:0] s_axi4_arlen,
+#    input  logic                [2:0] s_axi4_arsize,
+#    input  logic                [1:0] s_axi4_arburst,
+#    input  logic                      s_axi4_arlock,
+#    input  logic                [2:0] s_axi4_arprot,
+#    input  logic                [3:0] s_axi4_arcache,
+#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_arid,
+#    output logic [AXI_ADDR_WIDTH-1:0] m_axi4_araddr,
+#    output logic                      m_axi4_arvalid,
+#    input  logic                      m_axi4_arready,
+#    output logic                [7:0] m_axi4_arlen,
+#    output logic                [2:0] m_axi4_arsize,
+#    output logic                [1:0] m_axi4_arburst,
+#    output logic                      m_axi4_arlock,
+#    output logic                [2:0] m_axi4_arprot,
+#    output logic                [3:0] m_axi4_arcache,
+#    output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
+#  );
+#
+#  logic l1_save;
+#
+#  logic l2_sent;
+#  logic l2_available_q;
+#
+#  assign l1_save      = l1_save_i & l2_available_q;
+#
+#  assign l1_done_o    = s_axi4_arvalid & s_axi4_arready ;
+#
+#  // if 1: accept and forward a transaction translated by L1
+#  //    2: drop or save request (if L2 slot not occupied already)
+#  assign m_axi4_arvalid = (s_axi4_arvalid & l1_accept_i) |
+#                          l2_sending_o;
+#  assign s_axi4_arready = (m_axi4_arvalid & m_axi4_arready & ~l2_sending_o) |
+#                          (s_axi4_arvalid & (l1_drop_i | l1_save));
+#
+# generate
+#  if (ENABLE_L2TLB == 1) begin
+#    logic [AXI_USER_WIDTH-1:0] l2_axi4_aruser  ;
+#    logic                [3:0] l2_axi4_arcache ;
+#    logic                [3:0] l2_axi4_arregion;
+#    logic                [3:0] l2_axi4_arqos   ;
+#    logic                [2:0] l2_axi4_arprot  ;
+#    logic                      l2_axi4_arlock  ;
+#    logic                [1:0] l2_axi4_arburst ;
+#    logic                [2:0] l2_axi4_arsize  ;
+#    logic                [7:0] l2_axi4_arlen   ;
+#    logic   [AXI_ID_WIDTH-1:0] l2_axi4_arid    ;
+#
+#    assign m_axi4_aruser  = l2_sending_o ? l2_axi4_aruser   : s_axi4_aruser;
+#    assign m_axi4_arcache = l2_sending_o ? l2_axi4_arcache  : s_axi4_arcache;
+#    assign m_axi4_arprot  = l2_sending_o ? l2_axi4_arprot   : s_axi4_arprot;
+#    assign m_axi4_arlock  = l2_sending_o ? l2_axi4_arlock   : s_axi4_arlock;
+#    assign m_axi4_arburst = l2_sending_o ? l2_axi4_arburst  : s_axi4_arburst;
+#    assign m_axi4_arsize  = l2_sending_o ? l2_axi4_arsize   : s_axi4_arsize;
+#    assign m_axi4_arlen   = l2_sending_o ? l2_axi4_arlen    : s_axi4_arlen;
+#    assign m_axi4_araddr  = l2_sending_o ? l2_araddr_i      : l1_araddr_i;
+#    assign m_axi4_arid    = l2_sending_o ? l2_axi4_arid     : s_axi4_arid;
+#
+#    // Buffer AXI signals in case of L1 miss
+#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
+#      if (axi4_arstn == 1'b0) begin
+#        l2_axi4_aruser  <=  'b0;
+#        l2_axi4_arcache <=  'b0;
+#        l2_axi4_arprot  <=  'b0;
+#        l2_axi4_arlock  <= 1'b0;
+#        l2_axi4_arburst <=  'b0;
+#        l2_axi4_arsize  <=  'b0;
+#        l2_axi4_arlen   <=  'b0;
+#        l2_axi4_arid    <=  'b0;
+#      end else if (l1_save) begin
+#        l2_axi4_aruser  <= s_axi4_aruser;
+#        l2_axi4_arcache <= s_axi4_arcache;
+#        l2_axi4_arprot  <= s_axi4_arprot;
+#        l2_axi4_arlock  <= s_axi4_arlock;
+#        l2_axi4_arburst <= s_axi4_arburst;
+#        l2_axi4_arsize  <= s_axi4_arsize;
+#        l2_axi4_arlen   <= s_axi4_arlen;
+#        l2_axi4_arid    <= s_axi4_arid;
+#      end
+#    end
+#
+#    // signal that an l1_save_i can be accepted
+#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
+#      if (axi4_arstn == 1'b0) begin
+#        l2_available_q <= 1'b1;
+#      end else if (l2_sent | l2_drop_i) begin
+#        l2_available_q <= 1'b1;
+#      end else if (l1_save) begin
+#        l2_available_q <= 1'b0;
+#      end
+#    end
+#
+#    assign l2_sending_o = l2_accept_i & ~l2_available_q;
+#    assign l2_sent      = l2_sending_o & m_axi4_arvalid & m_axi4_arready;
+#
+#    // if 1: having sent out a transaction translated by L2
+#    //    2: drop request (L2 slot is available again)
+#    assign l2_done_o    = l2_sent | l2_drop_i;
+#
+#  end else begin // !`ifdef ENABLE_L2TLB
+#    assign m_axi4_aruser  =  s_axi4_aruser;
+#    assign m_axi4_arcache =  s_axi4_arcache;
+#    assign m_axi4_arprot  =  s_axi4_arprot;
+#    assign m_axi4_arlock  =  s_axi4_arlock;
+#    assign m_axi4_arburst =  s_axi4_arburst;
+#    assign m_axi4_arsize  =  s_axi4_arsize;
+#    assign m_axi4_arlen   =  s_axi4_arlen;
+#    assign m_axi4_araddr  =  l1_araddr_i;
+#    assign m_axi4_arid    =  s_axi4_arid;
+#
+#    assign l2_sending_o   = 1'b0;
+#    assign l2_available_q = 1'b0;
+#    assign l2_done_o      = 1'b0;
+#  end // else: !if(ENABLE_L2TLB == 1)
+# endgenerate
+#
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/axi4_aw_buffer.py b/src/soc/iommu/axi_rab/axi4_aw_buffer.py
new file mode 100644
index 00000000..f5ca37d1
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi4_aw_buffer.py
@@ -0,0 +1,180 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable, Instance
+
+
+class axi4_aw_buffer(Elaboratable):
+    """Buffer stage for the AXI4 write-address (AW) channel.
+
+    The slave-side AW payload fields are packed into one word, passed
+    through a depth-4 ``axi_buffer_rab`` FIFO and unpacked again on the
+    master side.  The valid/ready handshakes are driven by the FIFO.
+
+    The FIFO is a black-box ``Instance``: an ``axi_buffer_rab``
+    implementation has to be supplied to the toolchain separately.
+
+    Arguments:
+    * AXI_ID_WIDTH: width of the awid fields (SV default: 4)
+    * AXI_USER_WIDTH: width of the awuser fields (SV default: 4)
+    """
+
+    def __init__(self, AXI_ID_WIDTH=4, AXI_USER_WIDTH=4):
+        # fixed-width fields: cache 4 + prot 3 + lock 1 + burst 2 + size 3
+        # + len 8 + addr 32 + region 4 + qos 4 = 61 bits
+        self.data_width = AXI_ID_WIDTH + AXI_USER_WIDTH + 61
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.s_axi4_awid = Signal(AXI_ID_WIDTH)  # input
+        self.s_axi4_awaddr = Signal(32)  # input
+        self.s_axi4_awvalid = Signal()  # input
+        self.s_axi4_awready = Signal()  # output
+        self.s_axi4_awlen = Signal(8)  # input
+        self.s_axi4_awsize = Signal(3)  # input
+        self.s_axi4_awburst = Signal(2)  # input
+        self.s_axi4_awlock = Signal()  # input
+        self.s_axi4_awprot = Signal(3)  # input
+        self.s_axi4_awcache = Signal(4)  # input
+        self.s_axi4_awregion = Signal(4)  # input
+        self.s_axi4_awqos = Signal(4)  # input
+        self.s_axi4_awuser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_awid = Signal(AXI_ID_WIDTH)  # output
+        self.m_axi4_awaddr = Signal(32)  # output
+        self.m_axi4_awvalid = Signal()  # output
+        self.m_axi4_awready = Signal()  # input
+        self.m_axi4_awlen = Signal(8)  # output
+        self.m_axi4_awsize = Signal(3)  # output
+        self.m_axi4_awburst = Signal(2)  # output
+        self.m_axi4_awlock = Signal()  # output
+        self.m_axi4_awprot = Signal(3)  # output
+        self.m_axi4_awcache = Signal(4)  # output
+        self.m_axi4_awregion = Signal(4)  # output
+        self.m_axi4_awqos = Signal(4)  # output
+        self.m_axi4_awuser = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        data_in = Signal(self.data_width)
+        data_out = Signal(self.data_width)
+
+        # Cat() is LSB-first: awcache -> [3:0] ... awuser -> top bits
+        m.d.comb += data_in.eq(Cat(
+            self.s_axi4_awcache, self.s_axi4_awprot, self.s_axi4_awlock,
+            self.s_axi4_awburst, self.s_axi4_awsize, self.s_axi4_awlen,
+            self.s_axi4_awaddr, self.s_axi4_awregion, self.s_axi4_awqos,
+            self.s_axi4_awid, self.s_axi4_awuser))
+        m.d.comb += Cat(
+            self.m_axi4_awcache, self.m_axi4_awprot, self.m_axi4_awlock,
+            self.m_axi4_awburst, self.m_axi4_awsize, self.m_axi4_awlen,
+            self.m_axi4_awaddr, self.m_axi4_awregion, self.m_axi4_awqos,
+            self.m_axi4_awid, self.m_axi4_awuser).eq(data_out)
+
+        # u_buffer: the FIFO between the packed slave and master words
+        m.submodules.u_buffer = Instance(
+            "axi_buffer_rab",
+            p_DATA_WIDTH=self.data_width,
+            p_BUFFER_DEPTH=4,
+            i_clk=self.axi4_aclk,
+            i_rstn=self.axi4_arstn,
+            o_valid_out=self.m_axi4_awvalid,
+            o_data_out=data_out,
+            i_ready_in=self.m_axi4_awready,
+            i_valid_in=self.s_axi4_awvalid,
+            i_data_in=data_in,
+            o_ready_out=self.s_axi4_awready,
+        )
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_aw_buffer
+#  #(
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_awid,
+#    input  logic               [31:0] s_axi4_awaddr,
+#    input  logic                      s_axi4_awvalid,
+#    output logic                      s_axi4_awready,
+#    input  logic                [7:0] s_axi4_awlen,
+#    input  logic                [2:0] s_axi4_awsize,
+#    input  logic                [1:0] s_axi4_awburst,
+#    input  logic                      s_axi4_awlock,
+#    input  logic                [2:0] s_axi4_awprot,
+#    input  logic                [3:0] s_axi4_awcache,
+#    input  logic                [3:0] s_axi4_awregion,
+#    input  logic                [3:0] s_axi4_awqos,
+#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_awid,
+#    output logic               [31:0] m_axi4_awaddr,
+#    output logic                      m_axi4_awvalid,
+#    input  logic                      m_axi4_awready,
+#    output logic                [7:0] m_axi4_awlen,
+#    output logic                [2:0] m_axi4_awsize,
+#    output logic                [1:0] m_axi4_awburst,
+#    output logic                      m_axi4_awlock,
+#    output logic                [2:0] m_axi4_awprot,
+#    output logic                [3:0] m_axi4_awcache,
+#    output logic                [3:0] m_axi4_awregion,
+#    output logic                [3:0] m_axi4_awqos,
+#    output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
+#  );
+#
+#  wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_in;
+#  wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_out;
+#
+#  assign data_in                                            [3:0] = s_axi4_awcache;
+#  assign data_in                                            [6:4] = s_axi4_awprot;
+#  assign data_in                                              [7] = s_axi4_awlock;
+#  assign data_in                                            [9:8] = s_axi4_awburst;
+#  assign data_in                                          [12:10] = s_axi4_awsize;
+#  assign data_in                                          [20:13] = s_axi4_awlen;
+#  assign data_in                                          [52:21] = s_axi4_awaddr;
+#  assign data_in                                          [56:53] = s_axi4_awregion;
+#  assign data_in                                          [60:57] = s_axi4_awqos;
+#  assign data_in                             [60+AXI_ID_WIDTH:61] = s_axi4_awid;
+#  assign data_in [60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH] = s_axi4_awuser;
+#
+#  assign m_axi4_awcache  = data_out[3:0];
+#  assign m_axi4_awprot   = data_out[6:4];
+#  assign m_axi4_awlock   = data_out[7];
+#  assign m_axi4_awburst  = data_out[9:8];
+#  assign m_axi4_awsize   = data_out[12:10];
+#  assign m_axi4_awlen    = data_out[20:13];
+#  assign m_axi4_awaddr   = data_out[52:21];
+#  assign m_axi4_awregion = data_out[56:53];
+#  assign m_axi4_awqos    = data_out[60:57];
+#  assign m_axi4_awid     = data_out[60+AXI_ID_WIDTH:61];
+#  assign m_axi4_awuser   = data_out[60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH];
+#
+#  axi_buffer_rab
+#    #(
+#      .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+61  ),
+#      .BUFFER_DEPTH ( 4                               )
+#    )
+#    u_buffer
+#    (
+#      .clk       ( axi4_aclk      ),
+#      .rstn      ( axi4_arstn     ),
+#      .valid_out ( m_axi4_awvalid ),
+#      .data_out  ( data_out       ),
+#      .ready_in  ( m_axi4_awready ),
+#      .valid_in  ( s_axi4_awvalid ),
+#      .data_in   ( data_in        ),
+#      .ready_out ( s_axi4_awready )
+#    );
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/axi4_aw_sender.py b/src/soc/iommu/axi_rab/axi4_aw_sender.py
new file mode 100644
index 00000000..fbc917df
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi4_aw_sender.py
@@ -0,0 +1,266 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_aw_sender(Elaboratable):
+    """Sender for the AXI4 write-address (AW) channel.
+
+    Only the ENABLE_L2TLB == 0 configuration of the SystemVerilog module
+    (its default) is translated: a slave request is forwarded with the
+    L1-translated address when l1_accept_i is set, and is consumed
+    without being forwarded when l1_drop_i is set.  The L2 outputs are
+    tied to 0; l1_save_i, l2_accept_i, l2_drop_i and l2_awaddr_i have no
+    effect in this configuration.
+
+    Arguments:
+    * AXI_ADDR_WIDTH: width of the address fields (SV default: 40)
+    * AXI_ID_WIDTH: width of the awid fields (SV default: 4)
+    * AXI_USER_WIDTH: width of the awuser fields (SV default: 4)
+    """
+
+    def __init__(self, AXI_ADDR_WIDTH=40, AXI_ID_WIDTH=4, AXI_USER_WIDTH=4):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.l1_done_o = Signal()  # output
+        self.l1_accept_i = Signal()  # input
+        self.l1_drop_i = Signal()  # input
+        self.l1_save_i = Signal()  # input
+        self.l2_done_o = Signal()  # output
+        self.l2_accept_i = Signal()  # input
+        self.l2_drop_i = Signal()  # input
+        self.l2_sending_o = Signal()  # output
+        self.l1_awaddr_i = Signal(AXI_ADDR_WIDTH)  # input
+        self.l2_awaddr_i = Signal(AXI_ADDR_WIDTH)  # input
+        self.s_axi4_awid = Signal(AXI_ID_WIDTH)  # input
+        self.s_axi4_awvalid = Signal()  # input
+        self.s_axi4_awready = Signal()  # output
+        self.s_axi4_awlen = Signal(8)  # input
+        self.s_axi4_awsize = Signal(3)  # input
+        self.s_axi4_awburst = Signal(2)  # input
+        self.s_axi4_awlock = Signal()  # input
+        self.s_axi4_awprot = Signal(3)  # input
+        self.s_axi4_awcache = Signal(4)  # input
+        self.s_axi4_awregion = Signal(4)  # input
+        self.s_axi4_awqos = Signal(4)  # input
+        self.s_axi4_awuser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_awid = Signal(AXI_ID_WIDTH)  # output
+        self.m_axi4_awaddr = Signal(AXI_ADDR_WIDTH)  # output
+        self.m_axi4_awvalid = Signal()  # output
+        self.m_axi4_awready = Signal()  # input
+        self.m_axi4_awlen = Signal(8)  # output
+        self.m_axi4_awsize = Signal(3)  # output
+        self.m_axi4_awburst = Signal(2)  # output
+        self.m_axi4_awlock = Signal()  # output
+        self.m_axi4_awprot = Signal(3)  # output
+        self.m_axi4_awcache = Signal(4)  # output
+        self.m_axi4_awregion = Signal(4)  # output
+        self.m_axi4_awqos = Signal(4)  # output
+        self.m_axi4_awuser = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        l1_save = Signal()
+        l2_available_q = Signal()
+
+        # no L2 TLB: the save slot is never available and nothing is sent
+        m.d.comb += l2_available_q.eq(0)
+        m.d.comb += self.l2_sending_o.eq(0)
+        m.d.comb += self.l2_done_o.eq(0)
+
+        m.d.comb += l1_save.eq(self.l1_save_i & l2_available_q)
+        m.d.comb += self.l1_done_o.eq(
+            self.s_axi4_awvalid & self.s_axi4_awready)
+
+        # if 1: accept and forward a transaction translated by L1
+        #    2: drop or save request (if L2 slot not occupied already)
+        m.d.comb += self.m_axi4_awvalid.eq(
+            (self.s_axi4_awvalid & self.l1_accept_i) | self.l2_sending_o)
+        m.d.comb += self.s_axi4_awready.eq(
+            (self.m_axi4_awvalid & self.m_axi4_awready & ~self.l2_sending_o)
+            | (self.s_axi4_awvalid & (self.l1_drop_i | l1_save)))
+
+        # payload passes through; the address comes from the L1 translation
+        m.d.comb += self.m_axi4_awuser.eq(self.s_axi4_awuser)
+        m.d.comb += self.m_axi4_awcache.eq(self.s_axi4_awcache)
+        m.d.comb += self.m_axi4_awregion.eq(self.s_axi4_awregion)
+        m.d.comb += self.m_axi4_awqos.eq(self.s_axi4_awqos)
+        m.d.comb += self.m_axi4_awprot.eq(self.s_axi4_awprot)
+        m.d.comb += self.m_axi4_awlock.eq(self.s_axi4_awlock)
+        m.d.comb += self.m_axi4_awburst.eq(self.s_axi4_awburst)
+        m.d.comb += self.m_axi4_awsize.eq(self.s_axi4_awsize)
+        m.d.comb += self.m_axi4_awlen.eq(self.s_axi4_awlen)
+        m.d.comb += self.m_axi4_awaddr.eq(self.l1_awaddr_i)
+        m.d.comb += self.m_axi4_awid.eq(self.s_axi4_awid)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_aw_sender
+#  #(
+#    parameter AXI_ADDR_WIDTH   = 40,
+#    parameter AXI_ID_WIDTH     = 4,
+#    parameter AXI_USER_WIDTH   = 4,
+#    parameter ENABLE_L2TLB     = 0
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    output logic                      l1_done_o,
+#    input  logic                      l1_accept_i,
+#    input  logic                      l1_drop_i,
+#    input  logic                      l1_save_i,
+#
+#    output logic                      l2_done_o,
+#    input  logic                      l2_accept_i,
+#    input  logic                      l2_drop_i,
+#    output logic                      l2_sending_o,
+#
+#    input  logic [AXI_ADDR_WIDTH-1:0] l1_awaddr_i,
+#    input  logic [AXI_ADDR_WIDTH-1:0] l2_awaddr_i,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_awid,
+#    input  logic                      s_axi4_awvalid,
+#    output logic                      s_axi4_awready,
+#    input  logic                [7:0] s_axi4_awlen,
+#    input  logic                [2:0] s_axi4_awsize,
+#    input  logic                [1:0] s_axi4_awburst,
+#    input  logic                      s_axi4_awlock,
+#    input  logic                [2:0] s_axi4_awprot,
+#    input  logic                [3:0] s_axi4_awcache,
+#    input  logic                [3:0] s_axi4_awregion,
+#    input  logic                [3:0] s_axi4_awqos,
+#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_awid,
+#    output logic [AXI_ADDR_WIDTH-1:0] m_axi4_awaddr,
+#    output logic                      m_axi4_awvalid,
+#    input  logic                      m_axi4_awready,
+#    output logic                [7:0] m_axi4_awlen,
+#    output logic                [2:0] m_axi4_awsize,
+#    output logic                [1:0] m_axi4_awburst,
+#    output logic                      m_axi4_awlock,
+#    output logic                [2:0] m_axi4_awprot,
+#    output logic                [3:0] m_axi4_awcache,
+#    output logic                [3:0] m_axi4_awregion,
+#    output logic                [3:0] m_axi4_awqos,
+#    output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
+#  );
+#
+#  logic l1_save;
+#
+#  logic l2_sent;
+#  logic l2_available_q;
+#
+#  assign l1_save      = l1_save_i & l2_available_q;
+#
+#  assign l1_done_o    = s_axi4_awvalid & s_axi4_awready ;
+#
+#  // if 1: accept and forward a transaction translated by L1
+#  //    2: drop or save request (if L2 slot not occupied already)
+#  assign m_axi4_awvalid = (s_axi4_awvalid & l1_accept_i) |
+#                          l2_sending_o;
+#  assign s_axi4_awready = (m_axi4_awvalid & m_axi4_awready & ~l2_sending_o) |
+#                          (s_axi4_awvalid & (l1_drop_i | l1_save));
+#
+# generate
+#  if (ENABLE_L2TLB    == 1) begin
+#    logic [AXI_USER_WIDTH-1:0] l2_axi4_awuser  ;
+#    logic                [3:0] l2_axi4_awcache ;
+#    logic                [3:0] l2_axi4_awregion;
+#    logic                [3:0] l2_axi4_awqos   ;
+#    logic                [2:0] l2_axi4_awprot  ;
+#    logic                      l2_axi4_awlock  ;
+#    logic                [1:0] l2_axi4_awburst ;
+#    logic                [2:0] l2_axi4_awsize  ;
+#    logic                [7:0] l2_axi4_awlen   ;
+#    logic   [AXI_ID_WIDTH-1:0] l2_axi4_awid    ;
+#
+#    assign m_axi4_awuser   = l2_sending_o ? l2_axi4_awuser   : s_axi4_awuser;
+#    assign m_axi4_awcache  = l2_sending_o ? l2_axi4_awcache  : s_axi4_awcache;
+#    assign m_axi4_awregion = l2_sending_o ? l2_axi4_awregion : s_axi4_awregion;
+#    assign m_axi4_awqos    = l2_sending_o ? l2_axi4_awqos    : s_axi4_awqos;
+#    assign m_axi4_awprot   = l2_sending_o ? l2_axi4_awprot   : s_axi4_awprot;
+#    assign m_axi4_awlock   = l2_sending_o ? l2_axi4_awlock   : s_axi4_awlock;
+#    assign m_axi4_awburst  = l2_sending_o ? l2_axi4_awburst  : s_axi4_awburst;
+#    assign m_axi4_awsize   = l2_sending_o ? l2_axi4_awsize   : s_axi4_awsize;
+#    assign m_axi4_awlen    = l2_sending_o ? l2_axi4_awlen    : s_axi4_awlen;
+#    assign m_axi4_awaddr   = l2_sending_o ? l2_awaddr_i      : l1_awaddr_i;
+#    assign m_axi4_awid     = l2_sending_o ? l2_axi4_awid     : s_axi4_awid;
+#
+#    // buffer AXI signals in case of L1 miss
+#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
+#      if (axi4_arstn == 1'b0) begin
+#        l2_axi4_awuser   <=  'b0;
+#        l2_axi4_awcache  <=  'b0;
+#        l2_axi4_awregion <=  'b0;
+#        l2_axi4_awqos    <=  'b0;
+#        l2_axi4_awprot   <=  'b0;
+#        l2_axi4_awlock   <= 1'b0;
+#        l2_axi4_awburst  <=  'b0;
+#        l2_axi4_awsize   <=  'b0;
+#        l2_axi4_awlen    <=  'b0;
+#        l2_axi4_awid     <=  'b0;
+#      end else if (l1_save) begin
+#        l2_axi4_awuser   <= s_axi4_awuser;
+#        l2_axi4_awcache  <= s_axi4_awcache;
+#        l2_axi4_awregion <= s_axi4_awregion;
+#        l2_axi4_awqos    <= s_axi4_awqos;
+#        l2_axi4_awprot   <= s_axi4_awprot;
+#        l2_axi4_awlock   <= s_axi4_awlock;
+#        l2_axi4_awburst  <= s_axi4_awburst;
+#        l2_axi4_awsize   <= s_axi4_awsize;
+#        l2_axi4_awlen    <= s_axi4_awlen;
+#        l2_axi4_awid     <= s_axi4_awid;
+#      end
+#    end
+#
+#    // signal that an l1_save_i can be accepted
+#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
+#      if (axi4_arstn == 1'b0) begin
+#        l2_available_q <= 1'b1;
+#      end else if (l2_sent | l2_drop_i) begin
+#        l2_available_q <= 1'b1;
+#      end else if (l1_save) begin
+#        l2_available_q <= 1'b0;
+#      end
+#    end
+#
+#    assign l2_sending_o = l2_accept_i & ~l2_available_q;
+#    assign l2_sent      = l2_sending_o & m_axi4_awvalid & m_axi4_awready;
+#
+#    // if 1: having sent out a transaction translated by L2
+#    //    2: drop request (L2 slot is available again)
+#    assign l2_done_o    = l2_sent | l2_drop_i;
+#
+#  end else begin // !`ifdef ENABLE_L2TLB
+#    assign m_axi4_awuser   =  s_axi4_awuser;
+#    assign m_axi4_awcache  =  s_axi4_awcache;
+#    assign m_axi4_awregion =  s_axi4_awregion;
+#    assign m_axi4_awqos    =  s_axi4_awqos;
+#    assign m_axi4_awprot   =  s_axi4_awprot;
+#    assign m_axi4_awlock   =  s_axi4_awlock;
+#    assign m_axi4_awburst  =  s_axi4_awburst;
+#    assign m_axi4_awsize   =  s_axi4_awsize;
+#    assign m_axi4_awlen    =  s_axi4_awlen;
+#    assign m_axi4_awaddr   =  l1_awaddr_i;
+#    assign m_axi4_awid     =  s_axi4_awid;
+#
+#    assign l2_sending_o    = 1'b0;
+#    assign l2_available_q  = 1'b0;
+#    assign l2_done_o       = 1'b0;
+#  end // !`ifdef ENABLE_L2TLB
+# endgenerate
+#
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/axi4_b_buffer.py b/src/soc/iommu/axi_rab/axi4_b_buffer.py
new file mode 100644
index 00000000..42fce1ad
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi4_b_buffer.py
@@ -0,0 +1,127 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable, Instance
+
+
+class axi4_b_buffer(Elaboratable):
+    """Buffer stage for the AXI4 write-response (B) channel.
+
+    The master-side B payload (bresp, bid, buser) is packed into one
+    word, passed through a depth-4 ``axi_buffer_rab`` FIFO and unpacked
+    again on the slave side.  The valid/ready handshakes are driven by
+    the FIFO.
+
+    The FIFO is a black-box ``Instance``: an ``axi_buffer_rab``
+    implementation has to be supplied to the toolchain separately.
+
+    Arguments:
+    * AXI_ID_WIDTH: width of the bid fields (SV default: 4)
+    * AXI_USER_WIDTH: width of the buser fields (SV default: 4)
+    """
+
+    def __init__(self, AXI_ID_WIDTH=4, AXI_USER_WIDTH=4):
+        self.data_width = AXI_ID_WIDTH + AXI_USER_WIDTH + 2  # +2: bresp
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.s_axi4_bid = Signal(AXI_ID_WIDTH)  # output
+        self.s_axi4_bresp = Signal(2)  # output
+        self.s_axi4_bvalid = Signal()  # output
+        self.s_axi4_buser = Signal(AXI_USER_WIDTH)  # output
+        self.s_axi4_bready = Signal()  # input
+        self.m_axi4_bid = Signal(AXI_ID_WIDTH)  # input
+        self.m_axi4_bresp = Signal(2)  # input
+        self.m_axi4_bvalid = Signal()  # input
+        self.m_axi4_buser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_bready = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        data_in = Signal(self.data_width)
+        data_out = Signal(self.data_width)
+
+        # Cat() is LSB-first: bresp -> [1:0], then bid, then buser
+        m.d.comb += data_in.eq(Cat(
+            self.m_axi4_bresp, self.m_axi4_bid, self.m_axi4_buser))
+        m.d.comb += Cat(
+            self.s_axi4_bresp, self.s_axi4_bid, self.s_axi4_buser
+        ).eq(data_out)
+
+        # u_buffer: the FIFO between the packed master and slave words
+        m.submodules.u_buffer = Instance(
+            "axi_buffer_rab",
+            p_DATA_WIDTH=self.data_width,
+            p_BUFFER_DEPTH=4,
+            i_clk=self.axi4_aclk,
+            i_rstn=self.axi4_arstn,
+            o_valid_out=self.s_axi4_bvalid,
+            o_data_out=data_out,
+            i_ready_in=self.s_axi4_bready,
+            i_valid_in=self.m_axi4_bvalid,
+            i_data_in=data_in,
+            o_ready_out=self.m_axi4_bready,
+        )
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_b_buffer
+#  #(
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_bid,
+#    output logic                [1:0] s_axi4_bresp,
+#    output logic                      s_axi4_bvalid,
+#    output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
+#    input  logic                      s_axi4_bready,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_bid,
+#    input  logic                [1:0] m_axi4_bresp,
+#    input  logic                      m_axi4_bvalid,
+#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
+#    output logic                      m_axi4_bready
+#  );
+#
+#  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_in;
+#  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_out;
+#
+#  assign data_in                                         [1:0] = m_axi4_bresp;
+#  assign data_in                            [AXI_ID_WIDTH+1:2] = m_axi4_bid;
+#  assign data_in[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2] = m_axi4_buser;
+#
+#  assign s_axi4_buser = data_out[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2];
+#  assign s_axi4_bid   = data_out[AXI_ID_WIDTH+1:2];
+#  assign s_axi4_bresp = data_out[1:0];
+#
+#  axi_buffer_rab
+#  #(
+#    .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+2 ),
+#    .BUFFER_DEPTH ( 4                             )
+#    )
+#  u_buffer
+#  (
+#    .clk      ( axi4_aclk     ),
+#    .rstn     ( axi4_arstn    ),
+#    .valid_out( s_axi4_bvalid ),
+#    .data_out ( data_out      ),
+#    .ready_in ( s_axi4_bready ),
+#    .valid_in ( m_axi4_bvalid ),
+#    .data_in  ( data_in       ),
+#    .ready_out( m_axi4_bready )
+#  );
+#
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/axi4_b_sender.py b/src/soc/iommu/axi_rab/axi4_b_sender.py
new file mode 100644
index 00000000..1c61a2a5
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi4_b_sender.py
@@ -0,0 +1,207 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+from nmigen import ClockDomain, Instance
+
+
+class axi4_b_sender(Elaboratable):
+    """Sender for the AXI4 write-response (B) channel.
+
+    Master-side responses are passed to the slave side unchanged.  In
+    addition, each request flagged by drop_i is queued (when the FIFO
+    has room) in a depth-4 ``axi_buffer_rab`` FIFO and later answered
+    with a locally generated response: bresp is 0b00 for a prefetch
+    that hit and 0b10 otherwise, bid is the queued id and buser is 0.
+    m_axi4_bready is held low while such a response is presented.
+
+    The "dropping" flip-flop is clocked by axi4_aclk and asynchronously
+    reset while axi4_arstn is low.  The FIFO is a black-box ``Instance``:
+    an ``axi_buffer_rab`` implementation has to be supplied separately.
+
+    Arguments:
+    * AXI_ID_WIDTH: width of the id fields (SV default: 10)
+    * AXI_USER_WIDTH: width of the buser fields (SV default: 4)
+    """
+
+    def __init__(self, AXI_ID_WIDTH=10, AXI_USER_WIDTH=4):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.drop_i = Signal()  # input
+        self.done_o = Signal()  # output
+        self.id_i = Signal(AXI_ID_WIDTH)  # input
+        self.prefetch_i = Signal()  # input
+        self.hit_i = Signal()  # input
+        self.s_axi4_bid = Signal(AXI_ID_WIDTH)  # output
+        self.s_axi4_bresp = Signal(2)  # output
+        self.s_axi4_bvalid = Signal()  # output
+        self.s_axi4_buser = Signal(AXI_USER_WIDTH)  # output
+        self.s_axi4_bready = Signal()  # input
+        self.m_axi4_bid = Signal(AXI_ID_WIDTH)  # input
+        self.m_axi4_bresp = Signal(2)  # input
+        self.m_axi4_bvalid = Signal()  # input
+        self.m_axi4_buser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_bready = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+
+        # local domain for the axi4_aclk / active-low axi4_arstn pair
+        cd = ClockDomain("axi4", async_reset=True, local=True)
+        m.domains += cd
+        m.d.comb += cd.clk.eq(self.axi4_aclk)
+        m.d.comb += cd.rst.eq(~self.axi4_arstn)
+
+        fifo_valid = Signal()
+        fifo_pop = Signal()
+        fifo_push = Signal()
+        fifo_ready = Signal()
+        fifo_id = Signal(len(self.id_i))
+        prefetch = Signal()
+        hit = Signal()
+        dropping = Signal()  # resets to 0
+
+        # FIFO word is {prefetch, hit, id}; Cat() is LSB-first
+        fifo_in = Cat(self.id_i, self.hit_i, self.prefetch_i)
+        fifo_out = Signal(len(fifo_in))
+        m.d.comb += Cat(fifo_id, hit, prefetch).eq(fifo_out)
+        m.submodules.u_fifo = Instance(
+            "axi_buffer_rab",
+            p_DATA_WIDTH=len(fifo_in),
+            p_BUFFER_DEPTH=4,
+            i_clk=self.axi4_aclk,
+            i_rstn=self.axi4_arstn,
+            # Pop
+            o_data_out=fifo_out,
+            o_valid_out=fifo_valid,
+            i_ready_in=fifo_pop,
+            # Push
+            i_valid_in=fifo_push,
+            i_data_in=fifo_in,
+            o_ready_out=fifo_ready,
+        )
+
+        m.d.comb += fifo_push.eq(self.drop_i & fifo_ready)
+        m.d.comb += self.done_o.eq(fifo_push)
+        m.d.comb += fifo_pop.eq(dropping & self.s_axi4_bready)
+
+        # start answering once the FIFO holds an entry, stop when popped
+        with m.If(fifo_valid & ~dropping):
+            m.d.axi4 += dropping.eq(1)
+        with m.Elif(fifo_pop):
+            m.d.axi4 += dropping.eq(0)
+
+        with m.If(dropping):
+            m.d.comb += self.s_axi4_buser.eq(0)
+            m.d.comb += self.s_axi4_bid.eq(fifo_id)
+            # 0b00 only for a prefetch hit; every other drop case is 0b10
+            with m.If(prefetch & hit):
+                m.d.comb += self.s_axi4_bresp.eq(0b00)
+            with m.Else():
+                m.d.comb += self.s_axi4_bresp.eq(0b10)
+        with m.Else():
+            m.d.comb += self.s_axi4_buser.eq(self.m_axi4_buser)
+            m.d.comb += self.s_axi4_bid.eq(self.m_axi4_bid)
+            m.d.comb += self.s_axi4_bresp.eq(self.m_axi4_bresp)
+
+        m.d.comb += self.s_axi4_bvalid.eq(dropping | self.m_axi4_bvalid)
+        m.d.comb += self.m_axi4_bready.eq(~dropping & self.s_axi4_bready)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_b_sender
+#  #(
+#    parameter AXI_ID_WIDTH   = 10,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    input  logic                      drop_i,
+#    output logic                      done_o,
+#    input  logic   [AXI_ID_WIDTH-1:0] id_i,
+#    input  logic                      prefetch_i,
+#    input  logic                      hit_i,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_bid,
+#    output logic                [1:0] s_axi4_bresp,
+#    output logic                      s_axi4_bvalid,
+#    output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
+#    input  logic                      s_axi4_bready,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_bid,
+#    input  logic                [1:0] m_axi4_bresp,
+#    input  logic                      m_axi4_bvalid,
+#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
+#    output logic                      m_axi4_bready
+#  );
+#
+#  logic                    fifo_valid;
+#  logic                    fifo_pop;
+#  logic                    fifo_push;
+#  logic                    fifo_ready;
+#  logic [AXI_ID_WIDTH-1:0] id;
+#  logic                    prefetch;
+#  logic                    hit;
+#
+#  logic                    dropping;
+#
+#  axi_buffer_rab
+#    #(
+#      .DATA_WIDTH   ( 2+AXI_ID_WIDTH  ),
+#      .BUFFER_DEPTH ( 4               )
+#      )
+#    u_fifo
+#      (
+#        .clk       ( axi4_aclk                 ),
+#        .rstn      ( axi4_arstn                ),
+#        // Pop
+#        .data_out  ( {prefetch,   hit,   id}   ),
+#        .valid_out ( fifo_valid                ),
+#        .ready_in  ( fifo_pop                  ),
+#        // Push
+#        .valid_in  ( fifo_push                 ),
+#        .data_in   ( {prefetch_i, hit_i, id_i} ),
+#        .ready_out ( fifo_ready                )
+#      );
+#
+#  assign fifo_push = drop_i & fifo_ready;
+#  assign done_o    = fifo_push;
+#
+#  assign fifo_pop  = dropping & s_axi4_bready;
+#
+#  always @ (posedge axi4_aclk or negedge axi4_arstn) begin
+#    if (axi4_arstn == 1'b0) begin
+#      dropping <= 1'b0;
+#    end else begin
+#      if (fifo_valid && ~dropping)
+#        dropping <= 1'b1;
+#      else if (fifo_pop)
+#        dropping <= 1'b0;
+#    end
+#  end
+#
+#  assign s_axi4_buser  = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_buser;
+#  assign s_axi4_bid    = dropping ? id : m_axi4_bid;
+#
+#  assign s_axi4_bresp  = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, multi, prot
+#                         (dropping & prefetch      ) ? 2'b10 : // prefetch miss
+#                         (dropping            & hit) ? 2'b10 : // non-prefetch multi, prot
+#                         (dropping                 ) ? 2'b10 : // non-prefetch miss
+#                         m_axi4_bresp;
+#
+#  assign s_axi4_bvalid =  dropping | m_axi4_bvalid;
+#  assign m_axi4_bready = ~dropping & s_axi4_bready;
+#
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/axi4_r_buffer.py b/src/soc/iommu/axi_rab/axi4_r_buffer.py
new file mode 100644
index 00000000..91bdf0a5
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi4_r_buffer.py
@@ -0,0 +1,152 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable, Instance
+
+
+class axi4_r_buffer(Elaboratable):
+    """Buffer stage for the AXI4 read-data (R) channel.
+
+    The master-side R payload (rresp, rlast, rid, rdata, ruser) is packed
+    into one word, passed through a depth-4 ``axi_buffer_rab`` FIFO and
+    unpacked again on the slave side.  The valid/ready handshakes are
+    driven by the FIFO.
+
+    The FIFO is a black-box ``Instance``: an ``axi_buffer_rab``
+    implementation has to be supplied to the toolchain separately.
+
+    Arguments:
+    * AXI_DATA_WIDTH: width of the rdata fields (SV default: 32)
+    * AXI_ID_WIDTH: width of the rid fields (SV default: 4)
+    * AXI_USER_WIDTH: width of the ruser fields (SV default: 4)
+    """
+
+    def __init__(self, AXI_DATA_WIDTH=32, AXI_ID_WIDTH=4, AXI_USER_WIDTH=4):
+        # +3: rresp (2 bits) and rlast (1 bit)
+        self.data_width = AXI_DATA_WIDTH + AXI_ID_WIDTH + AXI_USER_WIDTH + 3
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.s_axi4_rid = Signal(AXI_ID_WIDTH)  # output
+        self.s_axi4_rresp = Signal(2)  # output
+        self.s_axi4_rdata = Signal(AXI_DATA_WIDTH)  # output
+        self.s_axi4_rlast = Signal()  # output
+        self.s_axi4_rvalid = Signal()  # output
+        self.s_axi4_ruser = Signal(AXI_USER_WIDTH)  # output
+        self.s_axi4_rready = Signal()  # input
+        self.m_axi4_rid = Signal(AXI_ID_WIDTH)  # input
+        self.m_axi4_rresp = Signal(2)  # input
+        self.m_axi4_rdata = Signal(AXI_DATA_WIDTH)  # input
+        self.m_axi4_rlast = Signal()  # input
+        self.m_axi4_rvalid = Signal()  # input
+        self.m_axi4_ruser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_rready = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        data_in = Signal(self.data_width)
+        data_out = Signal(self.data_width)
+
+        # Cat() is LSB-first: rresp -> [1:0], rlast -> [2], rid, rdata, ruser
+        m.d.comb += data_in.eq(Cat(
+            self.m_axi4_rresp, self.m_axi4_rlast, self.m_axi4_rid,
+            self.m_axi4_rdata, self.m_axi4_ruser))
+        m.d.comb += Cat(
+            self.s_axi4_rresp, self.s_axi4_rlast, self.s_axi4_rid,
+            self.s_axi4_rdata, self.s_axi4_ruser).eq(data_out)
+
+        # u_buffer: the FIFO between the packed master and slave words
+        m.submodules.u_buffer = Instance(
+            "axi_buffer_rab",
+            p_DATA_WIDTH=self.data_width,
+            p_BUFFER_DEPTH=4,
+            i_clk=self.axi4_aclk,
+            i_rstn=self.axi4_arstn,
+            o_valid_out=self.s_axi4_rvalid,
+            o_data_out=data_out,
+            i_ready_in=self.s_axi4_rready,
+            i_valid_in=self.m_axi4_rvalid,
+            i_data_in=data_in,
+            o_ready_out=self.m_axi4_rready,
+        )
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_r_buffer
+#  #(
+#    parameter AXI_DATA_WIDTH = 32,
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_rid,
+#    output logic                [1:0] s_axi4_rresp,
+#    output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
+#    output logic                      s_axi4_rlast,
+#    output logic                      s_axi4_rvalid,
+#    output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
+#    input  logic                      s_axi4_rready,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_rid,
+#    input  logic                [1:0] m_axi4_rresp,
+#    input  logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
+#    input  logic                      m_axi4_rlast,
+#    input  logic                      m_axi4_rvalid,
+#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
+#    output logic                      m_axi4_rready
+#  );
+#
+#  wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_in;
+#  wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_out;
+#
+#  localparam ID_START   = 3;
+#  localparam ID_END     = AXI_ID_WIDTH-1 + ID_START;
+#  localparam DATA_START = ID_END + 1;
+#  localparam DATA_END   = AXI_DATA_WIDTH-1 + DATA_START;
+#  localparam USER_START = DATA_END + 1;
+#  localparam USER_END   = AXI_USER_WIDTH-1 + USER_START;
+#
+#  assign data_in                [1:0] = m_axi4_rresp;
+#  assign data_in                  [2] = m_axi4_rlast;
+#  assign data_in    [ID_END:ID_START] = m_axi4_rid;
+#  assign data_in[DATA_END:DATA_START] = m_axi4_rdata;
+#  assign data_in[USER_END:USER_START] = m_axi4_ruser;
+#
+#  assign s_axi4_rresp  = data_out                [1:0];
+#  assign s_axi4_rlast  = data_out                  [2];
+#  assign s_axi4_rid    = data_out    [ID_END:ID_START];
+#  assign s_axi4_rdata  = data_out[DATA_END:DATA_START];
+#  assign s_axi4_ruser  = data_out[USER_END:USER_START];
+#
+#  axi_buffer_rab
+#  #(
+#    .DATA_WIDTH   ( AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3  ),
+#    .BUFFER_DEPTH ( 4                                             )
+#    )
+#  u_buffer
+#  (
+#    .clk       ( axi4_aclk     ),
+#    .rstn      ( axi4_arstn    ),
+#    // Pop
+#    .valid_out ( s_axi4_rvalid ),
+#    .data_out  ( data_out      ),
+#    .ready_in  ( s_axi4_rready ),
+#    // Push
+#    .valid_in  ( m_axi4_rvalid ),
+#    .data_in   ( data_in       ),
+#    .ready_out ( m_axi4_rready )
+#  );
+#
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/axi4_r_sender.py b/src/soc/iommu/axi_rab/axi4_r_sender.py
new file mode 100644
index 00000000..d4e22bb2
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi4_r_sender.py
@@ -0,0 +1,206 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_r_sender(Elaboratable):
+    """AXI4 R sender ports; width defaults mirror the SV parameters."""
+    def __init__(self, AXI_DATA_WIDTH=32, AXI_ID_WIDTH=4, AXI_USER_WIDTH=4):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.drop_i = Signal()  # input
+        self.drop_len_i = Signal(8)  # input
+        self.done_o = Signal()  # output
+        self.id_i = Signal(AXI_ID_WIDTH)  # input
+        self.prefetch_i = Signal()  # input
+        self.hit_i = Signal()  # input
+        self.s_axi4_rid = Signal(AXI_ID_WIDTH)  # output
+        self.s_axi4_rresp = Signal(2)  # output
+        self.s_axi4_rdata = Signal(AXI_DATA_WIDTH)  # output
+        self.s_axi4_rlast = Signal()  # output
+        self.s_axi4_rvalid = Signal()  # output
+        self.s_axi4_ruser = Signal(AXI_USER_WIDTH)  # output
+        self.s_axi4_rready = Signal()  # input
+        self.m_axi4_rid = Signal(AXI_ID_WIDTH)  # input
+        self.m_axi4_rresp = Signal(2)  # input
+        self.m_axi4_rdata = Signal(AXI_DATA_WIDTH)  # input
+        self.m_axi4_rlast = Signal()  # input
+        self.m_axi4_rvalid = Signal()  # input
+        self.m_axi4_ruser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_rready = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        # Only this pass-through is independent of the untranslated logic.
+        m.d.comb += self.s_axi4_rdata.eq(self.m_axi4_rdata)
+        # sv2nmigen emitted `self.None` (a SyntaxError) for fifo_push,
+        # s_axi4_ruser, s_axi4_rid, s_axi4_rresp, s_axi4_rvalid and
+        # m_axi4_rready, and done_o read an undeclared self.fifo_push: all
+        # of them need u_fifo and the FORWARDING/DROPPING FSM, which are
+        # not translated.  Fail loudly rather than leave outputs undriven.
+        raise NotImplementedError(
+            "axi4_r_sender: u_fifo and the drop FSM are not translated yet")
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //import CfMath::log2;
+#
+# module axi4_r_sender
+#  #(
+#    parameter AXI_DATA_WIDTH = 32,
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    input  logic                      drop_i,
+#    input  logic                [7:0] drop_len_i,
+#    output logic                      done_o,
+#    input  logic   [AXI_ID_WIDTH-1:0] id_i,
+#    input  logic                      prefetch_i,
+#    input  logic                      hit_i,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_rid,
+#    output logic                [1:0] s_axi4_rresp,
+#    output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
+#    output logic                      s_axi4_rlast,
+#    output logic                      s_axi4_rvalid,
+#    output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
+#    input  logic                      s_axi4_rready,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_rid,
+#    input  logic                [1:0] m_axi4_rresp,
+#    input  logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
+#    input  logic                      m_axi4_rlast,
+#    input  logic                      m_axi4_rvalid,
+#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
+#    output logic                      m_axi4_rready
+#  );
+#
+#  localparam BUFFER_DEPTH = 16;
+#
+#  logic                    fifo_valid;
+#  logic                    fifo_pop;
+#  logic                    fifo_push;
+#  logic                    fifo_ready;
+#  logic [AXI_ID_WIDTH-1:0] id;
+#  logic              [7:0] len;
+#  logic                    prefetch;
+#  logic                    hit;
+#
+#  logic                    dropping;
+#
+#  enum logic [1:0]  { FORWARDING, DROPPING }
+#                            state_d,                state_q;
+#  logic                     burst_ongoing_d,        burst_ongoing_q;
+#  logic [7:0]               drop_cnt_d,             drop_cnt_q;
+#
+#  axi_buffer_rab
+#    #(
+#      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8  ),
+#      .BUFFER_DEPTH     ( BUFFER_DEPTH      )
+#      )
+#    u_fifo
+#      (
+#        .clk       ( axi4_aclk                              ),
+#        .rstn      ( axi4_arstn                             ),
+#        // Pop
+#        .data_out  ( {prefetch,   hit,   id,   len}         ),
+#        .valid_out ( fifo_valid                             ),
+#        .ready_in  ( fifo_pop                               ),
+#        // Push
+#        .valid_in  ( fifo_push                              ),
+#        .data_in   ( {prefetch_i, hit_i, id_i, drop_len_i}  ),
+#        .ready_out ( fifo_ready                             )
+#      );
+#
+#  assign fifo_push = drop_i & fifo_ready;
+#  assign done_o    = fifo_push;
+#
+#  always_comb begin
+#    burst_ongoing_d = burst_ongoing_q;
+#    drop_cnt_d      = drop_cnt_q;
+#    dropping        = 1'b0;
+#    s_axi4_rlast    = 1'b0;
+#    fifo_pop        = 1'b0;
+#    state_d         = state_q;
+#
+#    case (state_q)
+#      FORWARDING: begin
+#        s_axi4_rlast = m_axi4_rlast;
+#        // Remember whether there is currently a burst ongoing.
+#        if (m_axi4_rvalid && m_axi4_rready) begin
+#          if (m_axi4_rlast) begin
+#            burst_ongoing_d = 1'b0;
+#          end else begin
+#            burst_ongoing_d = 1'b1;
+#          end
+#        end
+#        // If there is no burst ongoing and the FIFO has a drop request ready, process it.
+#        if (!burst_ongoing_d && fifo_valid) begin
+#          drop_cnt_d  = len;
+#          state_d     = DROPPING;
+#        end
+#      end
+#
+#      DROPPING: begin
+#        dropping      = 1'b1;
+#        s_axi4_rlast  = (drop_cnt_q == '0);
+#        // Handshake on slave interface
+#        if (s_axi4_rready) begin
+#          drop_cnt_d -= 1;
+#          if (drop_cnt_q == '0) begin
+#            drop_cnt_d  = '0;
+#            fifo_pop    = 1'b1;
+#            state_d     = FORWARDING;
+#          end
+#        end
+#      end
+#
+#      default: begin
+#        state_d = FORWARDING;
+#      end
+#    endcase
+#  end
+#
+#  assign s_axi4_rdata  = m_axi4_rdata;
+#
+#  assign s_axi4_ruser  = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_ruser;
+#  assign s_axi4_rid    = dropping ? id : m_axi4_rid;
+#
+#  assign s_axi4_rresp  = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, multi, prot
+#                         (dropping & prefetch      ) ? 2'b10 : // prefetch miss
+#                         (dropping            & hit) ? 2'b10 : // non-prefetch multi, prot
+#                         (dropping                 ) ? 2'b10 : // non-prefetch miss
+#                         m_axi4_rresp;
+#
+#  assign s_axi4_rvalid =  dropping | m_axi4_rvalid;
+#  assign m_axi4_rready = ~dropping & s_axi4_rready;
+#
+#  always_ff @(posedge axi4_aclk, negedge axi4_arstn) begin
+#    if (axi4_arstn == 1'b0) begin
+#      burst_ongoing_q <= 1'b0;
+#      drop_cnt_q      <=  'b0;
+#      state_q         <= FORWARDING;
+#    end else begin
+#      burst_ongoing_q <= burst_ongoing_d;
+#      drop_cnt_q      <= drop_cnt_d;
+#      state_q         <= state_d;
+#    end
+#  end
+#
+# endmodule
+#
+#
+#
+#
diff --git a/src/soc/iommu/axi_rab/axi4_w_buffer.py b/src/soc/iommu/axi_rab/axi4_w_buffer.py
new file mode 100644
index 00000000..aa06dc22
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi4_w_buffer.py
@@ -0,0 +1,777 @@
+# this file has been generated by sv2nmigen
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_w_buffer(Elaboratable):
+    """AXI4 W buffer ports; width defaults mirror the SV parameters."""
+    def __init__(self, AXI_DATA_WIDTH=32, AXI_ID_WIDTH=4, AXI_USER_WIDTH=4):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.l1_done_o = Signal()  # output
+        self.l1_accept_i = Signal()  # input
+        self.l1_save_i = Signal()  # input
+        self.l1_drop_i = Signal()  # input
+        self.l1_master_i = Signal()  # input
+        self.l1_id_i = Signal(AXI_ID_WIDTH)  # input
+        self.l1_len_i = Signal(8)  # input
+        self.l1_prefetch_i = Signal()  # input
+        self.l1_hit_i = Signal()  # input
+        self.l2_done_o = Signal()  # output
+        self.l2_accept_i = Signal()  # input
+        self.l2_drop_i = Signal()  # input
+        self.l2_master_i = Signal()  # input
+        self.l2_id_i = Signal(AXI_ID_WIDTH)  # input
+        self.l2_len_i = Signal(8)  # input
+        self.l2_prefetch_i = Signal()  # input
+        self.l2_hit_i = Signal()  # input
+        self.master_select_o = Signal()  # output
+        self.input_stall_o = Signal()  # output
+        self.output_stall_o = Signal()  # output
+        self.b_drop_o = Signal()  # output
+        self.b_done_i = Signal()  # input
+        self.id_o = Signal(AXI_ID_WIDTH)  # output
+        self.prefetch_o = Signal()  # output
+        self.hit_o = Signal()  # output
+        self.s_axi4_wdata = Signal(AXI_DATA_WIDTH)  # input
+        self.s_axi4_wvalid = Signal()  # input
+        self.s_axi4_wready = Signal()  # output
+        self.s_axi4_wstrb = Signal(AXI_DATA_WIDTH // 8)  # input, SV [AXI_DATA_WIDTH/8-1:0]
+        self.s_axi4_wlast = Signal()  # input
+        self.s_axi4_wuser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_wdata = Signal(AXI_DATA_WIDTH)  # output
+        self.m_axi4_wvalid = Signal()  # output
+        self.m_axi4_wready = Signal()  # input
+        self.m_axi4_wstrb = Signal(AXI_DATA_WIDTH // 8)  # output, SV [AXI_DATA_WIDTH/8-1:0]
+        self.m_axi4_wlast = Signal()  # output
+        self.m_axi4_wuser = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()  # empty: none of the SystemVerilog logic is translated yet
+        return m
+
+
+#
+# //import CfMath::log2;
+#
+# module axi4_w_buffer
+#  #(
+#    parameter AXI_DATA_WIDTH   = 32,
+#    parameter AXI_ID_WIDTH     = 4,
+#    parameter AXI_USER_WIDTH   = 4,
+#    parameter ENABLE_L2TLB     = 0,
+#    parameter HUM_BUFFER_DEPTH = 16
+#  )
+#  (
+#    input  logic                        axi4_aclk,
+#    input  logic                        axi4_arstn,
+#
+#    // L1 & L2 interfaces
+#    output logic                        l1_done_o,
+#    input  logic                        l1_accept_i,
+#    input  logic                        l1_save_i,
+#    input  logic                        l1_drop_i,
+#    input  logic                        l1_master_i,
+#    input  logic     [AXI_ID_WIDTH-1:0] l1_id_i,
+#    input  logic                  [7:0] l1_len_i,
+#    input  logic                        l1_prefetch_i,
+#    input  logic                        l1_hit_i,
+#
+#    output logic                        l2_done_o,
+#    input  logic                        l2_accept_i,
+#    input  logic                        l2_drop_i,
+#    input  logic                        l2_master_i,
+#    input  logic     [AXI_ID_WIDTH-1:0] l2_id_i,
+#    input  logic                  [7:0] l2_len_i,
+#    input  logic                        l2_prefetch_i,
+#    input  logic                        l2_hit_i,
+#
+#    output logic                        master_select_o,
+#    output logic                        input_stall_o,
+#    output logic                        output_stall_o,
+#
+#    // B sender interface
+#    output logic                        b_drop_o,
+#    input  logic                        b_done_i,
+#    output logic     [AXI_ID_WIDTH-1:0] id_o,
+#    output logic                        prefetch_o,
+#    output logic                        hit_o,
+#
+#    // AXI W channel interfaces
+#    input  logic   [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
+#    input  logic                        s_axi4_wvalid,
+#    output logic                        s_axi4_wready,
+#    input  logic [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
+#    input  logic                        s_axi4_wlast,
+#    input  logic   [AXI_USER_WIDTH-1:0] s_axi4_wuser,
+#
+#    output logic   [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
+#    output logic                        m_axi4_wvalid,
+#    input  logic                        m_axi4_wready,
+#    output logic [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
+#    output logic                        m_axi4_wlast,
+#    output logic   [AXI_USER_WIDTH-1:0] m_axi4_wuser
+#  );
+#
+"""
+
+  localparam BUFFER_WIDTH  = AXI_DATA_WIDTH+AXI_USER_WIDTH+AXI_DATA_WIDTH/8+1;
+
+  localparam INPUT_BUFFER_DEPTH = 4;
+  localparam L1_FIFO_DEPTH      = 8;
+  localparam L2_FIFO_DEPTH      = 4;
+
+  logic      [AXI_DATA_WIDTH-1:0] axi4_wdata;
+  logic                           axi4_wvalid;
+  logic                           axi4_wready;
+  logic    [AXI_DATA_WIDTH/8-1:0] axi4_wstrb;
+  logic                           axi4_wlast;
+  logic      [AXI_USER_WIDTH-1:0] axi4_wuser;
+
+  logic                           l1_fifo_valid_out;
+  logic                           l1_fifo_ready_in;
+  logic                           l1_fifo_valid_in;
+  logic                           l1_fifo_ready_out;
+
+  logic                           l1_req;
+  logic                           l1_accept_cur, l1_save_cur, l1_drop_cur;
+  logic                           l1_master_cur;
+  logic        [AXI_ID_WIDTH-1:0] l1_id_cur;
+  logic                     [7:0] l1_len_cur;
+  logic                           l1_hit_cur, l1_prefetch_cur;
+  logic                           l1_save_in, l1_save_out;
+  logic [log2(L1_FIFO_DEPTH)-1:0] n_l1_save_SP;
+
+  logic                           l2_fifo_valid_out;
+  logic                           l2_fifo_ready_in;
+  logic                           l2_fifo_valid_in;
+  logic                           l2_fifo_ready_out;
+
+  logic                           l2_req;
+  logic                           l2_accept_cur, l2_drop_cur;
+  logic                           l2_master_cur;
+  logic        [AXI_ID_WIDTH-1:0] l2_id_cur;
+  logic                     [7:0] l2_len_cur;
+  logic                           l2_hit_cur, l2_prefetch_cur;
+
+  logic                           fifo_select, fifo_select_SN, fifo_select_SP;
+  logic                           w_done;
+  logic                           b_drop_set;
+
+  // HUM buffer signals
+  logic                           hum_buf_ready_out;
+  logic                           hum_buf_valid_in;
+  logic                           hum_buf_ready_in;
+  logic                           hum_buf_valid_out;
+  logic                           hum_buf_underfull;
+
+  logic      [AXI_DATA_WIDTH-1:0] hum_buf_wdata;
+  logic    [AXI_DATA_WIDTH/8-1:0] hum_buf_wstrb;
+  logic                           hum_buf_wlast;
+  logic      [AXI_USER_WIDTH-1:0] hum_buf_wuser;
+
+  logic                           hum_buf_drop_req_SN, hum_buf_drop_req_SP;
+  logic                     [7:0] hum_buf_drop_len_SN, hum_buf_drop_len_SP;
+  logic                           hum_buf_almost_full;
+
+  logic                           stop_store;
+  logic                           wlast_in, wlast_out;
+  logic signed              [3:0] n_wlast_SN,          n_wlast_SP;
+  logic                           block_forwarding;
+
+  // Search FSM
+  typedef enum logic        [3:0] {STORE,                       BYPASS,
+                                   WAIT_L1_BYPASS_YES,          WAIT_L2_BYPASS_YES,
+                                   WAIT_L1_BYPASS_NO,           WAIT_L2_BYPASS_NO,
+                                   FLUSH,                       DISCARD,
+                                   DISCARD_FINISH}
+                                  hum_buf_state_t;
+  hum_buf_state_t                 hum_buf_SP; // Present state
+  hum_buf_state_t                 hum_buf_SN; // Next state
+
+  axi_buffer_rab
+    #(
+      .DATA_WIDTH       ( BUFFER_WIDTH        ),
+      .BUFFER_DEPTH     ( INPUT_BUFFER_DEPTH  )
+      )
+    u_input_buf
+    (
+      .clk       ( axi4_aclk                                                ),
+      .rstn      ( axi4_arstn                                               ),
+      // Push
+      .data_in   ( {s_axi4_wuser, s_axi4_wstrb, s_axi4_wdata, s_axi4_wlast} ),
+      .valid_in  ( s_axi4_wvalid                                            ),
+      .ready_out ( s_axi4_wready                                            ),
+      // Pop
+      .data_out  ( {axi4_wuser,   axi4_wstrb,   axi4_wdata,   axi4_wlast}   ),
+      .valid_out ( axi4_wvalid                                              ),
+      .ready_in  ( axi4_wready                                              )
+    );
+
+  axi_buffer_rab
+    #(
+      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8+4  ),
+      .BUFFER_DEPTH     ( L1_FIFO_DEPTH       )
+      )
+    u_l1_fifo
+    (
+      .clk       ( axi4_aclk                                                                                                    ),
+      .rstn      ( axi4_arstn                                                                                                   ),
+      // Push
+      .data_in   ( {l1_prefetch_i,   l1_hit_i,   l1_id_i,   l1_len_i,   l1_master_i,   l1_accept_i,   l1_save_i,   l1_drop_i}   ),
+      .valid_in  ( l1_fifo_valid_in                                                                                             ),
+      .ready_out ( l1_fifo_ready_out                                                                                            ),
+      // Pop
+      .data_out  ( {l1_prefetch_cur, l1_hit_cur, l1_id_cur, l1_len_cur, l1_master_cur, l1_accept_cur, l1_save_cur, l1_drop_cur} ),
+      .valid_out ( l1_fifo_valid_out                                                                                            ),
+      .ready_in  ( l1_fifo_ready_in                                                                                             )
+    );
+
+    // Push upon receiving new requests from the TLB.
+    assign l1_req           = l1_accept_i | l1_save_i | l1_drop_i;
+    assign l1_fifo_valid_in = l1_req & l1_fifo_ready_out;
+
+    // Signal handshake
+    assign l1_done_o  = l1_fifo_valid_in;
+    assign l2_done_o  = l2_fifo_valid_in;
+
+    // Stall AW input of L1 TLB
+    assign input_stall_o = ~(l1_fifo_ready_out & l2_fifo_ready_out);
+
+    // Interface b_drop signals + handshake
+    always_comb begin
+      if (fifo_select == 1'b0) begin
+        prefetch_o       = l1_prefetch_cur;
+        hit_o            = l1_hit_cur;
+        id_o             = l1_id_cur;
+
+        l1_fifo_ready_in = w_done | b_done_i;
+        l2_fifo_ready_in = 1'b0;
+      end else begin
+        prefetch_o       = l2_prefetch_cur;
+        hit_o            = l2_hit_cur;
+        id_o             = l2_id_cur;
+
+        l1_fifo_ready_in = 1'b0;
+        l2_fifo_ready_in = w_done | b_done_i;
+      end
+    end
+
+    // Detect when an L1 transaction save request enters or exits the L1 FIFO.
+    assign l1_save_in  = l1_fifo_valid_in & l1_save_i;
+    assign l1_save_out = l1_fifo_ready_in & l1_save_cur;
+
+    // Count the number of L1 transaction to save in the L1 FIFO.
+    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
+      if (axi4_arstn == 0) begin
+        n_l1_save_SP <= '0;
+      end else if (l1_save_in ^ l1_save_out) begin
+        if (l1_save_in) begin
+          n_l1_save_SP <= n_l1_save_SP + 1'b1;
+        end else if (l1_save_out) begin
+          n_l1_save_SP <= n_l1_save_SP - 1'b1;
+        end
+      end
+    end
+
+    // Stall forwarding of AW L1 hits if:
+    // 1. The HUM buffer does not allow to be bypassed.
+    // 2. There are multiple L1 save requests in the FIFO, i.e., multiple L2 outputs pending.
+    assign output_stall_o = (n_l1_save_SP > 1) || (block_forwarding == 1'b1);
+
+  generate
+  if (ENABLE_L2TLB == 1) begin : HUM_BUFFER
+
+    axi_buffer_rab_bram
+    #(
+      .DATA_WIDTH       ( BUFFER_WIDTH      ),
+      .BUFFER_DEPTH     ( HUM_BUFFER_DEPTH  )
+      )
+    u_hum_buf
+    (
+      .clk           ( axi4_aclk                                                    ),
+      .rstn          ( axi4_arstn                                                   ),
+      // Push
+      .data_in       ( {axi4_wuser,    axi4_wstrb,    axi4_wdata,    axi4_wlast}    ),
+      .valid_in      ( hum_buf_valid_in                                             ),
+      .ready_out     ( hum_buf_ready_out                                            ),
+      // Pop
+      .data_out      ( {hum_buf_wuser, hum_buf_wstrb, hum_buf_wdata, hum_buf_wlast} ),
+      .valid_out     ( hum_buf_valid_out                                            ),
+      .ready_in      ( hum_buf_ready_in                                             ),
+      // Clear
+      .almost_full   ( hum_buf_almost_full                                          ),
+      .underfull     ( hum_buf_underfull                                            ),
+      .drop_req      ( hum_buf_drop_req_SP                                          ),
+      .drop_len      ( hum_buf_drop_len_SP                                          )
+    );
+
+    axi_buffer_rab
+    #(
+      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8+3  ),
+      .BUFFER_DEPTH     ( L2_FIFO_DEPTH       )
+      )
+    u_l2_fifo
+    (
+      .clk       ( axi4_aclk                                                                                        ),
+      .rstn      ( axi4_arstn                                                                                       ),
+      // Push
+      .data_in   ( {l2_prefetch_i,   l2_hit_i,   l2_id_i,   l2_len_i,   l2_master_i,   l2_accept_i,   l2_drop_i}    ),
+      .valid_in  ( l2_fifo_valid_in                                                                                 ),
+      .ready_out ( l2_fifo_ready_out                                                                                ),
+      // Pop
+      .data_out  ( {l2_prefetch_cur, l2_hit_cur, l2_id_cur, l2_len_cur, l2_master_cur, l2_accept_cur, l2_drop_cur}  ),
+      .valid_out ( l2_fifo_valid_out                                                                                ),
+      .ready_in  ( l2_fifo_ready_in                                                                                 )
+    );
+
+    // Push upon receiving new result from TLB.
+    assign l2_req           = l2_accept_i | l2_drop_i;
+    assign l2_fifo_valid_in = l2_req & l2_fifo_ready_out;
+
+    assign wlast_in  =    axi4_wlast & hum_buf_valid_in  & hum_buf_ready_out;
+    assign wlast_out = hum_buf_wlast & hum_buf_valid_out & hum_buf_ready_in;
+
+    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
+      if (axi4_arstn == 0) begin
+        fifo_select_SP      <= 1'b0;
+        hum_buf_drop_len_SP <=  'b0;
+        hum_buf_drop_req_SP <= 1'b0;
+        hum_buf_SP          <= STORE;
+        n_wlast_SP          <=  'b0;
+      end else begin
+        fifo_select_SP      <= fifo_select_SN;
+        hum_buf_drop_len_SP <= hum_buf_drop_len_SN;
+        hum_buf_drop_req_SP <= hum_buf_drop_req_SN;
+        hum_buf_SP          <= hum_buf_SN;
+        n_wlast_SP          <= n_wlast_SN;
+      end
+    end
+
+    always_comb begin
+      n_wlast_SN = n_wlast_SP;
+      if (hum_buf_drop_req_SP) begin  // Happens exactly once per burst to be dropped.
+        n_wlast_SN -= 1;
+      end
+      if (wlast_in) begin
+        n_wlast_SN += 1;
+      end
+      if (wlast_out) begin
+        n_wlast_SN -= 1;
+      end
+    end
+
+    always_comb begin : HUM_BUFFER_FSM
+      hum_buf_SN       = hum_buf_SP;
+
+      m_axi4_wlast     = 1'b0;
+      m_axi4_wdata     =  'b0;
+      m_axi4_wstrb     =  'b0;
+      m_axi4_wuser     =  'b0;
+
+      m_axi4_wvalid    = 1'b0;
+      axi4_wready      = 1'b0;
+
+      hum_buf_valid_in = 1'b0;
+      hum_buf_ready_in = 1'b0;
+
+      hum_buf_drop_req_SN = hum_buf_drop_req_SP;
+      hum_buf_drop_len_SN = hum_buf_drop_len_SP;
+      master_select_o  = 1'b0;
+
+      w_done           = 1'b0; // read from FIFO without handshake with B sender
+      b_drop_o         = 1'b0; // send data from FIFO to B sender (with handshake)
+      fifo_select      = 1'b0;
+
+      fifo_select_SN   = fifo_select_SP;
+      stop_store       = 1'b0;
+
+      block_forwarding = 1'b0;
+
+      unique case (hum_buf_SP)
+
+        STORE : begin
+          // Simply store the data in the buffer.
+          hum_buf_valid_in = axi4_wvalid & hum_buf_ready_out;
+          axi4_wready      = hum_buf_ready_out;
+
+          // We have got a full burst in the HUM buffer, thus stop storing.
+          if (wlast_in & !hum_buf_underfull | (n_wlast_SP > $signed(0))) begin
+            hum_buf_SN = WAIT_L1_BYPASS_YES;
+
+          // The buffer is full, thus wait for decision.
+          end else if (~hum_buf_ready_out) begin
+            hum_buf_SN = WAIT_L1_BYPASS_NO;
+          end
+
+          // Avoid the forwarding of L1 hits until we know whether we can bypass.
+          if (l1_fifo_valid_out & l1_save_cur) begin
+            block_forwarding = 1'b1;
+          end
+        end
+
+        WAIT_L1_BYPASS_YES : begin
+          // Wait for orders from L1 TLB.
+          if (l1_fifo_valid_out) begin
+
+            // L1 hit - forward data from buffer
+            if (l1_accept_cur) begin
+              m_axi4_wlast       = hum_buf_wlast;
+              m_axi4_wdata       = hum_buf_wdata;
+              m_axi4_wstrb       = hum_buf_wstrb;
+              m_axi4_wuser       = hum_buf_wuser;
+
+              m_axi4_wvalid      = hum_buf_valid_out;
+              hum_buf_ready_in   = m_axi4_wready;
+
+              master_select_o    = l1_master_cur;
+
+              // Detect last data beat.
+              if (wlast_out) begin
+                fifo_select      = 1'b0;
+                w_done           = 1'b1;
+                hum_buf_SN       = STORE;
+              end
+
+            // L1 miss - wait for L2
+            end else if (l1_save_cur) begin
+              fifo_select        = 1'b0;
+              w_done             = 1'b1;
+              hum_buf_SN         = WAIT_L2_BYPASS_YES;
+
+            // L1 prefetch, prot, multi - drop data
+            end else if (l1_drop_cur) begin
+              fifo_select_SN      = 1'b0; // L1
+              hum_buf_drop_req_SN = 1'b1;
+              hum_buf_drop_len_SN = l1_len_cur;
+              hum_buf_SN          = FLUSH;
+            end
+          end
+        end
+
+        WAIT_L2_BYPASS_YES : begin
+          // Wait for orders from L2 TLB.
+          if (l2_fifo_valid_out) begin
+
+            // L2 hit - forward data from buffer
+            if (l2_accept_cur) begin
+              m_axi4_wlast       = hum_buf_wlast;
+              m_axi4_wdata       = hum_buf_wdata;
+              m_axi4_wstrb       = hum_buf_wstrb;
+              m_axi4_wuser       = hum_buf_wuser;
+
+              m_axi4_wvalid      = hum_buf_valid_out;
+              hum_buf_ready_in   = m_axi4_wready;
+
+              master_select_o    = l2_master_cur;
+
+              // Detect last data beat.
+              if (wlast_out) begin
+                fifo_select      = 1'b1;
+                w_done           = 1'b1;
+                hum_buf_SN       = STORE;
+              end
+
+            // L2 miss/prefetch hit
+            end else if (l2_drop_cur) begin
+              fifo_select_SN      = 1'b1; // L2
+              hum_buf_drop_req_SN = 1'b1;
+              hum_buf_drop_len_SN = l2_len_cur;
+              hum_buf_SN          = FLUSH;
+            end
+
+          // While we wait for orders from L2 TLB, we can still drop and accept L1 transactions.
+          end else if (l1_fifo_valid_out) begin
+
+            // L1 hit
+            if (l1_accept_cur) begin
+              hum_buf_SN         = BYPASS;
+
+            // L1 prefetch/prot/multi
+            end else if (l1_drop_cur) begin
+              hum_buf_SN         = DISCARD;
+            end
+          end
+        end
+
+        FLUSH : begin
+          // Clear HUM buffer flush request.
+          hum_buf_drop_req_SN = 1'b0;
+
+          // perform handshake with B sender
+          fifo_select      = fifo_select_SP;
+          b_drop_o         = 1'b1;
+          if (b_done_i) begin
+            hum_buf_SN     = STORE;
+          end
+        end
+
+        BYPASS : begin
+          // Forward one full transaction from input buffer.
+          m_axi4_wlast       = axi4_wlast;
+          m_axi4_wdata       = axi4_wdata;
+          m_axi4_wstrb       = axi4_wstrb;
+          m_axi4_wuser       = axi4_wuser;
+
+          m_axi4_wvalid      = axi4_wvalid;
+          axi4_wready        = m_axi4_wready;
+
+          master_select_o    = l1_master_cur;
+
+          // We have got a full transaction.
+          if (axi4_wlast & axi4_wready & axi4_wvalid) begin
+            fifo_select      = 1'b0;
+            w_done           = 1'b1;
+            hum_buf_SN       = WAIT_L2_BYPASS_YES;
+          end
+        end
+
+        DISCARD : begin
+          // Discard one full transaction from input buffer.
+          axi4_wready        = 1'b1;
+
+          // We have got a full transaction.
+          if (axi4_wlast & axi4_wready & axi4_wvalid) begin
+            // Try to perform handshake with B sender.
+            fifo_select      = 1'b0;
+            b_drop_o         = 1'b1;
+            // We cannot wait here due to axi4_wready.
+            if (b_done_i) begin
+              hum_buf_SN     = WAIT_L2_BYPASS_YES;
+            end else begin
+              hum_buf_SN     = DISCARD_FINISH;
+            end
+          end
+        end
+
+        DISCARD_FINISH : begin
+          // Perform handshake with B sender.
+          fifo_select      = 1'b0;
+          b_drop_o         = 1'b1;
+          if (b_done_i) begin
+            hum_buf_SN     = WAIT_L2_BYPASS_YES;
+          end
+        end
+
+        WAIT_L1_BYPASS_NO : begin
+          // Do not allow the forwarding of L1 hits.
+          block_forwarding       = 1'b1;
+
+          // Wait for orders from L1 TLB.
+          if (l1_fifo_valid_out) begin
+
+            // L1 hit - forward data from/through HUM buffer and refill the buffer
+            if (l1_accept_cur) begin
+              // Forward data from HUM buffer.
+              m_axi4_wlast       = hum_buf_wlast;
+              m_axi4_wdata       = hum_buf_wdata;
+              m_axi4_wstrb       = hum_buf_wstrb;
+              m_axi4_wuser       = hum_buf_wuser;
+
+              m_axi4_wvalid      = hum_buf_valid_out;
+              hum_buf_ready_in   = m_axi4_wready;
+
+              master_select_o    = l1_master_cur;
+
+              // Refill the HUM buffer. Stop when buffer full.
+              stop_store         = ~hum_buf_ready_out;
+              hum_buf_valid_in   = stop_store ? 1'b0 : axi4_wvalid      ;
+              axi4_wready        = stop_store ? 1'b0 : hum_buf_ready_out;
+
+              // Detect last data beat.
+              if (wlast_out) begin
+                fifo_select      = 1'b0;
+                w_done           = 1'b1;
+                if (~hum_buf_ready_out | hum_buf_almost_full) begin
+                  hum_buf_SN     = WAIT_L1_BYPASS_NO;
+                end else begin
+                  hum_buf_SN     = STORE;
+                end
+              end
+
+              // Allow the forwarding of L1 hits.
+              block_forwarding   = 1'b0;
+
+            // L1 miss - wait for L2
+            end else if (l1_save_cur) begin
+              fifo_select        = 1'b0;
+              w_done             = 1'b1;
+              hum_buf_SN         = WAIT_L2_BYPASS_NO;
+
+            // L1 prefetch, prot, multi - drop data
+            end else if (l1_drop_cur) begin
+              fifo_select_SN      = 1'b0; // L1
+              hum_buf_drop_req_SN = 1'b1;
+              hum_buf_drop_len_SN = l1_len_cur;
+              hum_buf_SN          = FLUSH;
+
+              // Allow the forwarding of L1 hits.
+              block_forwarding   = 1'b0;
+            end
+          end
+        end
+
+        WAIT_L2_BYPASS_NO : begin
+          // Do not allow the forwarding of L1 hits.
+          block_forwarding       = 1'b1;
+
+          // Wait for orders from L2 TLB.
+          if (l2_fifo_valid_out) begin
+
+            // L2 hit - forward first part from HUM buffer, rest from input buffer
+            if (l2_accept_cur) begin
+              // Forward data from HUM buffer.
+              m_axi4_wlast       = hum_buf_wlast;
+              m_axi4_wdata       = hum_buf_wdata;
+              m_axi4_wstrb       = hum_buf_wstrb;
+              m_axi4_wuser       = hum_buf_wuser;
+
+              m_axi4_wvalid      = hum_buf_valid_out;
+              hum_buf_ready_in   = m_axi4_wready;
+
+              master_select_o    = l2_master_cur;
+
+              // Refill the HUM buffer. Stop when buffer full.
+              stop_store         = ~hum_buf_ready_out;
+              hum_buf_valid_in   = stop_store ? 1'b0 : axi4_wvalid      ;
+              axi4_wready        = stop_store ? 1'b0 : hum_buf_ready_out;
+
+              // Detect last data beat.
+              if (wlast_out) begin
+                fifo_select      = 1'b1;
+                w_done           = 1'b1;
+                if (~hum_buf_ready_out | hum_buf_almost_full) begin
+                  hum_buf_SN     = WAIT_L1_BYPASS_NO;
+                end else begin
+                  hum_buf_SN     = STORE;
+                end
+              end
+
+              // Allow the forwarding of L1 hits.
+              block_forwarding   = 1'b0;
+
+            // L2 miss/prefetch hit - drop data
+            end else if (l2_drop_cur) begin
+              fifo_select_SN      = 1'b1; // L2
+              hum_buf_drop_req_SN = 1'b1;
+              hum_buf_drop_len_SN = l2_len_cur;
+              hum_buf_SN          = FLUSH;
+
+              // Allow the forwarding of L1 hits.
+              block_forwarding   = 1'b0;
+            end
+          end
+        end
+
+
+        default: begin
+          hum_buf_SN = STORE;
+        end
+
+      endcase // hum_buf_SP
+    end // HUM_BUFFER_FSM
+
+    assign b_drop_set = 1'b0;
+
+  end else begin // HUM_BUFFER
+
+    // register to perform the handshake with B sender
+    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
+      if (axi4_arstn == 0) begin
+        b_drop_o <= 1'b0;
+      end else if (b_done_i) begin
+        b_drop_o <= 1'b0;
+      end else if (b_drop_set) begin
+        b_drop_o <= 1'b1;;
+      end
+    end
+
+    always_comb begin : OUTPUT_CTRL
+
+      fifo_select   = 1'b0;
+      w_done        = 1'b0;
+      b_drop_set    = 1'b0;
+
+      m_axi4_wlast  = 1'b0;
+      m_axi4_wdata  =  'b0;
+      m_axi4_wstrb  =  'b0;
+      m_axi4_wuser  =  'b0;
+
+      m_axi4_wvalid = 1'b0;
+      axi4_wready   = 1'b0;
+
+      if (l1_fifo_valid_out) begin
+        // forward data
+        if (l1_accept_cur) begin
+          m_axi4_wlast  = axi4_wlast;
+          m_axi4_wdata  = axi4_wdata;
+          m_axi4_wstrb  = axi4_wstrb;
+          m_axi4_wuser  = axi4_wuser;
+
+          m_axi4_wvalid = axi4_wvalid;
+          axi4_wready   = m_axi4_wready;
+
+          // Simply pop from FIFO upon last data beat.
+          w_done        = axi4_wlast & axi4_wvalid & axi4_wready;
+
+        // discard entire burst
+        end else if (b_drop_o == 1'b0) begin
+          axi4_wready   = 1'b1;
+
+          // Simply pop from FIFO upon last data beat. Perform handshake with B sender.
+          if (axi4_wlast & axi4_wvalid & axi4_wready)
+            b_drop_set  = 1'b1;
+        end
+      end
+
+    end // OUTPUT_CTRL
+
+    assign master_select_o     = l1_master_cur;
+    assign l2_fifo_ready_out   = 1'b1;
+    assign block_forwarding    = 1'b0;
+
+    // unused signals
+    assign hum_buf_ready_out   = 1'b0;
+    assign hum_buf_valid_in    = 1'b0;
+    assign hum_buf_ready_in    = 1'b0;
+    assign hum_buf_valid_out   = 1'b0;
+    assign hum_buf_wdata       =  'b0;
+    assign hum_buf_wstrb       =  'b0;
+    assign hum_buf_wlast       = 1'b0;
+    assign hum_buf_wuser       =  'b0;
+    assign hum_buf_drop_len_SN =  'b0;
+    assign hum_buf_drop_req_SN = 1'b0;
+    assign hum_buf_almost_full = 1'b0;
+
+    assign l2_fifo_valid_in    = 1'b0;
+    assign l2_fifo_valid_out   = 1'b0;
+    assign l2_prefetch_cur     = 1'b0;
+    assign l2_hit_cur          = 1'b0;
+    assign l2_id_cur           =  'b0;
+    assign l2_len_cur          =  'b0;
+    assign l2_master_cur       = 1'b0;
+    assign l2_accept_cur       = 1'b0;
+    assign l2_drop_cur         = 1'b0;
+
+    assign l2_req              = 1'b0;
+
+    assign fifo_select_SN      = 1'b0;
+    assign fifo_select_SP      = 1'b0;
+
+    assign stop_store          = 1'b0;
+    assign n_wlast_SP          =  'b0;
+    assign wlast_in            = 1'b0;
+    assign wlast_out           = 1'b0;
+
+  end // HUM_BUFFER
+
+  endgenerate
+"""
diff --git a/src/soc/iommu/axi_rab/axi4_w_sender.py b/src/soc/iommu/axi_rab/axi4_w_sender.py
new file mode 100644
index 00000000..9916334f
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi4_w_sender.py
@@ -0,0 +1,78 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_w_sender(Elaboratable):
+
+    def __init__(self):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.s_axi4_wdata = Signal()  # input
+        self.s_axi4_wvalid = Signal()  # input
+        self.s_axi4_wready = Signal()  # output
+        self.s_axi4_wstrb = Signal()  # input
+        self.s_axi4_wlast = Signal()  # input
+        self.s_axi4_wuser = Signal()  # input
+        self.m_axi4_wdata = Signal()  # output
+        self.m_axi4_wvalid = Signal()  # output
+        self.m_axi4_wready = Signal()  # input
+        self.m_axi4_wstrb = Signal()  # output
+        self.m_axi4_wlast = Signal()  # output
+        self.m_axi4_wuser = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.d.comb += self.m_axi4_wdata.eq(self.s_axi4_wdata)
+        m.d.comb += self.m_axi4_wstrb.eq(self.s_axi4_wstrb)
+        m.d.comb += self.m_axi4_wlast.eq(self.s_axi4_wlast)
+        m.d.comb += self.m_axi4_wuser.eq(self.s_axi4_wuser)
+        m.d.comb += self.m_axi4_wvalid.eq(self.s_axi4_wvalid)
+        m.d.comb += self.s_axi4_wready.eq(self.m_axi4_wready)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_w_sender
+#  #(
+#    parameter AXI_DATA_WIDTH = 32,
+#    parameter AXI_USER_WIDTH = 2
+#  )
+#  (
+#    input                         axi4_aclk,
+#    input                         axi4_arstn,
+#
+#    input    [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
+#    input                         s_axi4_wvalid,
+#    output                        s_axi4_wready,
+#    input  [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
+#    input                         s_axi4_wlast,
+#    input    [AXI_USER_WIDTH-1:0] s_axi4_wuser,
+#
+#    output   [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
+#    output                        m_axi4_wvalid,
+#    input                         m_axi4_wready,
+#    output [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
+#    output                        m_axi4_wlast,
+#    output   [AXI_USER_WIDTH-1:0] m_axi4_wuser
+#  );
+#
+#  assign m_axi4_wdata  = s_axi4_wdata;
+#  assign m_axi4_wstrb  = s_axi4_wstrb;
+#  assign m_axi4_wlast  = s_axi4_wlast;
+#  assign m_axi4_wuser  = s_axi4_wuser;
+#
+#  assign m_axi4_wvalid = s_axi4_wvalid;
+#  assign s_axi4_wready = m_axi4_wready;
+#
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/axi_buffer_rab.py b/src/soc/iommu/axi_rab/axi_buffer_rab.py
new file mode 100644
index 00000000..b4d99299
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi_buffer_rab.py
@@ -0,0 +1,151 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_buffer_rab(Elaboratable):
+
+    def __init__(self):
+        self.clk = Signal()  # input
+        self.rstn = Signal()  # input
+        self.data_out = Signal(DATA_WIDTH)  # output
+        self.valid_out = Signal()  # output
+        self.ready_in = Signal()  # input
+        self.valid_in = Signal()  # input
+        self.data_in = Signal(DATA_WIDTH)  # input
+        self.ready_out = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.d.comb += self.full.eq(self.None)
+        m.d.comb += self.data_out.eq(self.None)
+        m.d.comb += self.valid_out.eq(self.None)
+        m.d.comb += self.ready_out.eq(self.None)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //import CfMath::log2;
+#
+# module axi_buffer_rab
+#  //#(
+#  //  parameter DATA_WIDTH,
+#  //  parameter BUFFER_DEPTH
+#  //)
+#  (
+#    input logic                   clk,
+#    input logic                   rstn,
+#
+#    // Downstream port
+#    output logic [DATA_WIDTH-1:0] data_out,
+#    output logic                  valid_out,
+#    input  logic                  ready_in,
+#
+#    // Upstream port
+#    input  logic                  valid_in,
+#    input  logic [DATA_WIDTH-1:0] data_in,
+#    output logic                  ready_out
+#  );
+#
+#  localparam integer LOG_BUFFER_DEPTH = log2(BUFFER_DEPTH);
+#
+#    // Internal data structures
+#    reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_in;   // location to which we last wrote
+#    reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_out;  // location from which we last sent
+#    reg     [LOG_BUFFER_DEPTH : 0] elements;     // number of elements in the buffer
+#    reg       [DATA_WIDTH - 1 : 0] buffer [BUFFER_DEPTH - 1 : 0];
+#
+#    wire full;
+#
+#    integer loop1;
+#
+#    assign full = (elements == BUFFER_DEPTH);
+#
+#    always @(posedge clk or negedge rstn)
+#      begin: elements_sequential
+#        if (rstn == 1'b0)
+#          elements <= 0;
+#        else
+#        begin
+#          // ------------------
+#          // Are we filling up?
+#          // ------------------
+#          // One out, none in
+#          if (ready_in && valid_out && (!valid_in || full))
+#            elements <= elements - 1;
+#          // None out, one in
+#          else if ((!valid_out || !ready_in) && valid_in && !full)
+#            elements <= elements + 1;
+#          // Else, either one out and one in, or none out and none in - stays unchanged
+#        end
+#      end
+#
+#    always @(posedge clk or negedge rstn)
+#      begin: buffers_sequential
+#        if (rstn == 1'b0)
+#        begin
+#          for (loop1 = 0 ; loop1 < BUFFER_DEPTH ; loop1 = loop1 + 1)
+#            buffer[loop1] <= 0;
+#        end
+#        else
+#        begin
+#          // Update the memory
+#          if (valid_in && !full)
+#            buffer[pointer_in] <= data_in;
+#        end
+#      end
+#
+#    always @(posedge clk or negedge rstn)
+#      begin: sequential
+#        if (rstn == 1'b0)
+#        begin
+#          pointer_out <= 0;
+#          pointer_in <= 0;
+#        end
+#        else
+#        begin
+#          // ------------------------------------
+#          // Check what to do with the input side
+#          // ------------------------------------
+#          // We have some input, increase by 1 the input pointer
+#          if (valid_in && !full)
+#          begin
+#            if (pointer_in == $unsigned(BUFFER_DEPTH - 1))
+#              pointer_in <= 0;
+#            else
+#              pointer_in <= pointer_in + 1;
+#          end
+#          // Else we don't have any input, the input pointer stays the same
+#
+#          // -------------------------------------
+#          // Check what to do with the output side
+#          // -------------------------------------
+#          // We had pushed one flit out, we can try to go for the next one
+#          if (ready_in && valid_out)
+#          begin
+#            if (pointer_out == $unsigned(BUFFER_DEPTH - 1))
+#              pointer_out <= 0;
+#            else
+#              pointer_out <= pointer_out + 1;
+#          end
+#          // Else stay on the same output location
+#        end
+#      end
+#
+#    // Update output ports
+#    assign data_out = buffer[pointer_out];
+#    assign valid_out = (elements != 0);
+#
+#    assign ready_out = ~full;
+#
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/axi_buffer_rab_bram.py b/src/soc/iommu/axi_rab/axi_buffer_rab_bram.py
new file mode 100644
index 00000000..349b314e
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi_buffer_rab_bram.py
@@ -0,0 +1,209 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_buffer_rab_bram(Elaboratable):
+
+    def __init__(self):
+        self.clk = Signal()  # input
+        self.rstn = Signal()  # input
+        self.data_out = Signal(DATA_WIDTH)  # output
+        self.valid_out = Signal()  # output
+        self.ready_in = Signal()  # input
+        self.valid_in = Signal()  # input
+        self.data_in = Signal(DATA_WIDTH)  # input
+        self.ready_out = Signal()  # output
+        self.almost_full = Signal()  # output
+        self.underfull = Signal()  # output
+        self.drop_req = Signal()  # input
+        self.drop_len = Signal(8)  # input
+
+    def elaborate(self, platform=None):
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# ////import CfMath::log2;
+#
+# module axi_buffer_rab_bram
+#  //#(
+#  //  parameter DATA_WIDTH,
+#  //  parameter BUFFER_DEPTH
+#  //  )
+#   (
+#    input logic                   clk,
+#    input logic                   rstn,
+#
+#    // Downstream port
+#    output logic [DATA_WIDTH-1:0] data_out,
+#    output logic                  valid_out,
+#    input  logic                  ready_in,
+#
+#    // Upstream port
+#    input  logic                  valid_in,
+#    input  logic [DATA_WIDTH-1:0] data_in,
+#    output logic                  ready_out,
+#
+#    // Status and drop control
+#    output logic                  almost_full,
+#    output logic                  underfull,
+#    input  logic                  drop_req,
+#    // Number of items to drop.  As for AXI lengths, counting starts at zero, i.e., `drop_len == 0`
+#    // and `drop_req` means drop one item.
+#    input  logic [7:0]            drop_len
+#    );
+#
+"""  #docstring_begin
+  // The BRAM needs to be in "write-first" mode for first-word fall-through FIFO behavior.
+  // To still push and pop simultaneously if the buffer is full, we internally increase the
+  // buffer depth by 1.
+  localparam ACT_BUFFER_DEPTH     = BUFFER_DEPTH+1;
+  localparam ACT_LOG_BUFFER_DEPTH = log2(ACT_BUFFER_DEPTH+1);
+
+  /**
+    * Internal data structures
+    */
+  // Location to which we last wrote
+  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_in_d,                   ptr_in_q;
+  // Location from which we last sent
+  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_d,                  ptr_out_q;
+  // Required for fall-through behavior on the first word
+  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_bram;
+  // Number of elements in the buffer.  Can be negative if elements that have been dropped have not
+  // yet been written.
+  logic signed   [ACT_LOG_BUFFER_DEPTH:0] n_elems_d,                  n_elems_q;
+
+  logic           [DATA_WIDTH-1:0]        data_out_bram, data_out_q;
+  logic                                   valid_out_q;
+
+  logic full;
+
+  assign almost_full = (n_elems_q == BUFFER_DEPTH-1);
+  assign full        = (n_elems_q == BUFFER_DEPTH);
+
+  always_ff @(posedge clk, negedge rstn) begin
+    if (~rstn) begin
+      n_elems_q <= '0;
+      ptr_in_q  <= '0;
+      ptr_out_q <= '0;
+    end else begin
+      n_elems_q <= n_elems_d;
+      ptr_in_q  <= ptr_in_d;
+      ptr_out_q <= ptr_out_d;
+    end
+  end
+
+  // Update the number of elements.
+  always_comb begin
+    n_elems_d = n_elems_q;
+    if (drop_req) begin
+      n_elems_d -= (drop_len + 1);
+    end
+    if (valid_in && ready_out) begin
+      n_elems_d += 1;
+    end
+    if (valid_out && ready_in) begin
+      n_elems_d -= 1;
+    end
+  end
+
+  // Update the output pointer.
+  always_comb begin
+    ptr_out_d = ptr_out_q;
+    if (drop_req) begin
+      if ((ptr_out_q + drop_len + 1) > (ACT_BUFFER_DEPTH - 1)) begin
+        ptr_out_d = drop_len + 1 - (ACT_BUFFER_DEPTH - ptr_out_q);
+      end else begin
+        ptr_out_d += (drop_len + 1);
+      end
+    end
+    if (valid_out && ready_in) begin
+      if (ptr_out_d == (ACT_BUFFER_DEPTH - 1)) begin
+        ptr_out_d = '0;
+      end else begin
+        ptr_out_d += 1;
+      end
+    end
+  end
+
+  // The BRAM has a read latency of one cycle, so apply the new address one cycle earlier for
+  // first-word fall-through FIFO behavior.
+  //assign ptr_out_bram = (ptr_out_q == (ACT_BUFFER_DEPTH-1)) ? '0 : (ptr_out_q + 1);
+  assign ptr_out_bram = ptr_out_d;
+
+  // Update the input pointer.
+  always_comb begin
+    ptr_in_d = ptr_in_q;
+    if (valid_in && ready_out) begin
+      if (ptr_in_d == (ACT_BUFFER_DEPTH - 1)) begin
+        ptr_in_d = '0;
+      end else begin
+        ptr_in_d += 1;
+      end
+    end
+  end
+
+  // Update output ports.
+  assign valid_out = (n_elems_q > $signed(0));
+  assign underfull = (n_elems_q < $signed(0));
+  assign ready_out = ~full;
+
+  ram_tp_write_first #(
+    .ADDR_WIDTH ( ACT_LOG_BUFFER_DEPTH ),
+    .DATA_WIDTH ( DATA_WIDTH           )
+  )
+  ram_tp_write_first_0
+  (
+    .clk   ( clk              ),
+    .we    ( valid_in & ~full ),
+    .addr0 ( ptr_in_q         ),
+    .addr1 ( ptr_out_bram     ),
+    .d_i   ( data_in          ),
+    .d0_o  (                  ),
+    .d1_o  ( data_out_bram    )
+  );
+
+  // When reading from/writing two the same address on both ports ("Write-Read Collision"),
+  // the data on the read port is invalid (during the write cycle). In this implementation,
+  // this can happen only when the buffer is empty. Thus, we forward the data from an
+  // register in this case.
+  always @(posedge clk) begin
+    if (rstn == 1'b0) begin
+      data_out_q <= 'b0;
+    end else if ( (ptr_out_bram == ptr_in_q) && (valid_in && !full) ) begin
+      data_out_q <= data_in;
+    end
+  end
+
+  always @(posedge clk) begin
+    if (rstn == 1'b0) begin
+      valid_out_q <= 'b0;
+    end else begin
+      valid_out_q <= valid_out;
+    end
+  end
+
+  // Drive output data
+  always_comb begin
+    if (valid_out && !valid_out_q) begin // We have just written to an empty FIFO
+      data_out = data_out_q;
+    end else begin
+      data_out = data_out_bram;
+    end
+  end
+
+"""
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/axi_rab_cfg.py b/src/soc/iommu/axi_rab/axi_rab_cfg.py
new file mode 100644
index 00000000..43843b95
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi_rab_cfg.py
@@ -0,0 +1,707 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_rab_cfg(Elaboratable):
+
+    def __init__(self):
+        self.Clk_CI = Signal()  # input
+        self.Rst_RBI = Signal()  # input
+        self.s_axi_awaddr = Signal(AXI_ADDR_WIDTH)  # input
+        self.s_axi_awvalid = Signal()  # input
+        self.s_axi_awready = Signal()  # output
+        self.s_axi_wdata = Signal()  # input
+        self.s_axi_wstrb = Signal(1+ERROR p_expression_25)  # input
+        self.s_axi_wvalid = Signal()  # input
+        self.s_axi_wready = Signal()  # output
+        self.s_axi_bresp = Signal(2)  # output
+        self.s_axi_bvalid = Signal()  # output
+        self.s_axi_bready = Signal()  # input
+        self.s_axi_araddr = Signal(AXI_ADDR_WIDTH)  # input
+        self.s_axi_arvalid = Signal()  # input
+        self.s_axi_arready = Signal()  # output
+        self.s_axi_rdata = Signal(AXI_DATA_WIDTH)  # output
+        self.s_axi_rresp = Signal(2)  # output
+        self.s_axi_rvalid = Signal()  # output
+        self.s_axi_rready = Signal()  # input
+        self.L1Cfg_DO = Signal()  # output
+        self.L1AllowMultiHit_SO = Signal()  # output
+        self.MissAddr_DI = Signal(ADDR_WIDTH_VIRT)  # input
+        self.MissMeta_DI = Signal(MISS_META_WIDTH)  # input
+        self.Miss_SI = Signal()  # input
+        self.MhFifoFull_SO = Signal()  # output
+        self.wdata_l2 = Signal()  # output
+        self.waddr_l2 = Signal()  # output
+        self.wren_l2 = Signal(N_PORTS)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# // --=========================================================================--
+# //
+# //  ââââââ âââ  ââââââ    âââââââ  ââââââ âââââââ      âââââââââââââââ âââââââ
+# // âââââââââââââââââââ    ââââââââââââââââââââââââ    ââââââââââââââââââââââââ
+# // ââââââââ ââââââ âââ    ââââââââââââââââââââââââ    âââ     ââââââ  âââ  ââââ
+# // ââââââââ ââââââ âââ    ââââââââââââââââââââââââ    âââ     ââââââ  âââ   âââ
+# // âââ  âââââââ ââââââ    âââ  ââââââ  âââââââââââ    âââââââââââ     âââââââââ
+# // âââ  ââââââ  ââââââ    âââ  ââââââ  ââââââââââ      ââââââââââ      âââââââ
+# //
+# //
+# // Author: Pirmin Vogel - vogelpi@iis.ee.ethz.ch
+# //
+# // Purpose : AXI4-Lite configuration and miss handling interface for RAB
+# //
+# // --=========================================================================--
+#
+# //import CfMath::log2;
+#
+# module axi_rab_cfg
+#  #(
+#    parameter N_PORTS         =   3,
+#    parameter N_REGS          = 196,
+#    parameter N_L2_SETS       =  32,
+#    parameter N_L2_SET_ENTRIES=  32,
+#    parameter ADDR_WIDTH_PHYS =  40,
+#    parameter ADDR_WIDTH_VIRT =  32,
+#    parameter N_FLAGS         =   4,
+#    parameter AXI_DATA_WIDTH  =  64,
+#    parameter AXI_ADDR_WIDTH  =  32,
+#    parameter MISS_META_WIDTH =  10,  // <= FIFO_WIDTH
+#    parameter MH_FIFO_DEPTH   =  16
+#    )
+#   (
+#    input  logic                                    Clk_CI,
+#    input  logic                                    Rst_RBI,
+#
+#    // AXI Lite interface
+#    input  logic [AXI_ADDR_WIDTH-1:0]               s_axi_awaddr,
+#    input  logic                                    s_axi_awvalid,
+#    output logic                                    s_axi_awready,
+#    input  logic [AXI_DATA_WIDTH/8-1:0][7:0]        s_axi_wdata,
+#    input  logic [AXI_DATA_WIDTH/8-1:0]             s_axi_wstrb,
+#    input  logic                                    s_axi_wvalid,
+#    output logic                                    s_axi_wready,
+#    output logic [1:0]                              s_axi_bresp,
+#    output logic                                    s_axi_bvalid,
+#    input  logic                                    s_axi_bready,
+#    input  logic [AXI_ADDR_WIDTH-1:0]               s_axi_araddr,
+#    input  logic                                    s_axi_arvalid,
+#    output logic                                    s_axi_arready,
+#    output logic [AXI_DATA_WIDTH-1:0]               s_axi_rdata,
+#    output logic [1:0]                              s_axi_rresp,
+#    output logic                                    s_axi_rvalid,
+#    input  logic                                    s_axi_rready,
+#
+#    // Slice configuration
+#    output logic [N_REGS-1:0][63:0]                 L1Cfg_DO,
+#    output logic                                    L1AllowMultiHit_SO,
+#
+#    // Miss handling
+#    input  logic [ADDR_WIDTH_VIRT-1:0]              MissAddr_DI,
+#    input  logic [MISS_META_WIDTH-1:0]              MissMeta_DI,
+#    input  logic                                    Miss_SI,
+#    output logic                                    MhFifoFull_SO,
+#
+#    // L2 TLB
+#    output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] wdata_l2,
+#    output logic [N_PORTS-1:0] [AXI_ADDR_WIDTH-1:0] waddr_l2,
+#    output logic [N_PORTS-1:0]                      wren_l2
+#  );
+#
+"""  #docstring_begin
+
+  localparam ADDR_LSB = log2(64/8); // 64 even if the AXI Lite interface is 32,
+                                    // because RAB slices are 64 bit wide.
+  localparam ADDR_MSB = log2(N_REGS)+ADDR_LSB-1;
+
+  localparam L2SINGLE_AMAP_SIZE = 16'h4000; // Maximum 2048 TLB entries in L2
+
+  localparam integer N_L2_ENTRIES = N_L2_SETS * N_L2_SET_ENTRIES;
+
+  localparam logic [AXI_ADDR_WIDTH-1:0] L2_VA_MAX_ADDR = (N_L2_ENTRIES-1) << 2;
+
+  logic [AXI_DATA_WIDTH/8-1:0][7:0] L1Cfg_DP[N_REGS]; // [Byte][Bit]
+  genvar j;
+
+  //  ââââââ âââ  âââââââââ  âââ      âââ     ââââââââââââââââââââ
+  // ââââââââââââââââââââââ  âââ      âââ     ââââââââââââââââââââ
+  // ââââââââ ââââââ ââââââââââââââââââââ     âââ   âââ   ââââââ
+  // ââââââââ ââââââ ââââââââââââââââââââ     âââ   âââ   ââââââ
+  // âââ  âââââââ ââââââ     âââ      âââââââââââ   âââ   ââââââââ
+  // âââ  ââââââ  ââââââ     âââ      âââââââââââ   âââ   ââââââââ
+  //
+  logic [AXI_ADDR_WIDTH-1:0]        awaddr_reg;
+  logic                             awaddr_done_rise;
+  logic                             awaddr_done_reg;
+  logic                             awaddr_done_reg_dly;
+
+  logic [AXI_DATA_WIDTH/8-1:0][7:0] wdata_reg;
+  logic [AXI_DATA_WIDTH/8-1:0]      wstrb_reg;
+  logic                             wdata_done_rise;
+  logic                             wdata_done_reg;
+  logic                             wdata_done_reg_dly;
+
+  logic                             wresp_done_reg;
+  logic                             wresp_running_reg;
+
+  logic [AXI_ADDR_WIDTH-1:0]        araddr_reg;
+  logic                             araddr_done_reg;
+
+  logic [AXI_DATA_WIDTH-1:0]        rdata_reg;
+  logic                             rresp_done_reg;
+  logic                             rresp_running_reg;
+
+  logic                             awready;
+  logic                             wready;
+  logic                             bvalid;
+
+  logic                             arready;
+  logic                             rvalid;
+
+  logic                             wren;
+  logic                             wren_l1;
+
+  assign wren = ( wdata_done_rise & awaddr_done_reg ) | ( awaddr_done_rise & wdata_done_reg );
+  assign wdata_done_rise  = wdata_done_reg  & ~wdata_done_reg_dly;
+  assign awaddr_done_rise = awaddr_done_reg & ~awaddr_done_reg_dly;
+
+  // reg_dly
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            wdata_done_reg_dly  <= 1'b0;
+            awaddr_done_reg_dly <= 1'b0;
+         end
+       else
+         begin
+            wdata_done_reg_dly  <= wdata_done_reg;
+            awaddr_done_reg_dly <= awaddr_done_reg;
+         end
+    end
+
+  // AW Channel
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            awaddr_done_reg <= 1'b0;
+            awaddr_reg      <= '0;
+            awready         <= 1'b1;
+         end
+       else
+         begin
+            if (awready && s_axi_awvalid)
+              begin
+                 awready         <= 1'b0;
+                 awaddr_done_reg <= 1'b1;
+                 awaddr_reg      <= s_axi_awaddr;
+              end
+            else if (awaddr_done_reg && wresp_done_reg)
+              begin
+                 awready         <= 1'b1;
+                 awaddr_done_reg <= 1'b0;
+              end
+         end
+    end
+
+  // W Channel
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            wdata_done_reg <= 1'b0;
+            wready         <= 1'b1;
+            wdata_reg      <= '0;
+            wstrb_reg      <= '0;
+         end
+       else
+         begin
+            if (wready && s_axi_wvalid)
+              begin
+                 wready         <= 1'b0;
+                 wdata_done_reg <= 1'b1;
+                 wdata_reg      <= s_axi_wdata;
+                 wstrb_reg      <= s_axi_wstrb;
+              end
+            else if (wdata_done_reg && wresp_done_reg)
+              begin
+                 wready         <= 1'b1;
+                 wdata_done_reg <= 1'b0;
+              end
+         end
+    end
+
+  // B Channel
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            bvalid            <= 1'b0;
+            wresp_done_reg    <= 1'b0;
+            wresp_running_reg <= 1'b0;
+         end
+       else
+         begin
+            if (awaddr_done_reg && wdata_done_reg && !wresp_done_reg)
+              begin
+                 if (!wresp_running_reg)
+                   begin
+                      bvalid            <= 1'b1;
+                      wresp_running_reg <= 1'b1;
+                   end
+                 else if (s_axi_bready)
+                   begin
+                      bvalid            <= 1'b0;
+                      wresp_done_reg    <= 1'b1;
+                      wresp_running_reg <= 1'b0;
+                   end
+              end
+            else
+              begin
+                 bvalid            <= 1'b0;
+                 wresp_done_reg    <= 1'b0;
+                 wresp_running_reg <= 1'b0;
+              end
+         end
+    end
+
+  // AR Channel
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            araddr_done_reg <= 1'b0;
+            arready         <= 1'b1;
+            araddr_reg       <= '0;
+         end
+       else
+         begin
+            if (arready && s_axi_arvalid)
+              begin
+                 arready         <= 1'b0;
+                 araddr_done_reg <= 1'b1;
+                 araddr_reg      <= s_axi_araddr;
+              end
+            else if (araddr_done_reg && rresp_done_reg)
+              begin
+                 arready         <= 1'b1;
+                 araddr_done_reg <= 1'b0;
+              end
+         end
+    end
+
+  // R Channel
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            rresp_done_reg    <= 1'b0;
+            rvalid            <= 1'b0;
+            rresp_running_reg <= 1'b0;
+         end
+       else
+         begin
+            if (araddr_done_reg && !rresp_done_reg)
+              begin
+                 if (!rresp_running_reg)
+                   begin
+                      rvalid            <= 1'b1;
+                      rresp_running_reg <= 1'b1;
+                   end
+                 else if (s_axi_rready)
+                   begin
+                      rvalid            <= 1'b0;
+                      rresp_done_reg    <= 1'b1;
+                      rresp_running_reg <= 1'b0;
+                   end
+              end
+            else
+              begin
+                 rvalid            <= 1'b0;
+                 rresp_done_reg    <= 1'b0;
+                 rresp_running_reg <= 1'b0;
+              end
+         end
+    end
+
+  // âââ     âââ     âââââââââââââââ âââââââ     âââââââ ââââââââ âââââââ
+  // âââ    ââââ    ââââââââââââââââââââââââ     ââââââââââââââââââââââââ
+  // âââ    ââââ    âââ     ââââââ  âââ  ââââ    ââââââââââââââ  âââ  ââââ
+  // âââ     âââ    âââ     ââââââ  âââ   âââ    ââââââââââââââ  âââ   âââ
+  // âââââââââââ    âââââââââââ     âââââââââ    âââ  ââââââââââââââââââââ
+  // âââââââââââ     ââââââââââ      âââââââ     âââ  âââââââââââ âââââââ
+  //
+  assign wren_l1 = wren && (awaddr_reg < L2SINGLE_AMAP_SIZE);
+
+  always @( posedge Clk_CI or negedge Rst_RBI )
+    begin
+      var integer idx_reg, idx_byte;
+      if ( Rst_RBI == 1'b0 )
+        begin
+          for ( idx_reg = 0; idx_reg < N_REGS; idx_reg++ )
+            L1Cfg_DP[idx_reg] <= '0;
+        end
+      else if ( wren_l1 )
+          begin
+            if ( awaddr_reg[ADDR_LSB+1] == 1'b0 ) begin                     // VIRT_ADDR
+              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
+                if ( (idx_byte < ADDR_WIDTH_VIRT/8) ) begin
+                  if ( wstrb_reg[idx_byte] ) begin
+                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
+                  end
+                end
+                else begin  // Let synthesizer optimize away unused registers.
+                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
+                end
+              end
+            end
+            else if ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b10 ) begin      // PHYS_ADDR
+              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
+                if ( (idx_byte < ADDR_WIDTH_PHYS/8) ) begin
+                  if ( wstrb_reg[idx_byte] ) begin
+                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
+                  end
+                end
+                else begin  // Let synthesizer optimize away unused registers.
+                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
+                end
+              end
+            end
+            else begin // ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b11 )      // FLAGS
+              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
+                if ( (idx_byte < 1) ) begin
+                  if ( wstrb_reg[idx_byte] ) begin
+                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte] & { {{8-N_FLAGS}{1'b0}}, {{N_FLAGS}{1'b1}} };
+                  end
+                end
+                else begin  // Let synthesizer optimize away unused registers.
+                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
+                end
+              end
+            end
+          end
+    end // always @ ( posedge Clk_CI or negedge Rst_RBI )
+
+  generate
+    // Mask unused bits -> Synthesizer should optimize away unused registers
+    for( j=0; j<N_REGS; j++ ) begin
+      if ( j[1] == 1'b0 ) // VIRT_ADDR
+        assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_VIRT}{1'b0}},{ADDR_WIDTH_VIRT{1'b1}} } & L1Cfg_DP[j];
+      else if ( j[1:0] == 2'b10 ) // PHYS_ADDR
+        assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_PHYS}{1'b0}},{ADDR_WIDTH_PHYS{1'b1}} } & L1Cfg_DP[j];
+      else // if ( j[1:0] == 2'b11 ) // FLAGS
+        assign L1Cfg_DO[j] = { {{64-N_FLAGS}{1'b0}},{N_FLAGS{1'b1}} } & L1Cfg_DP[j];
+    end
+  endgenerate
+
+  always_comb
+    begin
+      if ( araddr_reg[ADDR_LSB-1] == 1'b1 ) // read upper 32 bit, for debugging over 32-bit interface
+        rdata_reg = { {32'h00000000},{L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]][63:32]} };
+      else
+        rdata_reg = L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]];
+    end
+
+  assign s_axi_awready = awready;
+  assign s_axi_wready  = wready;
+
+  assign s_axi_bresp   = 2'b00;
+  assign s_axi_bvalid  = bvalid;
+
+  assign s_axi_arready = arready;
+  assign s_axi_rresp   = 2'b00;
+  assign s_axi_rvalid  = rvalid;
+
+  // âââ     âââââââ      âââââââââââââââ âââââââ
+  // âââ     ââââââââ    ââââââââââââââââââââââââ
+  // âââ      âââââââ    âââ     ââââââ  âââ  ââââ
+  // âââ     âââââââ     âââ     ââââââ  âââ   âââ
+  // ââââââââââââââââ    âââââââââââ     âââââââââ
+  // ââââââââââââââââ     ââââââââââ      âââââââ
+  //
+  logic [N_PORTS-1:0] l2_addr_is_in_va_rams;
+  logic [N_PORTS-1:0] upper_word_is_written;
+  logic [N_PORTS-1:0] lower_word_is_written;
+  generate
+    for( j=0; j< N_PORTS; j++)
+      begin
+        if (AXI_DATA_WIDTH == 64) begin
+          assign l2_addr_is_in_va_rams[j] = (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg[log2(L2SINGLE_AMAP_SIZE)-1:0] <= L2_VA_MAX_ADDR);
+          assign upper_word_is_written[j] = (wstrb_reg[7:4] != 4'b0000);
+          assign lower_word_is_written[j] = (wstrb_reg[3:0] != 4'b0000);
+        end else begin
+          assign l2_addr_is_in_va_rams[j] = 1'b0;
+          assign upper_word_is_written[j] = 1'b0;
+          assign lower_word_is_written[j] = 1'b0;
+        end
+
+        always @( posedge Clk_CI or negedge Rst_RBI ) begin
+          var integer idx_byte, off_byte;
+          if ( Rst_RBI == 1'b0 )
+            begin
+              wren_l2[j]  <= 1'b0;
+              wdata_l2[j] <= '0;
+            end
+          else if (wren)
+            begin
+              if ( (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg < (j+2)*L2SINGLE_AMAP_SIZE) && (|wstrb_reg) )
+                wren_l2[j] <= 1'b1;
+              if      (AXI_DATA_WIDTH == 32) begin
+                for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ )
+                  wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte] & {8{wstrb_reg[idx_byte]}};
+              end
+              else if (AXI_DATA_WIDTH == 64) begin
+                if (lower_word_is_written[j] == 1'b1)
+                  off_byte = 0;
+                else
+                  off_byte = 4;
+                // always put the payload in the lower word and set upper word to 0
+                for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8/2; idx_byte++ )
+                    wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte+off_byte] & {8{wstrb_reg[idx_byte+off_byte]}};
+                wdata_l2[j][AXI_DATA_WIDTH-1:AXI_DATA_WIDTH/2] <= 'b0;
+              end
+              // pragma translate_off
+              else
+                $fatal(1, "Unsupported AXI_DATA_WIDTH!");
+              // pragma translate_on
+            end
+          else
+            wren_l2[j] <= '0;
+        end // always @ ( posedge Clk_CI or negedge Rst_RBI )
+
+        // Properly align the 32-bit word address when writing from 64-bit interface:
+        // Depending on the system, the incoming address is (non-)aligned to the 64-bit
+        // word when writing the upper 32-bit word.
+        always_comb begin
+          waddr_l2[j] = (awaddr_reg -(j+1)*L2SINGLE_AMAP_SIZE)/4;
+          if (wren_l2[j]) begin
+            if (AXI_DATA_WIDTH == 64) begin
+              if (upper_word_is_written[j] == 1'b1) begin
+                // address must be non-aligned
+                waddr_l2[j][0] = 1'b1;
+              end
+            end
+            // pragma translate_off
+            else if (AXI_DATA_WIDTH != 32) begin
+              $fatal(1, "Unsupported AXI_DATA_WIDTH!");
+            end
+            // pragma translate_on
+          end
+        end
+
+        // Assert that only one 32-bit word is ever written at a time to VA RAMs on 64-bit data
+        // systems.
+        // pragma translate_off
+        always_ff @ (posedge Clk_CI) begin
+          if (AXI_DATA_WIDTH == 64) begin
+            if  (l2_addr_is_in_va_rams[j]) begin
+              if (upper_word_is_written[j]) begin
+                assert (!lower_word_is_written[j])
+                  else $error("Unsupported write across two 32-bit words to VA RAMs!");
+              end
+              else if (lower_word_is_written[j]) begin
+                assert (!upper_word_is_written[j])
+                  else $error("Unsupported write across two 32-bit words to VA RAMs!");
+              end
+            end
+          end
+        end
+        // pragma translate_on
+
+      end // for (j=0; j< N_PORTS; j++)
+   endgenerate
+
+  // ââââ   âââââââ  âââ    âââââââââââââââââââ âââââââ ââââââââ
+  // âââââ ââââââââ  âââ    ââââââââââââââââââââââââââââââââââââ
+  // âââââââââââââââââââ    ââââââ  âââââââââ  âââ   âââââââââââ
+  // âââââââââââââââââââ    ââââââ  âââââââââ  âââ   âââââââââââ
+  // âââ âââ ââââââ  âââ    âââ     ââââââ     âââââââââââââââââ
+  // âââ     ââââââ  âââ    âââ     ââââââ      âââââââ ââââââââ
+  //
+  logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDin_D;
+  logic                       AddrFifoWen_S;
+  logic                       AddrFifoRen_S;
+  logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDout_D;
+  logic                       AddrFifoFull_S;
+  logic                       AddrFifoEmpty_S;
+  logic                       AddrFifoEmpty_SB;
+  logic                       AddrFifoFull_SB;
+
+  logic [MISS_META_WIDTH-1:0] MetaFifoDin_D;
+  logic                       MetaFifoWen_S;
+  logic                       MetaFifoRen_S;
+  logic [MISS_META_WIDTH-1:0] MetaFifoDout_D;
+  logic                       MetaFifoFull_S;
+  logic                       MetaFifoEmpty_S;
+  logic                       MetaFifoEmpty_SB;
+  logic                       MetaFifoFull_SB;
+
+  logic                       FifosDisabled_S;
+  logic                       ConfRegWen_S;
+  logic                 [1:0] ConfReg_DN;
+  logic                 [1:0] ConfReg_DP;
+
+  logic [AXI_DATA_WIDTH-1:0] wdata_reg_vec;
+
+  assign FifosDisabled_S    = ConfReg_DP[0];
+  assign L1AllowMultiHit_SO = ConfReg_DP[1];
+
+  assign AddrFifoEmpty_S = ~AddrFifoEmpty_SB;
+  assign MetaFifoEmpty_S = ~MetaFifoEmpty_SB;
+
+  assign AddrFifoFull_S = ~AddrFifoFull_SB;
+  assign MetaFifoFull_S = ~MetaFifoFull_SB;
+
+  assign MhFifoFull_SO = (AddrFifoWen_S & AddrFifoFull_S) | (MetaFifoWen_S & MetaFifoFull_S);
+
+  generate
+     for ( j=0; j<AXI_DATA_WIDTH/8; j++ )
+       assign wdata_reg_vec[(j+1)*8-1:j*8] = wdata_reg[j];
+  endgenerate
+
+  // write address FIFO
+  always_comb
+    begin
+       AddrFifoWen_S = 1'b0;
+       AddrFifoDin_D = 'b0;
+       if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
+         begin
+            AddrFifoWen_S = 1'b1;
+            AddrFifoDin_D = MissAddr_DI;
+         end
+       else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 'b0) && (FifosDisabled_S == 1'b0)) // write request from AXI interface
+         begin
+            AddrFifoWen_S = 1'b1;
+            AddrFifoDin_D = wdata_reg_vec[ADDR_WIDTH_VIRT-1:0];
+         end
+    end
+
+  // write meta FIFO
+  always_comb
+    begin
+       MetaFifoWen_S = 1'b0;
+       MetaFifoDin_D = 'b0;
+       if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
+         begin
+            MetaFifoWen_S                      = 1'b1;
+            MetaFifoDin_D[MISS_META_WIDTH-1:0] = MissMeta_DI;
+         end
+       else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 4'h8) && (FifosDisabled_S == 1'b0) ) // write request from AXI interface
+         begin
+            MetaFifoWen_S = 1'b1;
+            MetaFifoDin_D = wdata_reg_vec[MISS_META_WIDTH-1:0];
+         end
+    end
+
+  // write configuration register
+  always_comb
+    begin
+       ConfRegWen_S = 1'b0;
+       ConfReg_DN   = 1'b0;
+       if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 8'h10) ) // write request from AXI interface
+         begin
+            ConfRegWen_S = 1'b1;
+            ConfReg_DN   = wdata_reg_vec[$high(ConfReg_DN):0];
+         end
+    end
+
+  // AXI read data
+  always_comb
+    begin
+       s_axi_rdata   = rdata_reg; // read L1 config
+       AddrFifoRen_S = 1'b0;
+       MetaFifoRen_S = 1'b0;
+       if ( rvalid == 1'b1 )
+         begin
+            // read address FIFO
+            if ( araddr_reg[ADDR_MSB:0] == 'b0 )
+              begin
+                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
+                s_axi_rdata[ADDR_WIDTH_VIRT-1:0] = AddrFifoDout_D;
+                if ( AddrFifoEmpty_S == 1'b0 )
+                  AddrFifoRen_S = 1'b1;
+              end
+            // read meta FIFO
+            else if ( araddr_reg[ADDR_MSB:0] == 4'h8 )
+              begin
+                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
+                s_axi_rdata[31]                  = MetaFifoEmpty_S;
+                s_axi_rdata[MISS_META_WIDTH-1:0] = MetaFifoDout_D;
+                if ( MetaFifoEmpty_S == 1'b0 )
+                  MetaFifoRen_S = 1'b1;
+              end
+            // read configuration register
+            else if ( araddr_reg[ADDR_MSB:0] == 8'h10 )
+              begin
+                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
+                s_axi_rdata[$high(ConfReg_DP):0] = ConfReg_DP;
+              end
+         end // if ( rvalid == 1'b1 )
+    end // always_comb begin
+
+  // configuration register
+  always_ff @(posedge Clk_CI or negedge Rst_RBI) begin
+    if (Rst_RBI == 1'b0)
+      begin
+        ConfReg_DP <= 'b0;
+      end
+    else if (ConfRegWen_S == 1'b1)
+      begin
+        ConfReg_DP <= ConfReg_DN;
+      end
+  end
+
+  generic_fifo
+    #(
+      .DATA_WIDTH ( ADDR_WIDTH_VIRT ),
+      .DATA_DEPTH ( MH_FIFO_DEPTH   )
+      )
+    fifo_addr_i
+    (
+      .clk         ( Clk_CI                          ),
+      .rst_n       ( Rst_RBI                         ),
+      .data_i      ( AddrFifoDin_D                   ),
+      .valid_i     ( AddrFifoWen_S & AddrFifoFull_SB ),
+      .grant_o     ( AddrFifoFull_SB                 ),
+      .data_o      ( AddrFifoDout_D                  ),
+      .valid_o     ( AddrFifoEmpty_SB                ),
+      .grant_i     ( AddrFifoRen_S                   ),
+      .test_mode_i ( 1'b0                            )
+    );
+
+  generic_fifo
+    #(
+      .DATA_WIDTH ( MISS_META_WIDTH ),
+      .DATA_DEPTH ( MH_FIFO_DEPTH   )
+      )
+    fifo_meta_i
+    (
+      .clk         ( Clk_CI                          ),
+      .rst_n       ( Rst_RBI                         ),
+      .data_i      ( MetaFifoDin_D                   ),
+      .valid_i     ( MetaFifoWen_S & MetaFifoFull_SB ),
+      .grant_o     ( MetaFifoFull_SB                 ),
+      .data_o      ( MetaFifoDout_D                  ),
+      .valid_o     ( MetaFifoEmpty_SB                ),
+      .grant_i     ( MetaFifoRen_S                   ),
+      .test_mode_i ( 1'b0                            )
+    );
+"""
+#
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/axi_rab_top.py b/src/soc/iommu/axi_rab/axi_rab_top.py
new file mode 100644
index 00000000..ea1a802d
--- /dev/null
+++ b/src/soc/iommu/axi_rab/axi_rab_top.py
@@ -0,0 +1,2642 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_rab_top(Elaboratable):
+    """Port-only nMigen stub of SV axi_rab_top; defaults mirror the SV parameters."""
+    def __init__(self, N_PORTS=2, AXI_LITE_DATA_WIDTH=64, AXI_LITE_ADDR_WIDTH=32):
+        self.Clk_CI = Signal()  # input
+        self.NonGatedClk_CI = Signal()  # input
+        self.Rst_RBI = Signal()  # input
+        self.s_axi4_awid = Signal()  # input; NOTE(review): SV declares [N_PORTS][AXI_ID_WIDTH], generator kept 1 bit
+        self.s_axi4_awaddr = Signal()  # input
+        self.s_axi4_awvalid = Signal(N_PORTS)  # input
+        self.s_axi4_awready = Signal(N_PORTS)  # output
+        self.s_axi4_awlen = Signal()  # input
+        self.s_axi4_awsize = Signal()  # input
+        self.s_axi4_awburst = Signal()  # input
+        self.s_axi4_awlock = Signal(N_PORTS)  # input
+        self.s_axi4_awprot = Signal()  # input
+        self.s_axi4_awcache = Signal()  # input
+        self.s_axi4_awregion = Signal()  # input
+        self.s_axi4_awqos = Signal()  # input
+        self.s_axi4_awuser = Signal()  # input
+        self.s_axi4_wdata = Signal()  # input
+        self.s_axi4_wvalid = Signal(N_PORTS)  # input
+        self.s_axi4_wready = Signal(N_PORTS)  # output
+        self.s_axi4_wstrb = Signal()  # input
+        self.s_axi4_wlast = Signal(N_PORTS)  # input
+        self.s_axi4_wuser = Signal()  # input
+        self.s_axi4_bid = Signal()  # output
+        self.s_axi4_bresp = Signal()  # output
+        self.s_axi4_bvalid = Signal(N_PORTS)  # output
+        self.s_axi4_buser = Signal()  # output
+        self.s_axi4_bready = Signal(N_PORTS)  # input
+        self.s_axi4_arid = Signal()  # input
+        self.s_axi4_araddr = Signal()  # input
+        self.s_axi4_arvalid = Signal(N_PORTS)  # input
+        self.s_axi4_arready = Signal(N_PORTS)  # output
+        self.s_axi4_arlen = Signal()  # input
+        self.s_axi4_arsize = Signal()  # input
+        self.s_axi4_arburst = Signal()  # input
+        self.s_axi4_arlock = Signal(N_PORTS)  # input
+        self.s_axi4_arprot = Signal()  # input
+        self.s_axi4_arcache = Signal()  # input
+        self.s_axi4_aruser = Signal()  # input
+        self.s_axi4_rid = Signal()  # output
+        self.s_axi4_rdata = Signal()  # output
+        self.s_axi4_rresp = Signal()  # output
+        self.s_axi4_rvalid = Signal(N_PORTS)  # output
+        self.s_axi4_rready = Signal(N_PORTS)  # input
+        self.s_axi4_rlast = Signal(N_PORTS)  # output
+        self.s_axi4_ruser = Signal()  # output
+        self.m0_axi4_awid = Signal()  # output
+        self.m0_axi4_awaddr = Signal()  # output
+        self.m0_axi4_awvalid = Signal(N_PORTS)  # output
+        self.m0_axi4_awready = Signal(N_PORTS)  # input
+        self.m0_axi4_awlen = Signal()  # output
+        self.m0_axi4_awsize = Signal()  # output
+        self.m0_axi4_awburst = Signal()  # output
+        self.m0_axi4_awlock = Signal(N_PORTS)  # output
+        self.m0_axi4_awprot = Signal()  # output
+        self.m0_axi4_awcache = Signal()  # output
+        self.m0_axi4_awregion = Signal()  # output
+        self.m0_axi4_awqos = Signal()  # output
+        self.m0_axi4_awuser = Signal()  # output
+        self.m0_axi4_wdata = Signal()  # output
+        self.m0_axi4_wvalid = Signal(N_PORTS)  # output
+        self.m0_axi4_wready = Signal(N_PORTS)  # input
+        self.m0_axi4_wstrb = Signal()  # output
+        self.m0_axi4_wlast = Signal(N_PORTS)  # output
+        self.m0_axi4_wuser = Signal()  # output
+        self.m0_axi4_bid = Signal()  # input
+        self.m0_axi4_bresp = Signal()  # input
+        self.m0_axi4_bvalid = Signal(N_PORTS)  # input
+        self.m0_axi4_buser = Signal()  # input
+        self.m0_axi4_bready = Signal(N_PORTS)  # output
+        self.m0_axi4_arid = Signal()  # output
+        self.m0_axi4_araddr = Signal()  # output
+        self.m0_axi4_arvalid = Signal(N_PORTS)  # output
+        self.m0_axi4_arready = Signal(N_PORTS)  # input
+        self.m0_axi4_arlen = Signal()  # output
+        self.m0_axi4_arsize = Signal()  # output
+        self.m0_axi4_arburst = Signal()  # output
+        self.m0_axi4_arlock = Signal(N_PORTS)  # output
+        self.m0_axi4_arprot = Signal()  # output
+        self.m0_axi4_arcache = Signal()  # output
+        self.m0_axi4_aruser = Signal()  # output
+        self.m0_axi4_rid = Signal()  # input
+        self.m0_axi4_rdata = Signal()  # input
+        self.m0_axi4_rresp = Signal()  # input
+        self.m0_axi4_rvalid = Signal(N_PORTS)  # input
+        self.m0_axi4_rready = Signal(N_PORTS)  # output
+        self.m0_axi4_rlast = Signal(N_PORTS)  # input
+        self.m0_axi4_ruser = Signal()  # input
+        self.m1_axi4_awid = Signal()  # output
+        self.m1_axi4_awaddr = Signal()  # output
+        self.m1_axi4_awvalid = Signal(N_PORTS)  # output
+        self.m1_axi4_awready = Signal(N_PORTS)  # input
+        self.m1_axi4_awlen = Signal()  # output
+        self.m1_axi4_awsize = Signal()  # output
+        self.m1_axi4_awburst = Signal()  # output
+        self.m1_axi4_awlock = Signal(N_PORTS)  # output
+        self.m1_axi4_awprot = Signal()  # output
+        self.m1_axi4_awcache = Signal()  # output
+        self.m1_axi4_awregion = Signal()  # output
+        self.m1_axi4_awqos = Signal()  # output
+        self.m1_axi4_awuser = Signal()  # output
+        self.m1_axi4_wdata = Signal()  # output
+        self.m1_axi4_wvalid = Signal(N_PORTS)  # output
+        self.m1_axi4_wready = Signal(N_PORTS)  # input
+        self.m1_axi4_wstrb = Signal()  # output
+        self.m1_axi4_wlast = Signal(N_PORTS)  # output
+        self.m1_axi4_wuser = Signal()  # output
+        self.m1_axi4_bid = Signal()  # input
+        self.m1_axi4_bresp = Signal()  # input
+        self.m1_axi4_bvalid = Signal(N_PORTS)  # input
+        self.m1_axi4_buser = Signal()  # input
+        self.m1_axi4_bready = Signal(N_PORTS)  # output
+        self.m1_axi4_arid = Signal()  # output
+        self.m1_axi4_araddr = Signal()  # output
+        self.m1_axi4_arvalid = Signal(N_PORTS)  # output
+        self.m1_axi4_arready = Signal(N_PORTS)  # input
+        self.m1_axi4_arlen = Signal()  # output
+        self.m1_axi4_arsize = Signal()  # output
+        self.m1_axi4_arburst = Signal()  # output
+        self.m1_axi4_arlock = Signal(N_PORTS)  # output
+        self.m1_axi4_arprot = Signal()  # output
+        self.m1_axi4_arcache = Signal()  # output
+        self.m1_axi4_aruser = Signal()  # output
+        self.m1_axi4_rid = Signal()  # input
+        self.m1_axi4_rdata = Signal()  # input
+        self.m1_axi4_rresp = Signal()  # input
+        self.m1_axi4_rvalid = Signal(N_PORTS)  # input
+        self.m1_axi4_rready = Signal(N_PORTS)  # output
+        self.m1_axi4_rlast = Signal(N_PORTS)  # input
+        self.m1_axi4_ruser = Signal()  # input
+        self.s_axi4lite_awaddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
+        self.s_axi4lite_awvalid = Signal()  # input
+        self.s_axi4lite_awready = Signal()  # output
+        self.s_axi4lite_wdata = Signal(AXI_LITE_DATA_WIDTH)  # input
+        self.s_axi4lite_wvalid = Signal()  # input
+        self.s_axi4lite_wready = Signal()  # output
+        self.s_axi4lite_wstrb = Signal(AXI_LITE_DATA_WIDTH // 8)  # input; assumed 1 strobe bit per data byte (generator emitted an unparsable width) - TODO confirm
+        self.s_axi4lite_bresp = Signal(2)  # output
+        self.s_axi4lite_bvalid = Signal()  # output
+        self.s_axi4lite_bready = Signal()  # input
+        self.s_axi4lite_araddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
+        self.s_axi4lite_arvalid = Signal()  # input
+        self.s_axi4lite_arready = Signal()  # output
+        self.s_axi4lite_rdata = Signal(AXI_LITE_DATA_WIDTH)  # output
+        self.s_axi4lite_rresp = Signal(2)  # output
+        self.s_axi4lite_rvalid = Signal()  # output
+        self.s_axi4lite_rready = Signal()  # input
+        self.int_miss = Signal(N_PORTS)  # output
+        self.int_multi = Signal(N_PORTS)  # output
+        self.int_prot = Signal(N_PORTS)  # output
+        self.int_mhf_full = Signal()  # output
+
+    def elaborate(self, platform=None):  # no logic translated yet: returns an empty Module
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# // --=========================================================================--
+# //
+# //  ââââââ âââ  ââââââ    âââââââ  ââââââ âââââââ     âââââââââ âââââââ âââââââ
+# // âââââââââââââââââââ    ââââââââââââââââââââââââ    ââââââââââââââââââââââââââ
+# // ââââââââ ââââââ âââ    ââââââââââââââââââââââââ       âââ   âââ   âââââââââââ
+# // ââââââââ ââââââ âââ    ââââââââââââââââââââââââ       âââ   âââ   ââââââââââ
+# // âââ  âââââââ ââââââ    âââ  ââââââ  âââââââââââ       âââ   ââââââââââââ
+# // âââ  ââââââ  ââââââ    âââ  ââââââ  ââââââââââ        âââ    âââââââ âââ
+# //
+# // --=========================================================================--
+# /*
+# * axi_rab_top
+# *
+# * The remapping address block (RAB) performs address translation for AXI
+# * transactions arriving at the input port and forwards them to different
+# * downstream AXI ports.
+# *
+# * The five AXI channels are each buffered on the input side using a FIFO,
+# * described in axi4_XX_buffer. The RAB lookup result is merged into the
+# * AXI transaction via the axi4_XX_sender instances, which manage upstream
+# * error signaling for failed lookups.
+# *
+# * Address translation is performed based on data stored in up to two
+# * translation lookaside buffers (TLBs), which are private per RAB port (each
+# * of which has two AXI master ports and one AXI slave port). These TLBs
+# * are managed in software through the AXI-Lite interface.
+# *
+# * If ACP is enabled, the `cache_coherent` flag in the TLBs is used to
+# * multiplex between the two ports. If ACP is disabled, only the first master
+# * port is used. In this case, the `cache_coherent` flag is used to set the
+# * AxCACHE signals of the AXI bus accordingly.
+# *
+# * Authors:
+# * Antonio Pullini <pullinia@iis.ee.ethz.ch>
+# * Conrad Burchert <bconrad@ethz.ch>
+# * Maheshwara Sharma <msharma@student.ethz.ch>
+# * Andreas Kurth <akurth@iis.ee.ethz.ch>
+# * Johannes Weinbuch <jweinbuch@student.ethz.ch>
+# * Pirmin Vogel <vogelpi@iis.ee.ethz.ch>
+# */
+#
+# //`include "pulp_soc_defines.sv"
+#
+# ////import CfMath::log2;
+#
+# module axi_rab_top
+#
+#  // Parameters {{{
+#  #(
+#    parameter N_PORTS             =  2,
+#    parameter N_L2_SETS           = 32,
+#    parameter N_L2_SET_ENTRIES    = 32,
+#    parameter AXI_DATA_WIDTH      = 64,
+#    parameter AXI_S_ADDR_WIDTH    = 32,
+#    parameter AXI_M_ADDR_WIDTH    = 40,
+#    parameter AXI_LITE_DATA_WIDTH = 64,
+#    parameter AXI_LITE_ADDR_WIDTH = 32,
+#    parameter AXI_ID_WIDTH        = 10,
+#    parameter AXI_USER_WIDTH      =  6,
+#    parameter MH_FIFO_DEPTH       = 16
+#  )
+#  // }}}
+#
+#  // Ports {{{
+#  (
+#
+#    input logic                                            Clk_CI,  // This clock may be gated.
+#    input logic                                            NonGatedClk_CI,
+#    input logic                                            Rst_RBI,
+#
+#    // For every slave port there are two master ports. The master
+#    // port to use can be set using the master_select flag of the protection
+#    // bits of a slice
+#
+#    // AXI4 Slave {{{
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_awid,
+#    input  logic    [N_PORTS-1:0]   [AXI_S_ADDR_WIDTH-1:0] s_axi4_awaddr,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_awvalid,
+#    output logic    [N_PORTS-1:0]                          s_axi4_awready,
+#    input  logic    [N_PORTS-1:0]                    [7:0] s_axi4_awlen,
+#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_awsize,
+#    input  logic    [N_PORTS-1:0]                    [1:0] s_axi4_awburst,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_awlock,
+#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_awprot,
+#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awcache,
+#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awregion,
+#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awqos,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_awuser,
+#
+#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_wvalid,
+#    output logic    [N_PORTS-1:0]                          s_axi4_wready,
+#    input  logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_wlast,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_wuser,
+#
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_bid,
+#    output logic    [N_PORTS-1:0]                    [1:0] s_axi4_bresp,
+#    output logic    [N_PORTS-1:0]                          s_axi4_bvalid,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_buser,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_bready,
+#
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_arid,
+#    input  logic    [N_PORTS-1:0]   [AXI_S_ADDR_WIDTH-1:0] s_axi4_araddr,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_arvalid,
+#    output logic    [N_PORTS-1:0]                          s_axi4_arready,
+#    input  logic    [N_PORTS-1:0]                    [7:0] s_axi4_arlen,
+#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_arsize,
+#    input  logic    [N_PORTS-1:0]                    [1:0] s_axi4_arburst,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_arlock,
+#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_arprot,
+#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_arcache,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_aruser,
+#
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_rid,
+#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
+#    output logic    [N_PORTS-1:0]                    [1:0] s_axi4_rresp,
+#    output logic    [N_PORTS-1:0]                          s_axi4_rvalid,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_rready,
+#    output logic    [N_PORTS-1:0]                          s_axi4_rlast,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_ruser,
+#    // }}}
+#
+#    // AXI4 Master 0 {{{
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_awid,
+#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m0_axi4_awaddr,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_awvalid,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_awready,
+#    output logic    [N_PORTS-1:0]                    [7:0] m0_axi4_awlen,
+#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_awsize,
+#    output logic    [N_PORTS-1:0]                    [1:0] m0_axi4_awburst,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_awlock,
+#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_awprot,
+#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awcache,
+#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awregion,
+#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awqos,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_awuser,
+#
+#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m0_axi4_wdata,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_wvalid,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_wready,
+#    output logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] m0_axi4_wstrb,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_wlast,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_wuser,
+#
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_bid,
+#    input  logic    [N_PORTS-1:0]                    [1:0] m0_axi4_bresp,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_bvalid,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_buser,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_bready,
+#
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_arid,
+#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m0_axi4_araddr,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_arvalid,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_arready,
+#    output logic    [N_PORTS-1:0]                    [7:0] m0_axi4_arlen,
+#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_arsize,
+#    output logic    [N_PORTS-1:0]                    [1:0] m0_axi4_arburst,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_arlock,
+#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_arprot,
+#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_arcache,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_aruser,
+#
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_rid,
+#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m0_axi4_rdata,
+#    input  logic    [N_PORTS-1:0]                    [1:0] m0_axi4_rresp,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_rvalid,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_rready,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_rlast,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_ruser,
+#    // }}}
+#
+#    // AXI4 Master 1 {{{
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_awid,
+#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m1_axi4_awaddr,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_awvalid,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_awready,
+#    output logic    [N_PORTS-1:0]                    [7:0] m1_axi4_awlen,
+#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_awsize,
+#    output logic    [N_PORTS-1:0]                    [1:0] m1_axi4_awburst,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_awlock,
+#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_awprot,
+#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awcache,
+#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awregion,
+#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awqos,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_awuser,
+#
+#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m1_axi4_wdata,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_wvalid,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_wready,
+#    output logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] m1_axi4_wstrb,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_wlast,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_wuser,
+#
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_bid,
+#    input  logic    [N_PORTS-1:0]                    [1:0] m1_axi4_bresp,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_bvalid,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_buser,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_bready,
+#
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_arid,
+#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m1_axi4_araddr,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_arvalid,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_arready,
+#    output logic    [N_PORTS-1:0]                    [7:0] m1_axi4_arlen,
+#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_arsize,
+#    output logic    [N_PORTS-1:0]                    [1:0] m1_axi4_arburst,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_arlock,
+#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_arprot,
+#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_arcache,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_aruser,
+#
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_rid,
+#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m1_axi4_rdata,
+#    input  logic    [N_PORTS-1:0]                    [1:0] m1_axi4_rresp,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_rvalid,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_rready,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_rlast,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_ruser,
+#    // }}}
+#
+#    // AXI 4 Lite Slave (Configuration Interface) {{{
+#    // AXI4-Lite port to set up the RAB slices
+#    // use this to program the configuration registers
+#    input  logic                 [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_awaddr,
+#    input  logic                                           s_axi4lite_awvalid,
+#    output logic                                           s_axi4lite_awready,
+#
+#    input  logic                 [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_wdata,
+#    input  logic                                           s_axi4lite_wvalid,
+#    output logic                                           s_axi4lite_wready,
+#    input  logic               [AXI_LITE_DATA_WIDTH/8-1:0] s_axi4lite_wstrb,
+#
+#    output logic                                     [1:0] s_axi4lite_bresp,
+#    output logic                                           s_axi4lite_bvalid,
+#    input  logic                                           s_axi4lite_bready,
+#
+#    input  logic                 [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_araddr,
+#    input  logic                                           s_axi4lite_arvalid,
+#    output logic                                           s_axi4lite_arready,
+#
+#    output logic                 [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_rdata,
+#    output logic                                     [1:0] s_axi4lite_rresp,
+#    output logic                                           s_axi4lite_rvalid,
+#    input  logic                                           s_axi4lite_rready,
+#    // }}}
+#
+#    // BRAMs {{{
+# //`ifdef RAB_AX_LOG_EN
+# //    BramPort.Slave                                         ArBram_PS,
+# //    BramPort.Slave                                         AwBram_PS,
+# //`endif
+#    // }}}
+#
+#    // Logger Control {{{
+# //`ifdef RAB_AX_LOG_EN
+# //   input  logic                                           LogEn_SI,
+# //   input  logic                                           ArLogClr_SI,
+# //   input  logic                                           AwLogClr_SI,
+#  //  output logic                                           ArLogRdy_SO,
+#  //  output logic                                           AwLogRdy_SO,
+# //`endif
+#    // }}}
+#
+#    // Interrupt Outputs {{{
+#    // Interrupt lines to handle misses, collisions of slices/multiple hits,
+#    // protection faults and overflow of the miss handling fifo
+# //`ifdef RAB_AX_LOG_EN
+# //   output logic                                           int_ar_log_full,
+# //   output logic                                           int_aw_log_full,
+# //`endif
+#    output logic                             [N_PORTS-1:0] int_miss,
+#    output logic                             [N_PORTS-1:0] int_multi,
+#    output logic                             [N_PORTS-1:0] int_prot,
+#    output logic                                           int_mhf_full
+#    // }}}
+#
+#  );
+#
+"""#docstring_begin
+
+  // }}}
+
+  // Signals {{{
+  // ███████╗██╗ ██████╗ ███╗   ██╗ █████╗ ██╗     ███████╗
+  // ██╔════╝██║██╔════╝ ████╗  ██║██╔══██╗██║     ██╔════╝
+  // ███████╗██║██║  ███╗██╔██╗ ██║███████║██║     ███████╗
+  // ╚════██║██║██║   ██║██║╚██╗██║██╔══██║██║     ╚════██║
+  // ███████║██║╚██████╔╝██║ ╚████║██║  ██║███████╗███████║
+  // ╚══════╝╚═╝ ╚═════╝ ╚═╝  ╚═══╝╚═╝  ╚═╝╚══════╝╚══════╝
+  //
+
+  // Internal AXI4 lines, these connect buffers on the slave side to the rab core and
+  // multiplexers which switch between the two master outputs
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_awid;
+  logic [N_PORTS-1:0]  [AXI_S_ADDR_WIDTH-1:0] int_awaddr;
+  logic [N_PORTS-1:0]                         int_awvalid;
+  logic [N_PORTS-1:0]                         int_awready;
+  logic [N_PORTS-1:0]                   [7:0] int_awlen;
+  logic [N_PORTS-1:0]                   [2:0] int_awsize;
+  logic [N_PORTS-1:0]                   [1:0] int_awburst;
+  logic [N_PORTS-1:0]                         int_awlock;
+  logic [N_PORTS-1:0]                   [2:0] int_awprot;
+  logic [N_PORTS-1:0]                   [3:0] int_awcache;
+  logic [N_PORTS-1:0]                   [3:0] int_awregion;
+  logic [N_PORTS-1:0]                   [3:0] int_awqos;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_awuser;
+
+  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_wdata;
+  logic [N_PORTS-1:0]                         int_wvalid;
+  logic [N_PORTS-1:0]                         int_wready;
+  logic [N_PORTS-1:0]  [AXI_DATA_WIDTH/8-1:0] int_wstrb;
+  logic [N_PORTS-1:0]                         int_wlast;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_wuser;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_bid;
+  logic [N_PORTS-1:0]                   [1:0] int_bresp;
+  logic [N_PORTS-1:0]                         int_bvalid;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_buser;
+  logic [N_PORTS-1:0]                         int_bready;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_arid;
+  logic [N_PORTS-1:0]  [AXI_S_ADDR_WIDTH-1:0] int_araddr;
+  logic [N_PORTS-1:0]                         int_arvalid;
+  logic [N_PORTS-1:0]                         int_arready;
+  logic [N_PORTS-1:0]                   [7:0] int_arlen;
+  logic [N_PORTS-1:0]                   [2:0] int_arsize;
+  logic [N_PORTS-1:0]                   [1:0] int_arburst;
+  logic [N_PORTS-1:0]                         int_arlock;
+  logic [N_PORTS-1:0]                   [2:0] int_arprot;
+  logic [N_PORTS-1:0]                   [3:0] int_arcache;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_aruser;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_rid;
+  logic [N_PORTS-1:0]                   [1:0] int_rresp;
+  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_rdata;
+  logic [N_PORTS-1:0]                         int_rlast;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_ruser;
+  logic [N_PORTS-1:0]                         int_rvalid;
+  logic [N_PORTS-1:0]                         int_rready;
+
+  // rab_core outputs
+  logic [N_PORTS-1:0]  [AXI_M_ADDR_WIDTH-1:0] int_wtrans_addr;
+  logic [N_PORTS-1:0]                         int_wtrans_accept;
+  logic [N_PORTS-1:0]                         int_wtrans_drop;
+  logic [N_PORTS-1:0]                         int_wtrans_miss;
+  logic [N_PORTS-1:0]                         int_wtrans_sent;
+  logic [N_PORTS-1:0]                         int_wtrans_cache_coherent;
+  logic [N_PORTS-1:0]                         int_wmaster_select;
+
+  logic [N_PORTS-1:0]  [AXI_M_ADDR_WIDTH-1:0] int_rtrans_addr;
+  logic [N_PORTS-1:0]                         int_rtrans_accept;
+  logic [N_PORTS-1:0]                         int_rtrans_drop;
+  logic [N_PORTS-1:0]                         int_rtrans_miss;
+  logic [N_PORTS-1:0]                         int_rtrans_sent;
+  logic [N_PORTS-1:0]                         int_rtrans_cache_coherent;
+  logic [N_PORTS-1:0]                         int_rmaster_select;
+
+  logic [N_PORTS-1:0]                         w_master_select;
+
+  // Internal master0 AXI4 lines. These connect the first master port to the
+  // multiplexers
+  // For channels read address, write address and write data the other lines
+  // are ignored if valid is not set, therefore we only need to multiplex those
+  logic [N_PORTS-1:0]                         int_m0_awvalid;
+  logic [N_PORTS-1:0]                         int_m0_awready;
+
+  logic [N_PORTS-1:0]                         int_m0_wvalid;
+  logic [N_PORTS-1:0]                         int_m0_wready;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m0_bid;
+  logic [N_PORTS-1:0]                   [1:0] int_m0_bresp;
+  logic [N_PORTS-1:0]                         int_m0_bvalid;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m0_buser;
+  logic [N_PORTS-1:0]                         int_m0_bready;
+
+  logic [N_PORTS-1:0]                         int_m0_arvalid;
+  logic [N_PORTS-1:0]                         int_m0_arready;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m0_rid;
+  logic [N_PORTS-1:0]                   [1:0] int_m0_rresp;
+  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_m0_rdata;
+  logic [N_PORTS-1:0]                         int_m0_rlast;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m0_ruser;
+  logic [N_PORTS-1:0]                         int_m0_rready;
+  logic [N_PORTS-1:0]                         int_m0_rvalid;
+
+  logic [N_PORTS-1:0]                         l1_m0_ar_accept;
+  logic [N_PORTS-1:0]                         l1_m0_ar_drop;
+  logic [N_PORTS-1:0]                         l1_m0_ar_save;
+  logic [N_PORTS-1:0]                         l1_m0_ar_done;
+  logic [N_PORTS-1:0]                         l2_m0_ar_accept;
+  logic [N_PORTS-1:0]                         l2_m0_ar_drop;
+  logic [N_PORTS-1:0]                         l2_m0_ar_done;
+  logic [N_PORTS-1:0]                         l2_m0_ar_sending;
+
+  logic [N_PORTS-1:0]                         l1_m0_aw_accept;
+  logic [N_PORTS-1:0]                         l1_m0_aw_drop;
+  logic [N_PORTS-1:0]                         l1_m0_aw_save;
+  logic [N_PORTS-1:0]                         l1_m0_aw_done;
+  logic [N_PORTS-1:0]                         l2_m0_aw_accept;
+  logic [N_PORTS-1:0]                         l2_m0_aw_drop;
+  logic [N_PORTS-1:0]                         l2_m0_aw_done;
+  logic [N_PORTS-1:0]                         l2_m0_aw_sending;
+
+  // Internal master1 AXI4 lines. These connect the second master port to the
+  // multiplexers
+  // For channels read address, write address and write data the other lines
+  // are ignored if valid is not set, therefore we only need to multiplex those
+  logic [N_PORTS-1:0]                         int_m1_awvalid;
+  logic [N_PORTS-1:0]                         int_m1_awready;
+
+  logic [N_PORTS-1:0]                         int_m1_wvalid;
+  logic [N_PORTS-1:0]                         int_m1_wready;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m1_bid;
+  logic [N_PORTS-1:0]                   [1:0] int_m1_bresp;
+  logic [N_PORTS-1:0]                         int_m1_bvalid;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m1_buser;
+  logic [N_PORTS-1:0]                         int_m1_bready;
+
+  logic [N_PORTS-1:0]                         int_m1_arvalid;
+  logic [N_PORTS-1:0]                         int_m1_arready;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m1_rid;
+  logic [N_PORTS-1:0]                   [1:0] int_m1_rresp;
+  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_m1_rdata;
+  logic [N_PORTS-1:0]                         int_m1_rlast;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m1_ruser;
+  logic [N_PORTS-1:0]                         int_m1_rvalid;
+  logic [N_PORTS-1:0]                         int_m1_rready;
+
+  logic [N_PORTS-1:0]                         l1_m1_ar_accept;
+  logic [N_PORTS-1:0]                         l1_m1_ar_drop;
+  logic [N_PORTS-1:0]                         l1_m1_ar_save;
+  logic [N_PORTS-1:0]                         l1_m1_ar_done;
+  logic [N_PORTS-1:0]                         l2_m1_ar_accept;
+  logic [N_PORTS-1:0]                         l2_m1_ar_drop;
+  logic [N_PORTS-1:0]                         l2_m1_ar_done;
+
+  logic [N_PORTS-1:0]                         l1_m1_aw_accept;
+  logic [N_PORTS-1:0]                         l1_m1_aw_drop;
+  logic [N_PORTS-1:0]                         l1_m1_aw_save;
+  logic [N_PORTS-1:0]                         l1_m1_aw_done;
+  logic [N_PORTS-1:0]                         l2_m1_aw_accept;
+  logic [N_PORTS-1:0]                         l2_m1_aw_drop;
+  logic [N_PORTS-1:0]                         l2_m1_aw_done;
+
+  // L1 outputs
+  logic [N_PORTS-1:0]                         rab_miss; // L1 RAB miss
+  logic [N_PORTS-1:0]                         rab_prot;
+  logic [N_PORTS-1:0]                         rab_multi;
+  logic [N_PORTS-1:0]                         rab_prefetch;
+
+  //
+  // Signals used to support L2 TLB
+  //
+  // L2 RAM configuration signals
+  logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] L2CfgWData_D;
+  logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] L2CfgWAddr_D;
+  logic [N_PORTS-1:0]                           L2CfgWE_S;
+
+  // L1 output and drop Buffer
+  logic [N_PORTS-1:0]                           L1OutRwType_D, L1DropRwType_DP;
+  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L1OutUser_D, L1DropUser_DP;
+  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L1OutId_D, L1DropId_DP;
+  logic [N_PORTS-1:0]                     [7:0] L1OutLen_D, L1DropLen_DP;
+  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L1OutAddr_D, L1DropAddr_DP;
+  logic [N_PORTS-1:0]                           L1OutProt_D, L1DropProt_DP;
+  logic [N_PORTS-1:0]                           L1OutMulti_D, L1DropMulti_DP;
+  logic [N_PORTS-1:0]                           L1DropEn_S;
+  logic [N_PORTS-1:0]                           L1DropPrefetch_S;
+
+  logic [N_PORTS-1:0]                           L1DropValid_SN, L1DropValid_SP;
+
+  // L2 input Buffer
+  logic [N_PORTS-1:0]                           L2InRwType_DP;
+  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L2InUser_DP;
+  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L2InId_DP;
+  logic [N_PORTS-1:0]                     [7:0] L2InLen_DP;
+  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L2InAddr_DP;
+  logic [N_PORTS-1:0]                           L2InEn_S;
+
+  // L2 output Buffer
+  logic [N_PORTS-1:0]                           L2OutRwType_DP;
+  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L2OutUser_DP;
+  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L2OutId_DP;
+  logic [N_PORTS-1:0]                     [7:0] L2OutLen_DP;
+  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L2OutInAddr_DP;
+
+  logic [N_PORTS-1:0]                           L2OutHit_SN, L2OutHit_SP;
+  logic [N_PORTS-1:0]                           L2OutMiss_SN, L2OutMiss_SP;
+  logic [N_PORTS-1:0]                           L2OutProt_SN, L2OutProt_SP;
+  logic [N_PORTS-1:0]                           L2OutMulti_SN, L2OutMulti_SP;
+  logic [N_PORTS-1:0]                           L2OutCC_SN, L2OutCC_SP;
+  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] L2OutAddr_DN, L2OutAddr_DP;
+
+  logic [N_PORTS-1:0]                           L2OutValid_SN, L2OutValid_SP;
+  logic [N_PORTS-1:0]                           L2OutPrefetch_S;
+  logic [N_PORTS-1:0]                           L2OutReady_S;
+  logic [N_PORTS-1:0]                           L2OutEn_S;
+
+   // L2 outputs
+  logic [N_PORTS-1:0]                           L2Busy_S;
+  logic [N_PORTS-1:0]                           L2OutValid_S;
+
+  logic [N_PORTS-1:0]                           L2Miss_S;
+
+  // Signals for interfacing the AXI modules
+  logic [N_PORTS-1:0]                           l1_ar_accept;
+  logic [N_PORTS-1:0]                           l1_aw_accept;
+  logic [N_PORTS-1:0]                           l1_w_accept;
+  logic [N_PORTS-1:0]                           l1_xw_accept;
+
+  logic [N_PORTS-1:0]                           l1_ar_drop;
+  logic [N_PORTS-1:0]                           l1_aw_drop;
+  logic [N_PORTS-1:0]                           l1_w_drop;
+  logic [N_PORTS-1:0]                           l1_xw_drop;
+
+  logic [N_PORTS-1:0]                           l1_ar_save;
+  logic [N_PORTS-1:0]                           l1_aw_save;
+  logic [N_PORTS-1:0]                           l1_w_save;
+  logic [N_PORTS-1:0]                           l1_xw_save;
+
+  logic [N_PORTS-1:0]                           l1_ar_done;
+  logic [N_PORTS-1:0]                           l1_r_done;
+  logic [N_PORTS-1:0]                           l1_r_drop;
+  logic [N_PORTS-1:0]                           lx_r_drop;
+  logic [N_PORTS-1:0]                           lx_r_done;
+
+  logic [N_PORTS-1:0]                           l1_aw_done;
+  logic [N_PORTS-1:0]                           l1_w_done;
+  logic [N_PORTS-1:0]                           l1_xw_done;
+  logic [N_PORTS-1:0]                           l1_aw_done_SP;
+  logic [N_PORTS-1:0]                           l1_w_done_SP;
+
+  logic [N_PORTS-1:0]                           l2_ar_accept;
+  logic [N_PORTS-1:0]                           l2_aw_accept;
+  logic [N_PORTS-1:0]                           l2_w_accept;
+  logic [N_PORTS-1:0]                           l2_xw_accept;
+
+  logic [N_PORTS-1:0]                           l2_ar_drop;
+  logic [N_PORTS-1:0]                           l2_r_drop;
+  logic [N_PORTS-1:0]                           l2_xr_drop;
+  logic [N_PORTS-1:0]                           l2_aw_drop;
+  logic [N_PORTS-1:0]                           l2_w_drop;
+  logic [N_PORTS-1:0]                           l2_xw_drop;
+
+  logic [N_PORTS-1:0]                           l2_aw_done;
+  logic [N_PORTS-1:0]                           l2_w_done;
+  logic [N_PORTS-1:0]                           l2_xw_done;
+  logic [N_PORTS-1:0]                           l2_aw_done_SP;
+  logic [N_PORTS-1:0]                           l2_w_done_SP;
+
+  logic [N_PORTS-1:0]                           l2_ar_done;
+  logic [N_PORTS-1:0]                           l2_r_done;
+  logic [N_PORTS-1:0]                           l2_xr_done;
+  logic [N_PORTS-1:0]                           l2_ar_done_SP;
+  logic [N_PORTS-1:0]                           l2_r_done_SP;
+
+  logic [N_PORTS-1:0]                           l1_mx_aw_done;
+  logic [N_PORTS-1:0]                           l1_mx_ar_done;
+  logic [N_PORTS-1:0]                           l1_m0_aw_done_SP;
+  logic [N_PORTS-1:0]                           l1_m0_ar_done_SP;
+  logic [N_PORTS-1:0]                           l1_m1_aw_done_SP;
+  logic [N_PORTS-1:0]                           l1_m1_ar_done_SP;
+
+  logic [N_PORTS-1:0]                           l2_mx_aw_done;
+  logic [N_PORTS-1:0]                           l2_mx_ar_done;
+  logic [N_PORTS-1:0]                           l2_m0_aw_done_SP;
+  logic [N_PORTS-1:0]                           l2_m0_ar_done_SP;
+  logic [N_PORTS-1:0]                           l2_m1_aw_done_SP;
+  logic [N_PORTS-1:0]                           l2_m1_ar_done_SP;
+
+  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] l1_id_drop, lx_id_drop, b_id_drop;
+  logic [N_PORTS-1:0]                     [7:0] l1_len_drop, lx_len_drop;
+  logic [N_PORTS-1:0]                           l1_prefetch_drop, lx_prefetch_drop, b_prefetch_drop;
+  logic [N_PORTS-1:0]                           l1_hit_drop, lx_hit_drop, b_hit_drop;
+
+  logic [N_PORTS-1:0]                           b_drop;
+  logic [N_PORTS-1:0]                           b_done;
+
+  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] l2_aw_addr;
+  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] l2_ar_addr;
+
+  logic [N_PORTS-1:0]                           l2_cache_coherent;
+  logic [N_PORTS-1:0]                           l2_master_select;
+
+  logic [N_PORTS-1:0]                           aw_in_stall;
+  logic [N_PORTS-1:0]                           aw_out_stall;
+
+  genvar                                        i;
+
+  // RRESP FSM
+  typedef enum logic                    {IDLE, BUSY} r_resp_mux_ctrl_state_t;
+  r_resp_mux_ctrl_state_t [N_PORTS-1:0] RRespMuxCtrl_SN, RRespMuxCtrl_SP;
+  logic                   [N_PORTS-1:0] RRespSel_SN, RRespSel_SP;
+  logic                   [N_PORTS-1:0] RRespBurst_S;
+  logic                   [N_PORTS-1:0] RRespSelIm_S;
+
+  // }}}
+
+  // Local parameters {{{
+
+  // Enable L2 for select ports
+  localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
+
+  // L2TLB parameters
+  localparam integer HUM_BUFFER_DEPTH = (N_L2_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS)+13;
+
+  // }}}
+
+  // Derive `master_select` from cache coherency flag. {{{
+  `ifdef EN_ACP
+    assign int_wmaster_select = int_wtrans_cache_coherent;
+    assign int_rmaster_select = int_rtrans_cache_coherent;
+    assign l2_master_select   = l2_cache_coherent;
+  `else
+    assign int_wmaster_select = '0;
+    assign int_rmaster_select = '0;
+    assign l2_master_select   = '0;
+  `endif
+  // }}}
+
+  // Buf and Send {{{
+  // ██████╗ ██╗   ██╗███████╗       ██╗       ███████╗███████╗███╗   ██╗██████╗
+  // ██╔══██╗██║   ██║██╔════╝       ██║       ██╔════╝██╔════╝████╗  ██║██╔══██╗
+  // ██████╔╝██║   ██║█████╗      ████████╗    ███████╗█████╗  ██╔██╗ ██║██║  ██║
+  // ██╔══██╗██║   ██║██╔══╝      ██╔═██╔═╝    ╚════██║██╔══╝  ██║╚██╗██║██║  ██║
+  // ██████╔╝╚██████╔╝██║         ██████║      ███████║███████╗██║ ╚████║██████╔╝
+  // ╚═════╝  ╚═════╝ ╚═╝         ╚═════╝      ╚══════╝╚══════╝╚═╝  ╚═══╝╚═════╝
+  //
+  logic[N_PORTS-1:0] m0_write_is_burst, m0_read_is_burst;
+  logic[N_PORTS-1:0] m1_write_is_burst, m1_read_is_burst;
+
+  generate for (i = 0; i < N_PORTS; i++) begin : BUF_AND_SEND
+
+  // Write Address channel (aw) {{{
+  /*
+   * write address channel (aw)
+   *
+   * âââ    ââââââââââ ââââââââââââââââââââ     ââââââ âââââââ âââââââ âââââââ
+   * âââ    âââââââââââââââââââââââââââââââ    ââââââââââââââââââââââââââââââââ
+   * âââ ââ ââââââââââââââ   âââ   ââââââ      âââââââââââ  ââââââ  âââââââââââ
+   * âââââââââââââââââââââ   âââ   ââââââ      âââââââââââ  ââââââ  âââââââââââ
+   * âââââââââââââ  ââââââ   âââ   ââââââââ    âââ  ââââââââââââââââââââââ  âââ
+   *  ââââââââ âââ  ââââââ   âââ   ââââââââ    âââ  ââââââââââ âââââââ âââ  âââ
+   *
+   */
+
+  // AW buffer of port i: sits between the slave-side write address channel (s_axi4_aw*[i])
+  // and the internal int_aw*[i] signals consumed by the two AW senders below.
+  axi4_aw_buffer
+    #(
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_aw_buffer
+    (
+      .axi4_aclk       ( Clk_CI             ),
+      .axi4_arstn      ( Rst_RBI            ),
+      // Slave side (from the external master)
+      .s_axi4_awid     ( s_axi4_awid[i]     ),
+      .s_axi4_awaddr   ( s_axi4_awaddr[i]   ),
+      .s_axi4_awvalid  ( s_axi4_awvalid[i]  ),
+      .s_axi4_awready  ( s_axi4_awready[i]  ),
+      .s_axi4_awlen    ( s_axi4_awlen[i]    ),
+      .s_axi4_awsize   ( s_axi4_awsize[i]   ),
+      .s_axi4_awburst  ( s_axi4_awburst[i]  ),
+      .s_axi4_awlock   ( s_axi4_awlock[i]   ),
+      .s_axi4_awprot   ( s_axi4_awprot[i]   ),
+      .s_axi4_awcache  ( s_axi4_awcache[i]  ),
+      .s_axi4_awregion ( s_axi4_awregion[i] ),
+      .s_axi4_awqos    ( s_axi4_awqos[i]    ),
+      .s_axi4_awuser   ( s_axi4_awuser[i]   ),
+      // Internal side (towards the AW senders)
+      .m_axi4_awid     ( int_awid[i]        ),
+      .m_axi4_awaddr   ( int_awaddr[i]      ),
+      .m_axi4_awvalid  ( int_awvalid[i]     ),
+      .m_axi4_awready  ( int_awready[i]     ),
+      .m_axi4_awlen    ( int_awlen[i]       ),
+      .m_axi4_awsize   ( int_awsize[i]      ),
+      .m_axi4_awburst  ( int_awburst[i]     ),
+      .m_axi4_awlock   ( int_awlock[i]      ),
+      .m_axi4_awprot   ( int_awprot[i]      ),
+      .m_axi4_awcache  ( int_awcache[i]     ),
+      .m_axi4_awregion ( int_awregion[i]    ),
+      .m_axi4_awqos    ( int_awqos[i]       ),
+      .m_axi4_awuser   ( int_awuser[i]      )
+    );
+
+  // AW sender for master port 0 of port i. It receives the buffered request attributes
+  // (int_aw*[i]), an L1 address (int_wtrans_addr[i]) and an L2 address (l2_aw_addr[i]) plus
+  // the L1/L2 accept/drop/save/done handshakes, and drives m0_axi4_aw*[i].
+  // m_axi4_awcache is left unconnected: m0_axi4_awcache[i] is driven by the AxCACHE logic
+  // that follows this instance.
+  axi4_aw_sender
+    #(
+      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
+      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
+      )
+    u_aw_sender_m0
+    (
+      .axi4_aclk       ( Clk_CI                ),
+      .axi4_arstn      ( Rst_RBI               ),
+      .l1_done_o       ( l1_m0_aw_done[i]      ),
+      .l1_accept_i     ( l1_m0_aw_accept[i]    ),
+      .l1_drop_i       ( l1_m0_aw_drop[i]      ),
+      .l1_save_i       ( l1_m0_aw_save[i]      ),
+      .l2_done_o       ( l2_m0_aw_done[i]      ),
+      .l2_accept_i     ( l2_m0_aw_accept[i]    ),
+      .l2_drop_i       ( l2_m0_aw_drop[i]      ),
+      .l2_sending_o    ( l2_m0_aw_sending[i]   ), // used below to select m0_axi4_awcache
+      .l1_awaddr_i     ( int_wtrans_addr[i]    ),
+      .l2_awaddr_i     ( l2_aw_addr[i]         ),
+      .s_axi4_awid     ( int_awid[i]           ),
+      .s_axi4_awvalid  ( int_m0_awvalid[i]     ),
+      .s_axi4_awready  ( int_m0_awready[i]     ),
+      .s_axi4_awlen    ( int_awlen[i]          ),
+      .s_axi4_awsize   ( int_awsize[i]         ),
+      .s_axi4_awburst  ( int_awburst[i]        ),
+      .s_axi4_awlock   ( int_awlock[i]         ),
+      .s_axi4_awprot   ( int_awprot[i]         ),
+      .s_axi4_awcache  ( int_awcache[i]        ),
+      .s_axi4_awregion ( int_awregion[i]       ),
+      .s_axi4_awqos    ( int_awqos[i]          ),
+      .s_axi4_awuser   ( int_awuser[i]         ),
+      .m_axi4_awid     ( m0_axi4_awid[i]       ),
+      .m_axi4_awaddr   ( m0_axi4_awaddr[i]     ),
+      .m_axi4_awvalid  ( m0_axi4_awvalid[i]    ),
+      .m_axi4_awready  ( m0_axi4_awready[i]    ),
+      .m_axi4_awlen    ( m0_axi4_awlen[i]      ),
+      .m_axi4_awsize   ( m0_axi4_awsize[i]     ),
+      .m_axi4_awburst  ( m0_axi4_awburst[i]    ),
+      .m_axi4_awlock   ( m0_axi4_awlock[i]     ),
+      .m_axi4_awprot   ( m0_axi4_awprot[i]     ),
+      .m_axi4_awcache  (                       ),
+      .m_axi4_awregion ( m0_axi4_awregion[i]   ),
+      .m_axi4_awqos    ( m0_axi4_awqos[i]      ),
+      .m_axi4_awuser   ( m0_axi4_awuser[i]     )
+    );
+
+  // The AXCACHE signals are set according to burstiness and cache coherence or statically
+  // when not connected to ACP on Zynq (implemented below).
+  //
+  // A write on master port 0 counts as a burst when AWLEN is non-zero and AWBURST is not 2'b00.
+    assign m0_write_is_burst[i] = (m0_axi4_awlen[i] != {8{1'b0}}) && (m0_axi4_awburst[i] != 2'b00);
+  `ifndef EN_ACP
+    // Without EN_ACP: if the L2 path is currently sending a cache-coherent transaction, or
+    // int_wtrans_cache_coherent[i] is set, AWCACHE is 4'b0111 for bursts and 4'b1111 for
+    // non-bursts. All other transactions use 4'b0011.
+    // NOTE(review): master port 1 uses 4'b1011 for write bursts while this path uses
+    // 4'b0111 -- confirm both encodings against the AxCACHE table of the target interconnect.
+    always_comb begin
+      if ( (l2_m0_aw_sending[i] & l2_cache_coherent[i]) | int_wtrans_cache_coherent[i]) begin
+        if (m0_write_is_burst[i]) begin
+          m0_axi4_awcache[i]  = 4'b0111;
+        end else begin
+          m0_axi4_awcache[i]  = 4'b1111;
+        end
+      end else begin
+        m0_axi4_awcache[i]    = 4'b0011;
+      end
+    end
+  `else
+    // With EN_ACP: master port 0 always uses the static value 4'b0011.
+    assign m0_axi4_awcache[i] = 4'b0011;
+  `endif
+
+  // AW sender for master port 1 of port i. It is wired like u_aw_sender_m0, but uses the
+  // l1_m1_* / l2_m1_* handshakes and int_m1_awvalid/awready, and drives m1_axi4_aw*[i].
+  // l2_sending_o is unused here and m_axi4_awcache is left unconnected: m1_axi4_awcache[i]
+  // is driven by the AxCACHE logic that follows this instance.
+  axi4_aw_sender
+    #(
+      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
+      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
+      )
+    u_aw_sender_m1
+    (
+      .axi4_aclk       ( Clk_CI                ),
+      .axi4_arstn      ( Rst_RBI               ),
+      .l1_accept_i     ( l1_m1_aw_accept[i]    ),
+      .l1_drop_i       ( l1_m1_aw_drop[i]      ),
+      .l1_save_i       ( l1_m1_aw_save[i]      ),
+      .l1_done_o       ( l1_m1_aw_done[i]      ),
+      .l2_accept_i     ( l2_m1_aw_accept[i]    ),
+      .l2_drop_i       ( l2_m1_aw_drop[i]      ),
+      .l2_done_o       ( l2_m1_aw_done[i]      ),
+      .l2_sending_o    (                       ), // just helps to set axcache
+      .l1_awaddr_i     ( int_wtrans_addr[i]    ),
+      .l2_awaddr_i     ( l2_aw_addr[i]         ),
+      .s_axi4_awid     ( int_awid[i]           ),
+      .s_axi4_awvalid  ( int_m1_awvalid[i]     ),
+      .s_axi4_awready  ( int_m1_awready[i]     ),
+      .s_axi4_awlen    ( int_awlen[i]          ),
+      .s_axi4_awsize   ( int_awsize[i]         ),
+      .s_axi4_awburst  ( int_awburst[i]        ),
+      .s_axi4_awlock   ( int_awlock[i]         ),
+      .s_axi4_awprot   ( int_awprot[i]         ),
+      .s_axi4_awcache  ( int_awcache[i]        ),
+      .s_axi4_awregion ( int_awregion[i]       ),
+      .s_axi4_awqos    ( int_awqos[i]          ),
+      .s_axi4_awuser   ( int_awuser[i]         ),
+      .m_axi4_awid     ( m1_axi4_awid[i]       ),
+      .m_axi4_awaddr   ( m1_axi4_awaddr[i]     ),
+      .m_axi4_awvalid  ( m1_axi4_awvalid[i]    ),
+      .m_axi4_awready  ( m1_axi4_awready[i]    ),
+      .m_axi4_awlen    ( m1_axi4_awlen[i]      ),
+      .m_axi4_awsize   ( m1_axi4_awsize[i]     ),
+      .m_axi4_awburst  ( m1_axi4_awburst[i]    ),
+      .m_axi4_awlock   ( m1_axi4_awlock[i]     ),
+      .m_axi4_awprot   ( m1_axi4_awprot[i]     ),
+      .m_axi4_awcache  (                       ),
+      .m_axi4_awregion ( m1_axi4_awregion[i]   ),
+      .m_axi4_awqos    ( m1_axi4_awqos[i]      ),
+      .m_axi4_awuser   ( m1_axi4_awuser[i]     )
+    );
+
+    // The AXCACHE signals are set according to burstiness and cache coherence or statically
+    // when not connected to ACP on Zynq (implemented below).
+    //
+    // A write on master port 1 counts as a burst when AWLEN is non-zero and AWBURST is not
+    // 2'b00.
+      assign m1_write_is_burst[i] = (m1_axi4_awlen[i] != {8{1'b0}}) && (m1_axi4_awburst[i] != 2'b00);
+    `ifdef EN_ACP
+      // With EN_ACP: AWCACHE depends on burstiness only (4'b1011 for bursts, 4'b1111
+      // otherwise); no coherency flag is evaluated on this port.
+      always_comb begin
+        if (m1_write_is_burst[i]) begin
+          m1_axi4_awcache[i]    = 4'b1011;
+        end else begin
+          m1_axi4_awcache[i]    = 4'b1111;
+        end
+      end
+    `else
+      // Without EN_ACP: master port 1 always uses the static value 4'b0011.
+      assign m1_axi4_awcache[i] = 4'b0011;
+    `endif
+
+  // }}}
+
+  // Write Data channel (w) {{{
+  /*
+   * write data channel (w)
+   *
+   * âââ    ââââââââââ ââââââââââââââââââââ    âââââââ  ââââââ âââââââââ ââââââ
+   * âââ    âââââââââââââââââââââââââââââââ    âââââââââââââââââââââââââââââââââ
+   * âââ ââ ââââââââââââââ   âââ   ââââââ      âââ  âââââââââââ   âââ   ââââââââ
+   * âââââââââââââââââââââ   âââ   ââââââ      âââ  âââââââââââ   âââ   ââââââââ
+   * âââââââââââââ  ââââââ   âââ   ââââââââ    âââââââââââ  âââ   âââ   âââ  âââ
+   *  ââââââââ âââ  ââââââ   âââ   ââââââââ    âââââââ âââ  âââ   âââ   âââ  âââ
+   *
+   */
+  // W buffer of port i: sits between the slave-side write data channel (s_axi4_w*[i]) and
+  // the internal int_w*[i] signals. Besides the data path it takes the L1 and L2
+  // accept/save/drop handshakes and per-transaction metadata, and produces
+  //   - w_master_select[i], which steers the W multiplexer below,
+  //   - the AW stall signals aw_in_stall[i] / aw_out_stall[i],
+  //   - the drop interface (b_drop, b_id_drop, ...) consumed by u_b_sender.
+  axi4_w_buffer
+    #(
+      .AXI_DATA_WIDTH   ( AXI_DATA_WIDTH   ),
+      .AXI_ID_WIDTH     ( AXI_ID_WIDTH     ),
+      .AXI_USER_WIDTH   ( AXI_USER_WIDTH   ),
+      .ENABLE_L2TLB     ( ENABLE_L2TLB[i]  ),
+      .HUM_BUFFER_DEPTH ( HUM_BUFFER_DEPTH )
+      )
+    u_w_buffer
+    (
+      .axi4_aclk       ( Clk_CI                ),
+      .axi4_arstn      ( Rst_RBI               ),
+
+      // L1 interface
+      .l1_done_o       ( l1_w_done[i]          ),
+      .l1_accept_i     ( l1_w_accept[i]        ),
+      .l1_save_i       ( l1_w_save[i]          ),
+      .l1_drop_i       ( l1_w_drop[i]          ),
+      .l1_master_i     ( int_wmaster_select[i] ),
+      .l1_id_i         ( l1_id_drop[i]         ),
+      .l1_len_i        ( l1_len_drop[i]        ),
+      .l1_prefetch_i   ( l1_prefetch_drop[i]   ),
+      .l1_hit_i        ( l1_hit_drop[i]        ),
+
+      // L2 interface
+      .l2_done_o       ( l2_w_done[i]          ),
+      .l2_accept_i     ( l2_w_accept[i]        ),
+      .l2_drop_i       ( l2_w_drop[i]          ),
+      .l2_master_i     ( l2_master_select[i]   ),
+      .l2_id_i         ( lx_id_drop[i]         ),
+      .l2_len_i        ( lx_len_drop[i]        ),
+      .l2_prefetch_i   ( lx_prefetch_drop[i]   ),
+      .l2_hit_i        ( lx_hit_drop[i]        ),
+
+      // Top-level control outputs
+      .master_select_o ( w_master_select[i]    ),
+      .input_stall_o   ( aw_in_stall[i]        ), // stall L1 AW input if request buffers full
+      .output_stall_o  ( aw_out_stall[i]       ), // stall L1 AW hit forwarding if bypass not possible
+
+      // B sender interface
+      .b_drop_o        ( b_drop[i]             ),
+      .b_done_i        ( b_done[i]             ),
+      .id_o            ( b_id_drop[i]          ),
+      .prefetch_o      ( b_prefetch_drop[i]    ),
+      .hit_o           ( b_hit_drop[i]         ),
+
+      // AXI W channel interfaces
+      .s_axi4_wdata    ( s_axi4_wdata[i]       ),
+      .s_axi4_wvalid   ( s_axi4_wvalid[i]      ),
+      .s_axi4_wready   ( s_axi4_wready[i]      ),
+      .s_axi4_wstrb    ( s_axi4_wstrb[i]       ),
+      .s_axi4_wlast    ( s_axi4_wlast[i]       ),
+      .s_axi4_wuser    ( s_axi4_wuser[i]       ),
+      .m_axi4_wdata    ( int_wdata[i]          ),
+      .m_axi4_wvalid   ( int_wvalid[i]         ),
+      .m_axi4_wready   ( int_wready[i]         ),
+      .m_axi4_wstrb    ( int_wstrb[i]          ),
+      .m_axi4_wlast    ( int_wlast[i]          ),
+      .m_axi4_wuser    ( int_wuser[i]          )
+    );
+
+  // W sender for master port 0 of port i. Data, strobe, last and user come straight from the
+  // W buffer (int_w*[i]); valid/ready go through the W multiplexer below
+  // (int_m0_wvalid / int_m0_wready).
+  axi4_w_sender
+    #(
+      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_w_sender_m0
+    (
+      .axi4_aclk       ( Clk_CI            ),
+      .axi4_arstn      ( Rst_RBI           ),
+      .s_axi4_wdata    ( int_wdata[i]      ),
+      .s_axi4_wvalid   ( int_m0_wvalid[i]  ),
+      .s_axi4_wready   ( int_m0_wready[i]  ),
+      .s_axi4_wstrb    ( int_wstrb[i]      ),
+      .s_axi4_wlast    ( int_wlast[i]      ),
+      .s_axi4_wuser    ( int_wuser[i]      ),
+      .m_axi4_wdata    ( m0_axi4_wdata[i]  ),
+      .m_axi4_wvalid   ( m0_axi4_wvalid[i] ),
+      .m_axi4_wready   ( m0_axi4_wready[i] ),
+      .m_axi4_wstrb    ( m0_axi4_wstrb[i]  ),
+      .m_axi4_wlast    ( m0_axi4_wlast[i]  ),
+      .m_axi4_wuser    ( m0_axi4_wuser[i]  )
+    );
+
+  // W sender for master port 1 of port i. It shares the W buffer payload (int_w*[i]) with
+  // u_w_sender_m0; only valid/ready differ (int_m1_wvalid / int_m1_wready from the W
+  // multiplexer below).
+  axi4_w_sender
+    #(
+      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+
+      )
+    u_w_sender_m1
+    (
+      .axi4_aclk       ( Clk_CI            ),
+      .axi4_arstn      ( Rst_RBI           ),
+      .s_axi4_wdata    ( int_wdata[i]      ),
+      .s_axi4_wvalid   ( int_m1_wvalid[i]  ),
+      .s_axi4_wready   ( int_m1_wready[i]  ),
+      .s_axi4_wstrb    ( int_wstrb[i]      ),
+      .s_axi4_wlast    ( int_wlast[i]      ),
+      .s_axi4_wuser    ( int_wuser[i]      ),
+      .m_axi4_wdata    ( m1_axi4_wdata[i]  ),
+      .m_axi4_wvalid   ( m1_axi4_wvalid[i] ),
+      .m_axi4_wready   ( m1_axi4_wready[i] ),
+      .m_axi4_wstrb    ( m1_axi4_wstrb[i]  ),
+      .m_axi4_wlast    ( m1_axi4_wlast[i]  ),
+      .m_axi4_wuser    ( m1_axi4_wuser[i]  )
+    );
+
+  /*
+   * Write data (w) channel steering for port i: the buffered W beat is offered to exactly one
+   * of the two master-side W senders, chosen by w_master_select[i] (0 -> master 0,
+   * otherwise -> master 1). The ready of the selected sender is fed back to the W buffer.
+   */
+  always_comb begin
+    // By default neither sender sees a valid beat; the selected one is enabled below.
+    int_m0_wvalid[i] = 1'b0;
+    int_m1_wvalid[i] = 1'b0;
+
+    if (w_master_select[i] == 1'b0) begin
+      // Master 0 selected
+      int_m0_wvalid[i] = int_wvalid[i];
+      int_wready[i]    = int_m0_wready[i];
+    end else begin
+      // Master 1 selected
+      int_m1_wvalid[i] = int_wvalid[i];
+      int_wready[i]    = int_m1_wready[i];
+    end
+  end
+
+  // }}}
+
+  // Write Response channel (b) {{{
+  /*
+   * write response channel (b)
+   *
+   * âââ    ââââââââââ ââââââââââââââââââââ    âââââââ âââââââââââââââââââââââ
+   * âââ    âââââââââââââââââââââââââââââââ    ââââââââââââââââââââââââââââââââ
+   * âââ ââ ââââââââââââââ   âââ   ââââââ      ââââââââââââââ  ââââââââââââââââ
+   * âââââââââââââââââââââ   âââ   ââââââ      ââââââââââââââ  âââââââââââââââ
+   * âââââââââââââ  ââââââ   âââ   ââââââââ    âââ  ââââââââââââââââââââââ
+   *  ââââââââ âââ  ââââââ   âââ   ââââââââ    âââ  ââââââââââââââââââââââ
+   *
+   */
+  // B buffer for master port 0 of port i: connects the write response arriving on
+  // m0_axi4_b*[i] to the internal int_m0_b*[i] signals that feed the B multiplexer below.
+  axi4_b_buffer
+    #(
+        .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+        .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_b_buffer_m0
+    (
+      .axi4_aclk     ( Clk_CI            ),
+      .axi4_arstn    ( Rst_RBI           ),
+      .s_axi4_bid    ( int_m0_bid[i]     ),
+      .s_axi4_bresp  ( int_m0_bresp[i]   ),
+      .s_axi4_bvalid ( int_m0_bvalid[i]  ),
+      .s_axi4_buser  ( int_m0_buser[i]   ),
+      .s_axi4_bready ( int_m0_bready[i]  ),
+      .m_axi4_bid    ( m0_axi4_bid[i]    ),
+      .m_axi4_bresp  ( m0_axi4_bresp[i]  ),
+      .m_axi4_bvalid ( m0_axi4_bvalid[i] ),
+      .m_axi4_buser  ( m0_axi4_buser[i]  ),
+      .m_axi4_bready ( m0_axi4_bready[i] )
+    );
+
+  // B buffer for master port 1 of port i: connects the write response arriving on
+  // m1_axi4_b*[i] to the internal int_m1_b*[i] signals that feed the B multiplexer below.
+  axi4_b_buffer
+    #(
+        .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+        .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_b_buffer_m1
+    (
+      .axi4_aclk      ( Clk_CI            ),
+      .axi4_arstn     ( Rst_RBI           ),
+      .s_axi4_bid     ( int_m1_bid[i]     ),
+      .s_axi4_bresp   ( int_m1_bresp[i]   ),
+      .s_axi4_bvalid  ( int_m1_bvalid[i]  ),
+      .s_axi4_buser   ( int_m1_buser[i]   ),
+      .s_axi4_bready  ( int_m1_bready[i]  ),
+      .m_axi4_bid     ( m1_axi4_bid[i]    ),
+      .m_axi4_bresp   ( m1_axi4_bresp[i]  ),
+      .m_axi4_bvalid  ( m1_axi4_bvalid[i] ),
+      .m_axi4_buser   ( m1_axi4_buser[i]  ),
+      .m_axi4_bready  ( m1_axi4_bready[i] )
+    );
+
+  // B sender of port i: drives the slave-side write response channel (s_axi4_b*[i]) from the
+  // multiplexed master responses (int_b*[i]). The drop/done/id/prefetch/hit ports are wired
+  // to the "B sender interface" outputs of u_w_buffer.
+  axi4_b_sender
+    #(
+        .AXI_ID_WIDTH   ( AXI_ID_WIDTH    ),
+        .AXI_USER_WIDTH ( AXI_USER_WIDTH  )
+      )
+    u_b_sender
+    (
+      .axi4_aclk      ( Clk_CI             ),
+      .axi4_arstn     ( Rst_RBI            ),
+      .drop_i         ( b_drop[i]          ),
+      .done_o         ( b_done[i]          ),
+      .id_i           ( b_id_drop[i]       ),
+      .prefetch_i     ( b_prefetch_drop[i] ),
+      .hit_i          ( b_hit_drop[i]      ),
+      .s_axi4_bid     ( s_axi4_bid[i]      ),
+      .s_axi4_bresp   ( s_axi4_bresp[i]    ),
+      .s_axi4_bvalid  ( s_axi4_bvalid[i]   ),
+      .s_axi4_buser   ( s_axi4_buser[i]    ),
+      .s_axi4_bready  ( s_axi4_bready[i]   ),
+      .m_axi4_bid     ( int_bid[i]         ),
+      .m_axi4_bresp   ( int_bresp[i]       ),
+      .m_axi4_bvalid  ( int_bvalid[i]      ),
+      .m_axi4_buser   ( int_buser[i]       ),
+      .m_axi4_bready  ( int_bready[i]      )
+    );
+
+  /*
+   * Write response (b) channel multiplexer for port i.
+   *
+   * Master 1 has fixed priority: whenever it presents a valid response, that response is
+   * forwarded and master 0 is held off with bready = 0. Otherwise master 0 is forwarded.
+   */
+  always_comb begin
+    // Default: forward master 0 and stall master 1.
+    int_m0_bready[i] = int_bready[i];
+    int_m1_bready[i] = 1'b0;
+
+    int_bid[i]       = int_m0_bid[i];
+    int_bresp[i]     = int_m0_bresp[i];
+    int_buser[i]     = int_m0_buser[i];
+    int_bvalid[i]    = int_m0_bvalid[i];
+
+    // Override: master 1 has a response pending, so it takes the channel.
+    if (int_m1_bvalid[i] == 1'b1) begin
+      int_m0_bready[i] = 1'b0;
+      int_m1_bready[i] = int_bready[i];
+
+      int_bid[i]       = int_m1_bid[i];
+      int_bresp[i]     = int_m1_bresp[i];
+      int_buser[i]     = int_m1_buser[i];
+      int_bvalid[i]    = int_m1_bvalid[i];
+    end
+  end
+
+  // }}}
+
+  // Read Address channel (ar) {{{
+  /*
+   * read address channel (ar)
+   *
+   * âââââââ ââââââââ ââââââ âââââââ      ââââââ âââââââ âââââââ âââââââ
+   * ââââââââââââââââââââââââââââââââ    ââââââââââââââââââââââââââââââââ
+   * ââââââââââââââ  âââââââââââ  âââ    âââââââââââ  ââââââ  âââââââââââ
+   * ââââââââââââââ  âââââââââââ  âââ    âââââââââââ  ââââââ  âââââââââââ
+   * âââ  ââââââââââââââ  âââââââââââ    âââ  ââââââââââââââââââââââ  âââ
+   * âââ  ââââââââââââââ  ââââââââââ     âââ  ââââââââââ âââââââ âââ  âââ
+   *
+   */
+  // AR buffer of port i: sits between the slave-side read address channel (s_axi4_ar*[i])
+  // and the internal int_ar*[i] signals consumed by the two AR senders below.
+  axi4_ar_buffer
+    #(
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_ar_buffer
+    (
+      .axi4_aclk      ( Clk_CI            ),
+      .axi4_arstn     ( Rst_RBI           ),
+      // Slave side (from the external master)
+      .s_axi4_arid    ( s_axi4_arid[i]    ),
+      .s_axi4_araddr  ( s_axi4_araddr[i]  ),
+      .s_axi4_arvalid ( s_axi4_arvalid[i] ),
+      .s_axi4_arready ( s_axi4_arready[i] ),
+      .s_axi4_arlen   ( s_axi4_arlen[i]   ),
+      .s_axi4_arsize  ( s_axi4_arsize[i]  ),
+      .s_axi4_arburst ( s_axi4_arburst[i] ),
+      .s_axi4_arlock  ( s_axi4_arlock[i]  ),
+      .s_axi4_arprot  ( s_axi4_arprot[i]  ),
+      .s_axi4_arcache ( s_axi4_arcache[i] ),
+      .s_axi4_aruser  ( s_axi4_aruser[i]  ),
+      // Internal side (towards the AR senders)
+      .m_axi4_arid    ( int_arid[i]       ),
+      .m_axi4_araddr  ( int_araddr[i]     ),
+      .m_axi4_arvalid ( int_arvalid[i]    ),
+      .m_axi4_arready ( int_arready[i]    ),
+      .m_axi4_arlen   ( int_arlen[i]      ),
+      .m_axi4_arsize  ( int_arsize[i]     ),
+      .m_axi4_arburst ( int_arburst[i]    ),
+      .m_axi4_arlock  ( int_arlock[i]     ),
+      .m_axi4_arprot  ( int_arprot[i]     ),
+      .m_axi4_arcache ( int_arcache[i]    ),
+      .m_axi4_aruser  ( int_aruser[i]     )
+    );
+
+  // AR sender for master port 0 of port i. It receives the buffered request attributes
+  // (int_ar*[i]), an L1 address (int_rtrans_addr[i]) and an L2 address (l2_ar_addr[i]) plus
+  // the L1/L2 accept/drop/save/done handshakes, and drives m0_axi4_ar*[i].
+  // m_axi4_arcache is left unconnected: m0_axi4_arcache[i] is driven by the AxCACHE logic
+  // that follows this instance.
+  axi4_ar_sender
+    #(
+      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
+      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
+      )
+    u_ar_sender_m0
+    (
+      .axi4_aclk       ( Clk_CI                ),
+      .axi4_arstn      ( Rst_RBI               ),
+      .l1_done_o       ( l1_m0_ar_done[i]      ),
+      .l1_accept_i     ( l1_m0_ar_accept[i]    ),
+      .l1_drop_i       ( l1_m0_ar_drop[i]      ),
+      .l1_save_i       ( l1_m0_ar_save[i]      ),
+      .l2_done_o       ( l2_m0_ar_done[i]      ),
+      .l2_accept_i     ( l2_m0_ar_accept[i]    ),
+      .l2_drop_i       ( l2_m0_ar_drop[i]      ),
+      .l2_sending_o    ( l2_m0_ar_sending[i]   ), // used below to select m0_axi4_arcache
+      .l1_araddr_i     ( int_rtrans_addr[i]    ),
+      .l2_araddr_i     ( l2_ar_addr[i]         ),
+      .s_axi4_arid     ( int_arid[i]           ),
+      .s_axi4_arvalid  ( int_m0_arvalid[i]     ),
+      .s_axi4_arready  ( int_m0_arready[i]     ),
+      .s_axi4_arlen    ( int_arlen[i]          ),
+      .s_axi4_arsize   ( int_arsize[i]         ),
+      .s_axi4_arburst  ( int_arburst[i]        ),
+      .s_axi4_arlock   ( int_arlock[i]         ),
+      .s_axi4_arprot   ( int_arprot[i]         ),
+      .s_axi4_arcache  ( int_arcache[i]        ),
+      .s_axi4_aruser   ( int_aruser[i]         ),
+      .m_axi4_arid     ( m0_axi4_arid[i]       ),
+      .m_axi4_araddr   ( m0_axi4_araddr[i]     ),
+      .m_axi4_arvalid  ( m0_axi4_arvalid[i]    ),
+      .m_axi4_arready  ( m0_axi4_arready[i]    ),
+      .m_axi4_arlen    ( m0_axi4_arlen[i]      ),
+      .m_axi4_arsize   ( m0_axi4_arsize[i]     ),
+      .m_axi4_arburst  ( m0_axi4_arburst[i]    ),
+      .m_axi4_arlock   ( m0_axi4_arlock[i]     ),
+      .m_axi4_arprot   ( m0_axi4_arprot[i]     ),
+      .m_axi4_arcache  (                       ),
+      .m_axi4_aruser   ( m0_axi4_aruser[i]     )
+    );
+
+    // The AXCACHE signals are set according to burstiness and cache coherence or statically
+    // when not connected to ACP on Zynq (implemented below).
+    //
+    // A read on master port 0 counts as a burst when ARLEN is non-zero and ARBURST is not
+    // 2'b00.
+      assign m0_read_is_burst[i] = (m0_axi4_arlen[i] != {8{1'b0}}) && (m0_axi4_arburst[i] != 2'b00);
+    `ifndef EN_ACP
+      // Without EN_ACP: if the L2 path is currently sending a cache-coherent transaction, or
+      // int_rtrans_cache_coherent[i] is set, ARCACHE is 4'b1011 for bursts and 4'b1111 for
+      // non-bursts. All other transactions use 4'b0011.
+      always_comb begin
+        if ( (l2_m0_ar_sending[i] & l2_cache_coherent[i]) | int_rtrans_cache_coherent[i]) begin
+          if (m0_read_is_burst[i]) begin
+            m0_axi4_arcache[i]  = 4'b1011;
+          end else begin
+            m0_axi4_arcache[i]  = 4'b1111;
+          end
+        end else begin
+          m0_axi4_arcache[i]    = 4'b0011;
+        end
+      end
+    `else
+      // With EN_ACP: master port 0 always uses the static value 4'b0011.
+      assign m0_axi4_arcache[i] = 4'b0011;
+    `endif
+
+  // AR sender for master port 1 of port i. It is wired like u_ar_sender_m0, but uses the
+  // l1_m1_* / l2_m1_* handshakes and int_m1_arvalid/arready, and drives m1_axi4_ar*[i].
+  // l2_sending_o is unused here and m_axi4_arcache is left unconnected: m1_axi4_arcache[i]
+  // is driven by the AxCACHE logic that follows this instance.
+  axi4_ar_sender
+    #(
+      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
+      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
+      )
+    u_ar_sender_m1
+    (
+      .axi4_aclk       ( Clk_CI                ),
+      .axi4_arstn      ( Rst_RBI               ),
+      .l1_done_o       ( l1_m1_ar_done[i]      ),
+      .l1_accept_i     ( l1_m1_ar_accept[i]    ),
+      .l1_drop_i       ( l1_m1_ar_drop[i]      ),
+      .l1_save_i       ( l1_m1_ar_save[i]      ),
+      .l2_done_o       ( l2_m1_ar_done[i]      ),
+      .l2_accept_i     ( l2_m1_ar_accept[i]    ),
+      .l2_drop_i       ( l2_m1_ar_drop[i]      ),
+      .l2_sending_o    (                       ), // just helps to set axcache
+      .l1_araddr_i     ( int_rtrans_addr[i]    ),
+      .l2_araddr_i     ( l2_ar_addr[i]         ),
+      .s_axi4_arid     ( int_arid[i]           ),
+      .s_axi4_arvalid  ( int_m1_arvalid[i]     ),
+      .s_axi4_arready  ( int_m1_arready[i]     ),
+      .s_axi4_arlen    ( int_arlen[i]          ),
+      .s_axi4_arsize   ( int_arsize[i]         ),
+      .s_axi4_arburst  ( int_arburst[i]        ),
+      .s_axi4_arlock   ( int_arlock[i]         ),
+      .s_axi4_arprot   ( int_arprot[i]         ),
+      .s_axi4_arcache  ( int_arcache[i]        ),
+      .s_axi4_aruser   ( int_aruser[i]         ),
+      .m_axi4_arid     ( m1_axi4_arid[i]       ),
+      .m_axi4_araddr   ( m1_axi4_araddr[i]     ),
+      .m_axi4_arvalid  ( m1_axi4_arvalid[i]    ),
+      .m_axi4_arready  ( m1_axi4_arready[i]    ),
+      .m_axi4_arlen    ( m1_axi4_arlen[i]      ),
+      .m_axi4_arsize   ( m1_axi4_arsize[i]     ),
+      .m_axi4_arburst  ( m1_axi4_arburst[i]    ),
+      .m_axi4_arlock   ( m1_axi4_arlock[i]     ),
+      .m_axi4_arprot   ( m1_axi4_arprot[i]     ),
+      .m_axi4_arcache  (                       ),
+      .m_axi4_aruser   ( m1_axi4_aruser[i]     )
+    );
+
+    // The AXCACHE signals are set according to burstiness and cache coherence or statically
+    // when not connected to ACP on Zynq (implemented below).
+    //
+    // A read on master port 1 counts as a burst when ARLEN is non-zero and ARBURST is not
+    // 2'b00.
+      assign m1_read_is_burst[i] = (m1_axi4_arlen[i] != {8{1'b0}}) && (m1_axi4_arburst[i] != 2'b00);
+    `ifdef EN_ACP
+      // With EN_ACP: ARCACHE depends on burstiness only (4'b1011 for bursts, 4'b1111
+      // otherwise); no coherency flag is evaluated on this port.
+      always_comb begin
+        if (m1_read_is_burst[i]) begin
+          m1_axi4_arcache[i]    = 4'b1011;
+        end else begin
+          m1_axi4_arcache[i]    = 4'b1111;
+        end
+      end
+    `else
+      // Without EN_ACP: master port 1 always uses the static value 4'b0011.
+      assign m1_axi4_arcache[i] = 4'b0011;
+    `endif
+
+  // }}}
+
+  // Read Response channel (r) {{{
+  /*
+   * read response channel (r)
+   *
+   * âââââââ ââââââââ ââââââ âââââââ     âââââââ âââââââââââââââââââââââ
+   * ââââââââââââââââââââââââââââââââ    ââââââââââââââââââââââââââââââââ
+   * ââââââââââââââ  âââââââââââ  âââ    ââââââââââââââ  ââââââââââââââââ
+   * ââââââââââââââ  âââââââââââ  âââ    ââââââââââââââ  âââââââââââââââ
+   * âââ  ââââââââââââââ  âââââââââââ    âââ  ââââââââââââââââââââââ
+   * âââ  ââââââââââââââ  ââââââââââ     âââ  ââââââââââââââââââââââ
+   *
+   */
+  // R buffer for master port 0 of port i: connects the read response arriving on
+  // m0_axi4_r*[i] to the internal int_m0_r*[i] signals that feed the read response
+  // multiplexer below.
+  axi4_r_buffer
+    #(
+      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_r_buffer_m0
+    (
+      .axi4_aclk     ( Clk_CI            ),
+      .axi4_arstn    ( Rst_RBI           ),
+      .s_axi4_rid    ( int_m0_rid[i]     ),
+      .s_axi4_rresp  ( int_m0_rresp[i]   ),
+      .s_axi4_rdata  ( int_m0_rdata[i]   ),
+      .s_axi4_rlast  ( int_m0_rlast[i]   ),
+      .s_axi4_rvalid ( int_m0_rvalid[i]  ),
+      .s_axi4_ruser  ( int_m0_ruser[i]   ),
+      .s_axi4_rready ( int_m0_rready[i]  ),
+      .m_axi4_rid    ( m0_axi4_rid[i]    ),
+      .m_axi4_rresp  ( m0_axi4_rresp[i]  ),
+      .m_axi4_rdata  ( m0_axi4_rdata[i]  ),
+      .m_axi4_rlast  ( m0_axi4_rlast[i]  ),
+      .m_axi4_rvalid ( m0_axi4_rvalid[i] ),
+      .m_axi4_ruser  ( m0_axi4_ruser[i]  ),
+      .m_axi4_rready ( m0_axi4_rready[i] )
+    );
+
+  // R buffer for master port 1 of port i: connects the read response arriving on
+  // m1_axi4_r*[i] to the internal int_m1_r*[i] signals that feed the read response
+  // multiplexer below.
+  axi4_r_buffer
+    #(
+      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_r_buffer_m1
+    (
+      .axi4_aclk     ( Clk_CI            ),
+      .axi4_arstn    ( Rst_RBI           ),
+      .s_axi4_rid    ( int_m1_rid[i]     ),
+      .s_axi4_rresp  ( int_m1_rresp[i]   ),
+      .s_axi4_rdata  ( int_m1_rdata[i]   ),
+      .s_axi4_rlast  ( int_m1_rlast[i]   ),
+      .s_axi4_rvalid ( int_m1_rvalid[i]  ),
+      .s_axi4_ruser  ( int_m1_ruser[i]   ),
+      .s_axi4_rready ( int_m1_rready[i]  ),
+      .m_axi4_rid    ( m1_axi4_rid[i]    ),
+      .m_axi4_rresp  ( m1_axi4_rresp[i]  ),
+      .m_axi4_rdata  ( m1_axi4_rdata[i]  ),
+      .m_axi4_rlast  ( m1_axi4_rlast[i]  ),
+      .m_axi4_rvalid ( m1_axi4_rvalid[i] ),
+      .m_axi4_ruser  ( m1_axi4_ruser[i]  ),
+      .m_axi4_rready ( m1_axi4_rready[i] )
+    );
+
+  // R sender of port i: drives the slave-side read response channel (s_axi4_r*[i]) from the
+  // multiplexed master responses (int_r*[i]). Its drop interface (drop, length, id,
+  // prefetch, hit) is fed by the lx_*_drop signals and it reports completion on
+  // lx_r_done[i].
+  axi4_r_sender
+    #(
+      .AXI_DATA_WIDTH  ( AXI_DATA_WIDTH ),
+      .AXI_ID_WIDTH    ( AXI_ID_WIDTH   ),
+      .AXI_USER_WIDTH  ( AXI_USER_WIDTH )
+      )
+    u_r_sender
+    (
+      .axi4_aclk     ( Clk_CI              ),
+      .axi4_arstn    ( Rst_RBI             ),
+      .drop_i        ( lx_r_drop[i]        ),
+      .drop_len_i    ( lx_len_drop[i]      ),
+      .done_o        ( lx_r_done[i]        ),
+      .id_i          ( lx_id_drop[i]       ),
+      .prefetch_i    ( lx_prefetch_drop[i] ),
+      .hit_i         ( lx_hit_drop[i]      ),
+      .s_axi4_rid    ( s_axi4_rid[i]       ),
+      .s_axi4_rresp  ( s_axi4_rresp[i]     ),
+      .s_axi4_rdata  ( s_axi4_rdata[i]     ),
+      .s_axi4_rlast  ( s_axi4_rlast[i]     ),
+      .s_axi4_rvalid ( s_axi4_rvalid[i]    ),
+      .s_axi4_ruser  ( s_axi4_ruser[i]     ),
+      .s_axi4_rready ( s_axi4_rready[i]    ),
+      .m_axi4_rid    ( int_rid[i]          ),
+      .m_axi4_rresp  ( int_rresp[i]        ),
+      .m_axi4_rdata  ( int_rdata[i]        ),
+      .m_axi4_rlast  ( int_rlast[i]        ),
+      .m_axi4_rvalid ( int_rvalid[i]       ),
+      .m_axi4_ruser  ( int_ruser[i]        ),
+      .m_axi4_rready ( int_rready[i]       )
+    );
+
+  /*
+   * Multiplexer to switch between the two output master ports on the read response (r) channel
+   *
+   * Do not perform read burst interleaving as the DMA does not support it. This means we can only
+   * switch between the two masters upon sending rlast or when idle.
+   *
+   * However, if the downstream already performs burst interleaving, this cannot be undone here.
+   * Also, the downstream may interleave a burst response with a single-beat transaction. In this
+   * case, the FSM below falls out of the burst mode. To avoid it performing burst interleaving
+   * after such an event, it gives priority to the master which received the last burst in case
+   * both have a burst ready (rvalid).
+   *
+   * Order of priority:
+   * 1. Ongoing burst transaction
+   * 2. Single-beat transaction on Master 1.
+   * 3. Single-beat transaction on Master 0.
+   * 4. Burst transaction on master that received the last burst.
+   */
+  // Registered master select of the read response mux (0 = master 0, 1 = master 1).
+  // Synchronous, active-low reset clears it to master 0.
+  always_ff @(posedge Clk_CI) begin
+    if (!Rst_RBI)
+      RRespSel_SP[i] <= 1'b0;
+    else
+      RRespSel_SP[i] <= RRespSel_SN[i];
+  end
+
+  // FSM
+  // Combinational next-state and output logic of the read response mux controller of port i.
+  //   RRespBurst_S[i]  - 1 while in BUSY; the multiplexer then follows the registered
+  //                      select RRespSel_SP[i].
+  //   RRespSelIm_S[i]  - immediate select used by the multiplexer while RRespBurst_S[i]
+  //                      is 0 (1 = master 1, 0 = master 0).
+  //   RRespSel_SN[i]   - next value of the registered select; only changed when a burst
+  //                      starts in IDLE.
+  always_comb begin : RRespMuxFsm
+    // Defaults: hold state and registered select, no burst, immediate select on master 0.
+    RRespMuxCtrl_SN[i] = RRespMuxCtrl_SP[i];
+    RRespSel_SN[i]     = RRespSel_SP[i];
+
+    RRespBurst_S[i]    = 1'b0;
+    RRespSelIm_S[i]    = 1'b0;
+
+    unique case (RRespMuxCtrl_SP[i])
+
+      IDLE: begin
+        // immediately forward single-beat transactions
+        // (rvalid together with rlast; master 1 is checked first and the state stays IDLE)
+        if      (int_m1_rvalid[i] && int_m1_rlast[i])
+          RRespSelIm_S[i] = 1'b1;
+        else if (int_m0_rvalid[i] && int_m0_rlast[i])
+          RRespSelIm_S[i] = 1'b0;
+
+        // bursts - they also start immediately
+        // (reached only when neither master offers rvalid together with rlast)
+        else if (int_m1_rvalid[i] || int_m0_rvalid[i]) begin
+          RRespMuxCtrl_SN[i] = BUSY;
+
+          // in case both are ready, continue with the master that had the last burst
+          if    (int_m1_rvalid[i] && int_m0_rvalid[i]) begin
+            RRespSel_SN[i]  = RRespSel_SP[i];
+            RRespSelIm_S[i] = RRespSel_SP[i];
+          end else if (int_m1_rvalid[i]) begin
+            // only master 1 is valid
+            RRespSel_SN[i]  = 1'b1;
+            RRespSelIm_S[i] = 1'b1;
+          end else begin
+            // only master 0 is valid
+            RRespSel_SN[i]  = 1'b0;
+            RRespSelIm_S[i] = 1'b0;
+          end
+        end
+      end
+
+      BUSY: begin
+        // The multiplexer stays locked to RRespSel_SP[i] for the rest of the burst.
+        RRespBurst_S[i] = 1'b1;
+        // detect last handshake of currently ongoing transfer
+        if (int_rvalid[i] && int_rready[i] && int_rlast[i])
+          RRespMuxCtrl_SN[i] = IDLE;
+      end
+
+      default: begin
+        // Any other state encoding returns to IDLE.
+        RRespMuxCtrl_SN[i] = IDLE;
+      end
+
+    endcase
+  end
+
+  // FSM state
+  always_ff @(posedge Clk_CI) begin // RRespMuxCtrl_SP: state register of RRespMuxFsm
+    if (Rst_RBI == 0) begin // reset is evaluated on the clock edge, asserted while Rst_RBI is 0
+      RRespMuxCtrl_SP[i] <= IDLE; // the FSM starts in IDLE
+    end else begin
+      RRespMuxCtrl_SP[i] <= RRespMuxCtrl_SN[i];
+    end
+  end
+
+  // Actual multiplexer
+  always_comb begin // routes the R channel of Master 1 or Master 0 onto the int_r* signals
+    if ( (RRespBurst_S[i] && RRespSel_SP[i]) || (!RRespBurst_S[i] && RRespSelIm_S[i]) ) begin
+      int_m0_rready[i] = 1'b0; // Master 1 selected: Master 0 is held off
+      int_m1_rready[i] = int_rready[i];
+
+      int_rid[i]    = int_m1_rid[i];
+      int_rresp[i]  = int_m1_rresp[i];
+      int_rdata[i]  = int_m1_rdata[i];
+      int_rlast[i]  = int_m1_rlast[i];
+      int_ruser[i]  = int_m1_ruser[i];
+      int_rvalid[i] = int_m1_rvalid[i];
+    end else begin
+      int_m0_rready[i] = int_rready[i];
+      int_m1_rready[i] = 1'b0; // Master 0 selected: Master 1 is held off
+
+      int_rid[i]    = int_m0_rid[i];
+      int_rresp[i]  = int_m0_rresp[i];
+      int_rdata[i]  = int_m0_rdata[i];
+      int_rlast[i]  = int_m0_rlast[i];
+      int_ruser[i]  = int_m0_ruser[i];
+      int_rvalid[i] = int_m0_rvalid[i];
+    end
+  end
+
+  end // BUF & SEND
+
+  // }}}
+
+  endgenerate // BUF & SEND }}}
+
+  // Log {{{
+
+`ifdef RAB_AX_LOG_EN
+  AxiBramLogger
+    #(
+      .AXI_ID_BITW     ( AXI_ID_WIDTH        ),
+      .AXI_ADDR_BITW   ( AXI_S_ADDR_WIDTH    ),
+      .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
+    )
+    u_aw_logger // only instantiated when RAB_AX_LOG_EN is defined
+    (
+      .Clk_CI          ( NonGatedClk_CI    ), // logger clock is NonGatedClk_CI ...
+      .TimestampClk_CI ( Clk_CI            ), // ... while the timestamp clock is Clk_CI
+      .Rst_RBI         ( Rst_RBI           ),
+      .AxiValid_SI     ( s_axi4_awvalid[1] ), // AW channel of slave port index 1 only
+      .AxiReady_SI     ( s_axi4_awready[1] ),
+      .AxiId_DI        ( s_axi4_awid[1]    ),
+      .AxiAddr_DI      ( s_axi4_awaddr[1]  ),
+      .AxiLen_DI       ( s_axi4_awlen[1]   ),
+      .Clear_SI        ( AwLogClr_SI       ),
+      .LogEn_SI        ( LogEn_SI          ), // LogEn_SI is shared with u_ar_logger
+      .Full_SO         ( int_aw_log_full   ),
+      .Ready_SO        ( AwLogRdy_SO       ),
+      .Bram_PS         ( AwBram_PS         )
+    );
+
+  AxiBramLogger
+    #(
+      .AXI_ID_BITW     ( AXI_ID_WIDTH        ),
+      .AXI_ADDR_BITW   ( AXI_S_ADDR_WIDTH    ),
+      .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
+    )
+    u_ar_logger // only instantiated when RAB_AX_LOG_EN is defined
+    (
+      .Clk_CI          ( NonGatedClk_CI    ), // logger clock is NonGatedClk_CI ...
+      .TimestampClk_CI ( Clk_CI            ), // ... while the timestamp clock is Clk_CI
+      .Rst_RBI         ( Rst_RBI           ),
+      .AxiValid_SI     ( s_axi4_arvalid[1] ), // AR channel of slave port index 1 only
+      .AxiReady_SI     ( s_axi4_arready[1] ),
+      .AxiId_DI        ( s_axi4_arid[1]    ),
+      .AxiAddr_DI      ( s_axi4_araddr[1]  ),
+      .AxiLen_DI       ( s_axi4_arlen[1]   ),
+      .Clear_SI        ( ArLogClr_SI       ),
+      .LogEn_SI        ( LogEn_SI          ), // LogEn_SI is shared with u_aw_logger
+      .Full_SO         ( int_ar_log_full   ),
+      .Ready_SO        ( ArLogRdy_SO       ),
+      .Bram_PS         ( ArBram_PS         )
+    );
+`endif
+
+  // }}}
+
+  // RAB Core {{{
+  // âââââââ  ââââââ âââââââ      âââââââ âââââââ âââââââ ââââââââ
+  // ââââââââââââââââââââââââ    âââââââââââââââââââââââââââââââââ
+  // ââââââââââââââââââââââââ    âââ     âââ   âââââââââââââââââ
+  // ââââââââââââââââââââââââ    âââ     âââ   âââââââââââââââââ
+  // âââ  ââââââ  âââââââââââ    ââââââââââââââââââââ  âââââââââââ
+  // âââ  ââââââ  ââââââââââ      âââââââ âââââââ âââ  âââââââââââ
+  //
+  /*
+   * rab_core
+   *
+   * The rab core translates addresses. It has two ports, which can be used
+   * independently, however they will compete for time internally, as lookups
+   * are serialized.
+   *
+   * type is the read(0) or write(1) used to check the protection flags. If they
+   * don't match an interrupt is created on the int_prot line.
+   */
+
+  rab_core
+    #(
+      .N_PORTS             ( N_PORTS             ),
+      .N_L2_SETS           ( N_L2_SETS           ),
+      .N_L2_SET_ENTRIES    ( N_L2_SET_ENTRIES    ),
+      .AXI_DATA_WIDTH      ( AXI_DATA_WIDTH      ),
+      .AXI_S_ADDR_WIDTH    ( AXI_S_ADDR_WIDTH    ),
+      .AXI_M_ADDR_WIDTH    ( AXI_M_ADDR_WIDTH    ),
+      .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ),
+      .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ),
+      .AXI_ID_WIDTH        ( AXI_ID_WIDTH        ),
+      .AXI_USER_WIDTH      ( AXI_USER_WIDTH      ),
+      .MH_FIFO_DEPTH       ( MH_FIFO_DEPTH       )
+    )
+    u_rab_core
+    (
+      .Clk_CI               ( Clk_CI                     ),
+      .Rst_RBI              ( Rst_RBI                    ),
+
+      // Config IF - wired to the s_axi4lite_* signals
+      .s_axi_awaddr         ( s_axi4lite_awaddr          ),
+      .s_axi_awvalid        ( s_axi4lite_awvalid         ),
+      .s_axi_awready        ( s_axi4lite_awready         ),
+      .s_axi_wdata          ( s_axi4lite_wdata           ),
+      .s_axi_wstrb          ( s_axi4lite_wstrb           ),
+      .s_axi_wvalid         ( s_axi4lite_wvalid          ),
+      .s_axi_wready         ( s_axi4lite_wready          ),
+      .s_axi_bresp          ( s_axi4lite_bresp           ),
+      .s_axi_bvalid         ( s_axi4lite_bvalid          ),
+      .s_axi_bready         ( s_axi4lite_bready          ),
+      .s_axi_araddr         ( s_axi4lite_araddr          ),
+      .s_axi_arvalid        ( s_axi4lite_arvalid         ),
+      .s_axi_arready        ( s_axi4lite_arready         ),
+      .s_axi_rready         ( s_axi4lite_rready          ),
+      .s_axi_rdata          ( s_axi4lite_rdata           ),
+      .s_axi_rresp          ( s_axi4lite_rresp           ),
+      .s_axi_rvalid         ( s_axi4lite_rvalid          ),
+
+      // L1 miss info outputs -> L2 TLB arbitration
+      .int_miss             ( rab_miss                   ),
+      .int_multi            ( rab_multi                  ),
+      .int_prot             ( rab_prot                   ),
+      .int_prefetch         ( rab_prefetch               ),
+      .int_mhf_full         ( int_mhf_full               ),
+
+      // L1 transaction info outputs -> L2 TLB arbitration
+      .int_axaddr_o         ( L1OutAddr_D                ),
+      .int_axid_o           ( L1OutId_D                  ),
+      .int_axlen_o          ( L1OutLen_D                 ),
+      .int_axuser_o         ( L1OutUser_D                ),
+
+      // Write Req IF
+      .port1_addr           ( int_awaddr                 ),
+      .port1_id             ( int_awid                   ),
+      .port1_len            ( int_awlen                  ),
+      .port1_size           ( int_awsize                 ),
+      .port1_addr_valid     ( int_awvalid & ~aw_in_stall ), // avoid the FSM accepting new AW requests
+      .port1_type           ( {N_PORTS{1'b1}}            ), // all ones: write(1), see the header comment
+      .port1_user           ( int_awuser                 ),
+      .port1_sent           ( int_wtrans_sent            ), // signal done to L1 FSM
+      .port1_out_addr       ( int_wtrans_addr            ),
+      .port1_cache_coherent ( int_wtrans_cache_coherent  ),
+      .port1_accept         ( int_wtrans_accept          ),
+      .port1_drop           ( int_wtrans_drop            ),
+      .port1_miss           ( int_wtrans_miss            ),
+
+      // Read Req IF
+      .port2_addr           ( int_araddr                 ),
+      .port2_id             ( int_arid                   ),
+      .port2_len            ( int_arlen                  ),
+      .port2_size           ( int_arsize                 ),
+      .port2_addr_valid     ( int_arvalid                ), // unlike port1, not gated by a stall signal
+      .port2_type           ( {N_PORTS{1'b0}}            ), // all zeros: read(0), see the header comment
+      .port2_user           ( int_aruser                 ),
+      .port2_sent           ( int_rtrans_sent            ), // signal done to L1 FSM
+      .port2_out_addr       ( int_rtrans_addr            ),
+      .port2_cache_coherent ( int_rtrans_cache_coherent  ),
+      .port2_accept         ( int_rtrans_accept          ),
+      .port2_drop           ( int_rtrans_drop            ),
+      .port2_miss           ( int_rtrans_miss            ),
+
+      // L2 miss info inputs -> axi_rab_cfg
+      .miss_l2_i            ( L2Miss_S                   ),
+      .miss_l2_addr_i       ( L2OutInAddr_DP             ),
+      .miss_l2_id_i         ( L2OutId_DP                 ),
+      .miss_l2_user_i       ( L2OutUser_DP               ),
+
+      // L2 config outputs
+      .wdata_l2_o           ( L2CfgWData_D               ),
+      .waddr_l2_o           ( L2CfgWAddr_D               ),
+      .wren_l2_o            ( L2CfgWE_S                  )
+    );
+
+  // }}}
+
+  // AX SPLITS {{{
+  //  ââââââ âââ  âââ    âââââââââââââââ âââ     ââââââââââââ
+  // ââââââââââââââââ    âââââââââââââââââââ     ââââââââââââ
+  // ââââââââ ââââââ     âââââââââââââââââââ     âââ   âââ
+  // ââââââââ ââââââ     âââââââââââââââ âââ     âââ   âââ
+  // âââ  âââââââ âââ    âââââââââââ     âââââââââââ   âââ
+  // âââ  ââââââ  âââ    âââââââââââ     âââââââââââ   âââ
+  //
+  /**
+   * Multiplex the two output master ports of the Read Address and Write Address (AR/AW) channels.
+   *
+   * Use the `int_xmaster_select` signal to route the signals to either Master 0 (to memory) or
+   * Master 1 (to ACP). In case of an L1 miss: Route the signals to both masters. They shall be
+   * saved until the L2 outputs are available.
+   */
+  generate for (i = 0; i < N_PORTS; i++) begin : AX_SPLIT
+
+    /*
+     * When accepting L1 transactions, we must just do so on the selected master. Drop requests must
+     * be performed on any one of the two masters. Save requests must be performed by both masters.
+     */
+    always_comb begin : AW_L1_SPLIT
+
+      // TLB handshake - defaults: no request towards either master
+      l1_m0_aw_accept[i] = 1'b0;
+      l1_m1_aw_accept[i] = 1'b0;
+      l1_m0_aw_drop[i]   = 1'b0;
+      l1_m1_aw_drop[i]   = 1'b0; // never asserted in this block: drops go to Master 0 only
+      l1_m0_aw_save[i]   = 1'b0;
+      l1_m1_aw_save[i]   = 1'b0;
+
+      l1_mx_aw_done[i]   = 1'b0; // combined done, fed back as l1_aw_done
+
+      // AXI sender input handshake - defaults: nothing valid, upstream not ready
+      int_m0_awvalid[i]  = 1'b0;
+      int_m1_awvalid[i]  = 1'b0;
+      int_awready[i]     = 1'b0;
+
+      // accept on selected master only
+      if (l1_aw_accept[i]) begin
+        if (int_wmaster_select[i]) begin // 1 selects Master 1
+          l1_m1_aw_accept[i] = 1'b1;
+          l1_mx_aw_done[i]   = l1_m1_aw_done[i];
+
+          int_m1_awvalid[i]  = int_awvalid[i];
+          int_awready[i]     = int_m1_awready[i];
+
+        end else begin // 0 selects Master 0
+          l1_m0_aw_accept[i] = 1'b1;
+          l1_mx_aw_done[i]   = l1_m0_aw_done[i];
+
+          int_m0_awvalid[i]  = int_awvalid[i];
+          int_awready[i]     = int_m0_awready[i];
+        end
+
+      // drop on Master 0 only
+      end else if (l1_aw_drop[i]) begin
+        l1_m0_aw_drop[i]     = 1'b1;
+        l1_mx_aw_done[i]     = l1_m0_aw_done[i];
+
+        int_m0_awvalid[i]    = int_awvalid[i];
+        int_awready[i]       = l1_m0_aw_done[i]; // ready follows the drop's done, not int_m0_awready
+
+      // save on both masters
+      end else if (l1_aw_save[i]) begin
+        // split save - stop requesting from a master whose done flag is already latched
+        l1_m0_aw_save[i]     = ~l1_m0_aw_done_SP[i];
+        l1_m1_aw_save[i]     = ~l1_m1_aw_done_SP[i];
+
+        // combine done - uses the flags latched by L1_MX_AW_DONE_REG
+        l1_mx_aw_done[i]     = l1_m0_aw_done_SP[i] & l1_m1_aw_done_SP[i];
+
+        int_m0_awvalid[i]    = int_awvalid[i];
+        int_m1_awvalid[i]    = int_awvalid[i];
+        int_awready[i]       = l1_mx_aw_done[i]; // ready only once both masters are done
+      end
+    end
+
+    // signal back to handshake splitter
+    assign l1_aw_done[i]     = l1_mx_aw_done[i];
+
+    always_ff @(posedge Clk_CI) begin : L1_MX_AW_DONE_REG // sticky per-master L1 AW done flags
+      if (Rst_RBI == 0) begin // reset is evaluated on the clock edge, asserted while Rst_RBI is 0
+        l1_m0_aw_done_SP[i] <= 1'b0;
+        l1_m1_aw_done_SP[i] <= 1'b0;
+      end else if (l1_mx_aw_done[i]) begin // combined done seen: clear both flags together
+        l1_m0_aw_done_SP[i] <= 1'b0;
+        l1_m1_aw_done_SP[i] <= 1'b0;
+      end else begin // otherwise latch each master's done until the combined done arrives
+        l1_m0_aw_done_SP[i] <= l1_m0_aw_done_SP[i] | l1_m0_aw_done[i];
+        l1_m1_aw_done_SP[i] <= l1_m1_aw_done_SP[i] | l1_m1_aw_done[i];
+      end
+    end
+
+    /*
+     * When accepting L2 transactions, we must drop the corresponding transaction from the other
+     * master to make it available again for save requests from L1_DROP_SAVE.
+     */
+    always_comb begin : AW_L2_SPLIT
+
+      l2_m0_aw_accept[i] = 1'b0; // defaults: no L2 request towards either master
+      l2_m1_aw_accept[i] = 1'b0;
+      l2_m0_aw_drop[i]   = 1'b0;
+      l2_m1_aw_drop[i]   = 1'b0;
+
+      // de-assert request signals individually upon handshakes
+      if (l2_aw_accept[i]) begin
+        if (l2_master_select[i]) begin // 1: accept on Master 1, drop on Master 0
+          l2_m1_aw_accept[i] = ~l2_m1_aw_done_SP[i];
+          l2_m0_aw_drop[i]   = ~l2_m0_aw_done_SP[i];
+
+        end else begin // 0: accept on Master 0, drop on Master 1
+          l2_m0_aw_accept[i] = ~l2_m0_aw_done_SP[i];
+          l2_m1_aw_drop[i]   = ~l2_m1_aw_done_SP[i];
+
+        end
+      end else begin // AR_L2_SPLIT tests l2_ar_drop here; same effect, l2_aw_drop gates both lines
+        l2_m0_aw_drop[i]     = ~l2_m0_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
+        l2_m1_aw_drop[i]     = ~l2_m1_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
+
+      end
+
+      // combine done - both masters must have their done flag latched in L2_MX_AW_DONE_REG
+      l2_mx_aw_done[i] = l2_m0_aw_done_SP[i] & l2_m1_aw_done_SP[i];
+
+      l2_aw_done[i]    = l2_mx_aw_done[i];
+    end
+
+    always_ff @(posedge Clk_CI) begin : L2_MX_AW_DONE_REG // sticky per-master L2 AW done flags
+      if (Rst_RBI == 0) begin // reset is evaluated on the clock edge, asserted while Rst_RBI is 0
+        l2_m0_aw_done_SP[i] <= 1'b0;
+        l2_m1_aw_done_SP[i] <= 1'b0;
+      end else if (l2_mx_aw_done[i]) begin // combined done seen: clear both flags together
+        l2_m0_aw_done_SP[i] <= 1'b0;
+        l2_m1_aw_done_SP[i] <= 1'b0;
+      end else begin // otherwise latch each master's done until the combined done arrives
+        l2_m0_aw_done_SP[i] <= l2_m0_aw_done_SP[i] | l2_m0_aw_done[i];
+        l2_m1_aw_done_SP[i] <= l2_m1_aw_done_SP[i] | l2_m1_aw_done[i];
+      end
+    end
+
+    /*
+     * When accepting L1 transactions, we must just do so on the selected master. Drop requests must
+     * be performed on any one of the two masters. Save requests must be performed by both masters.
+     */
+    always_comb begin : AR_L1_SPLIT
+
+      // TLB handshake - defaults: no request towards either master
+      l1_m0_ar_accept[i] = 1'b0;
+      l1_m1_ar_accept[i] = 1'b0;
+      l1_m0_ar_drop[i]   = 1'b0;
+      l1_m1_ar_drop[i]   = 1'b0; // never asserted in this block: drops go to Master 0 only
+      l1_m0_ar_save[i]   = 1'b0;
+      l1_m1_ar_save[i]   = 1'b0;
+
+      l1_mx_ar_done[i]   = 1'b0; // combined done, fed back as l1_ar_done
+
+      // AXI sender input handshake - defaults: nothing valid, upstream not ready
+      int_m0_arvalid[i]  = 1'b0;
+      int_m1_arvalid[i]  = 1'b0;
+      int_arready[i]     = 1'b0;
+
+      // accept on selected master only
+      if (l1_ar_accept[i]) begin
+        if (int_rmaster_select[i]) begin // 1 selects Master 1
+          l1_m1_ar_accept[i] = 1'b1;
+          l1_mx_ar_done[i]   = l1_m1_ar_done[i];
+
+          int_m1_arvalid[i]  = int_arvalid[i];
+          int_arready[i]     = int_m1_arready[i];
+
+        end else begin // 0 selects Master 0
+          l1_m0_ar_accept[i] = 1'b1;
+          l1_mx_ar_done[i]   = l1_m0_ar_done[i];
+
+          int_m0_arvalid[i]  = int_arvalid[i];
+          int_arready[i]     = int_m0_arready[i];
+        end
+
+      // drop on Master 0 only
+      end else if (l1_ar_drop[i]) begin
+        l1_m0_ar_drop[i]     = 1'b1;
+        l1_mx_ar_done[i]     = l1_m0_ar_done[i];
+
+        int_m0_arvalid[i]    = int_arvalid[i];
+        int_arready[i]       = l1_m0_ar_done[i]; // ready follows the drop's done, not int_m0_arready
+
+      // save on both masters
+      end else if (l1_ar_save[i]) begin
+        // split save - stop requesting from a master whose done flag is already latched
+        l1_m0_ar_save[i]     = ~l1_m0_ar_done_SP[i];
+        l1_m1_ar_save[i]     = ~l1_m1_ar_done_SP[i];
+
+        // combine done - uses the flags latched by L1_MX_AR_DONE_REG
+        l1_mx_ar_done[i]     = l1_m0_ar_done_SP[i] & l1_m1_ar_done_SP[i];
+
+        int_m0_arvalid[i]    = int_arvalid[i];
+        int_m1_arvalid[i]    = int_arvalid[i];
+        int_arready[i]       = l1_mx_ar_done[i]; // ready only once both masters are done
+      end
+    end
+
+    // signal back to handshake splitter
+    assign l1_ar_done[i]     = l1_mx_ar_done[i];
+
+    always_ff @(posedge Clk_CI) begin : L1_MX_AR_DONE_REG // sticky per-master L1 AR done flags
+      if (Rst_RBI == 0) begin // reset is evaluated on the clock edge, asserted while Rst_RBI is 0
+        l1_m0_ar_done_SP[i] <= 1'b0;
+        l1_m1_ar_done_SP[i] <= 1'b0;
+      end else if (l1_mx_ar_done[i]) begin // combined done seen: clear both flags together
+        l1_m0_ar_done_SP[i] <= 1'b0;
+        l1_m1_ar_done_SP[i] <= 1'b0;
+      end else begin // otherwise latch each master's done until the combined done arrives
+        l1_m0_ar_done_SP[i] <= l1_m0_ar_done_SP[i] | l1_m0_ar_done[i];
+        l1_m1_ar_done_SP[i] <= l1_m1_ar_done_SP[i] | l1_m1_ar_done[i];
+      end
+    end
+
+    /*
+     * When accepting L2 transactions, we must drop the corresponding transaction from the other
+     * master to make it available again for save requests from L1_DROP_SAVE.
+     */
+    always_comb begin : AR_L2_SPLIT
+
+      l2_m0_ar_accept[i] = 1'b0; // defaults: no L2 request towards either master
+      l2_m1_ar_accept[i] = 1'b0;
+      l2_m0_ar_drop[i]   = 1'b0;
+      l2_m1_ar_drop[i]   = 1'b0;
+
+      // de-assert request signals individually upon handshakes
+      if (l2_ar_accept[i]) begin
+        if (l2_master_select[i]) begin // 1: accept on Master 1, drop on Master 0
+          l2_m1_ar_accept[i] = ~l2_m1_ar_done_SP[i];
+          l2_m0_ar_drop[i]   = ~l2_m0_ar_done_SP[i];
+
+        end else begin // 0: accept on Master 0, drop on Master 1
+          l2_m0_ar_accept[i] = ~l2_m0_ar_done_SP[i];
+          l2_m1_ar_drop[i]   = ~l2_m1_ar_done_SP[i];
+
+        end
+      end else if (l2_ar_drop[i]) begin // pure drop: request it from every master not yet done
+        l2_m0_ar_drop[i]     = ~l2_m0_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
+        l2_m1_ar_drop[i]     = ~l2_m1_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
+
+      end
+
+      // combine done - both masters must have their done flag latched in L2_MX_AR_DONE_REG
+      l2_mx_ar_done[i] = l2_m0_ar_done_SP[i] & l2_m1_ar_done_SP[i];
+
+      l2_ar_done[i]    = l2_mx_ar_done[i];
+    end
+
+    always_ff @(posedge Clk_CI) begin : L2_MX_AR_DONE_REG // sticky per-master L2 AR done flags
+      if (Rst_RBI == 0) begin // reset is evaluated on the clock edge, asserted while Rst_RBI is 0
+        l2_m0_ar_done_SP[i] <= 1'b0;
+        l2_m1_ar_done_SP[i] <= 1'b0;
+      end else if (l2_mx_ar_done[i]) begin // combined done seen: clear both flags together
+        l2_m0_ar_done_SP[i] <= 1'b0;
+        l2_m1_ar_done_SP[i] <= 1'b0;
+      end else begin // otherwise latch each master's done until the combined done arrives
+        l2_m0_ar_done_SP[i] <= l2_m0_ar_done_SP[i] | l2_m0_ar_done[i];
+        l2_m1_ar_done_SP[i] <= l2_m1_ar_done_SP[i] | l2_m1_ar_done[i];
+      end
+    end
+
+  end // AX_SPLIT
+  endgenerate // AX_SPLIT
+
+  // }}}
+
+  // HANDSHAKE SPLITS {{{
+  // âââ  âââââââââââ    âââââââââââââââ âââ     ââââââââââââ
+  // âââ  âââââââââââ    âââââââââââââââââââ     ââââââââââââ
+  // ââââââââââââââââ    âââââââââââââââââââ     âââ   âââ
+  // ââââââââââââââââ    âââââââââââââââ âââ     âââ   âââ
+  // âââ  âââââââââââ    âââââââââââ     âââââââââââ   âââ
+  // âââ  âââââââââââ    âââââââââââ     âââââââââââ   âââ
+  //
+  /*
+   * We need to perform combined handshakes with multiple AXI modules
+   * upon transaction drops, accepts, saves etc. from two TLBs.
+   */
+  generate for (i = 0; i < N_PORTS; i++) begin : HANDSHAKE_SPLIT
+
+    assign l1_xw_accept[i]    = int_wtrans_accept[i] & ~aw_out_stall[i]; // write accepts are stalled by aw_out_stall
+    assign int_wtrans_sent[i] = l1_xw_done[i];
+
+    assign l1_ar_accept[i]    = int_rtrans_accept[i]; // read accepts are forwarded without gating
+    assign int_rtrans_sent[i] = l1_ar_done[i];
+
+    /*
+     * L1 AW sender + W buffer handshake split
+     */
+    // forward - each request stays asserted only until its own side has latched done
+    assign l1_aw_accept[i] = l1_xw_accept[i] & ~l1_aw_done_SP[i];
+    assign l1_w_accept[i]  = l1_xw_accept[i] & ~l1_w_done_SP[i];
+
+    assign l1_aw_save[i]   = l1_xw_save[i]   & ~l1_aw_done_SP[i];
+    assign l1_w_save[i]    = l1_xw_save[i]   & ~l1_w_done_SP[i];
+
+    assign l1_aw_drop[i]   = l1_xw_drop[i]   & ~l1_aw_done_SP[i];
+    assign l1_w_drop[i]    = l1_xw_drop[i]   & ~l1_w_done_SP[i];
+
+    // backward - combined done requires both the AW and the W side to be done
+    assign l1_xw_done[i]   = l1_aw_done_SP[i] & l1_w_done_SP[i];
+
+    always_ff @(posedge Clk_CI) begin : L1_XW_HS_SPLIT // sticky AW/W done flags for L1
+      if (Rst_RBI == 0) begin // reset is evaluated on the clock edge, asserted while Rst_RBI is 0
+        l1_aw_done_SP[i] <= 1'b0;
+        l1_w_done_SP[i]  <= 1'b0;
+      end else if (l1_xw_done[i]) begin // both sides done: clear the flags together
+        l1_aw_done_SP[i] <= 1'b0;
+        l1_w_done_SP[i]  <= 1'b0;
+      end else begin // otherwise latch whichever side reports done first
+        l1_aw_done_SP[i] <= l1_aw_done_SP[i] | l1_aw_done[i];
+        l1_w_done_SP[i]  <= l1_w_done_SP[i]  | l1_w_done[i];
+      end
+    end
+
+    if (ENABLE_L2TLB[i] == 1) begin : L2_HS_SPLIT
+
+      /*
+       * L1 AR sender + R sender handshake split
+       *
+       * AR and R do not need to be strictly in sync. We thus use separate handshakes.
+       * But the handshake signals for the R sender are multiplexed with those for
+       * the L2. However, L2_ACCEPT_DROP_SAVE has always higher priority.
+       */
+      assign lx_r_drop[i] = l2_r_drop[i] | l1_r_drop[i]; // a drop from either level reaches the R sender
+      assign l1_r_done[i] = l2_r_drop[i] ? 1'b0         : lx_r_done[i]; // done is credited to L2 while
+      assign l2_r_done[i] = l2_r_drop[i] ? lx_r_done[i] : 1'b0;         // l2_r_drop is set, else to L1
+
+      /*
+       * L2 AW sender + W buffer handshake split
+       */
+      // forward - each request stays asserted only until its own side has latched done
+      assign l2_aw_accept[i] = l2_xw_accept[i] & ~l2_aw_done_SP[i];
+      assign l2_w_accept[i]  = l2_xw_accept[i] & ~l2_w_done_SP[i];
+
+      assign l2_aw_drop[i]   = l2_xw_drop[i]   & ~l2_aw_done_SP[i];
+      assign l2_w_drop[i]    = l2_xw_drop[i]   & ~l2_w_done_SP[i];
+
+      // backward - combined done requires both the AW and the W side to be done
+      assign l2_xw_done[i]   = l2_aw_done_SP[i] & l2_w_done_SP[i];
+
+      always_ff @(posedge Clk_CI) begin : L2_XW_HS_SPLIT // sticky AW/W done flags for L2
+        if (Rst_RBI == 0) begin // reset is evaluated on the clock edge, asserted while Rst_RBI is 0
+          l2_aw_done_SP[i] <= 1'b0;
+          l2_w_done_SP[i]  <= 1'b0;
+        end else if (l2_xw_done[i]) begin // both sides done: clear the flags together
+          l2_aw_done_SP[i] <= 1'b0;
+          l2_w_done_SP[i]  <= 1'b0;
+        end else begin // otherwise latch whichever side reports done first
+          l2_aw_done_SP[i] <= l2_aw_done_SP[i] | l2_aw_done[i];
+          l2_w_done_SP[i]  <= l2_w_done_SP[i]  | l2_w_done[i];
+        end
+      end
+
+      /*
+       * L2 AR + R sender handshake split
+       */
+      // forward - only drops are split here; l2_ar_accept is not derived in this block
+      assign l2_ar_drop[i]   = l2_xr_drop[i]   & ~l2_ar_done_SP[i];
+      assign l2_r_drop[i]    = l2_xr_drop[i]   & ~l2_r_done_SP[i];
+
+      // backward - make sure to always clear L2_XR_HS_SPLIT
+      always_comb begin
+        if (l2_xr_drop[i]) begin // drop: wait for both the AR and the R side
+          l2_xr_done[i]      = l2_ar_done_SP[i] & l2_r_done_SP[i];
+        end else begin // no drop: the AR side alone completes, so the flags still get cleared
+          l2_xr_done[i]      = l2_ar_done_SP[i];
+        end
+      end
+
+      always_ff @(posedge Clk_CI) begin : L2_XR_HS_SPLIT // sticky AR/R done flags for L2
+        if (Rst_RBI == 0) begin // reset is evaluated on the clock edge, asserted while Rst_RBI is 0
+          l2_ar_done_SP[i] <= 1'b0;
+          l2_r_done_SP[i]  <= 1'b0;
+        end else if (l2_xr_done[i]) begin // combined done: clear the flags together
+          l2_ar_done_SP[i] <= 1'b0;
+          l2_r_done_SP[i]  <= 1'b0;
+        end else begin // otherwise latch whichever side reports done first
+          l2_ar_done_SP[i] <= l2_ar_done_SP[i] | l2_ar_done[i];
+          l2_r_done_SP[i]  <= l2_r_done_SP[i]  | l2_r_done[i];
+        end
+      end
+
+    end else begin // if (ENABLE_L2TLB[i] == 1)
+
+      assign lx_r_drop[i]     = l1_r_drop[i]; // without an L2 TLB only L1 can drop on the R sender
+      assign l1_r_done[i]     = lx_r_done[i];
+
+      assign l2_aw_accept[i]  = 1'b0; // all L2 handshake signals of this port are tied to 0
+      assign l2_w_accept[i]   = 1'b0;
+      assign l2_aw_drop[i]    = 1'b0;
+      assign l2_w_drop[i]     = 1'b0;
+      assign l2_xw_done[i]    = 1'b0;
+      assign l2_aw_done_SP[i] = 1'b0;
+      assign l2_w_done_SP[i]  = 1'b0;
+
+      assign l2_ar_accept[i]  = 1'b0;
+      assign l2_ar_drop[i]    = 1'b0;
+      assign l2_r_drop[i]     = 1'b0;
+      assign l2_xr_done[i]    = 1'b0;
+      assign l2_r_done[i]     = 1'b0;
+      assign l2_ar_done_SP[i] = 1'b0;
+      assign l2_r_done_SP[i]  = 1'b0;
+
+    end // if (ENABLE_L2TLB[i] == 1)
+
+  end // HANDSHAKE_SPLIT
+  endgenerate // HANDSHAKE_SPLIT
+
+  // }}}
+
+  // L2 TLB {{{
+  // âââ     âââââââ     ââââââââââââ     âââââââ
+  // âââ     ââââââââ    ââââââââââââ     ââââââââ
+  // âââ      âââââââ       âââ   âââ     ââââââââ
+  // âââ     âââââââ        âââ   âââ     ââââââââ
+  // ââââââââââââââââ       âââ   ââââââââââââââââ
+  // ââââââââââââââââ       âââ   âââââââââââââââ
+  //
+  /*
+   * l2_tlb
+   *
+   * The L2 TLB translates addresses upon misses in the L1 TLB (rab_core).
+   *
+   * It supports one ongoing translation at a time. If an L1 miss occurs while the L2 is busy,
+   * the L1 is stalled until the L2 is available again.
+   *
+   */
+  generate for (i = 0; i < N_PORTS; i++) begin : L2_TLB
+    if (ENABLE_L2TLB[i] == 1) begin : L2_TLB
+
+      /*
+       * L1 output selector
+       */
+      assign L1OutRwType_D[i] = int_wtrans_drop[i] ? 1'b1 : 1'b0; // 1 when the write port drops, else 0
+      assign L1OutProt_D[i]   = rab_prot[i];  // rab_prot and rab_multi are passed through unchanged
+      assign L1OutMulti_D[i]  = rab_multi[i];
+
+      /*
+       * L1 output control + L1_DROP_BUF, L2_IN_BUF management
+       *
+       * Forward the L1 drop request to AR/AW sender modules if
+       * 1. the transactions needs to be dropped (L1 multi, prot, prefetch), or
+       * 2. if a lookup in the L2 TLB is required (L1 miss) and the input buffer is not full.
+       *
+       * The AR/AW senders do not support more than 1 outstanding L1 miss. The push back towards
+       * the upstream is realized by not accepting the save request (saving the L1 transaction)
+       * in the senders as long as the L2 TLB is busy or has valid output. This ultimately
+       * blocks the L1 TLB.
+       *
+       * Together with the AW drop/save, we also perform the W drop/save as AW and W need to
+       * absolutely remain in order. In contrast, the R drop is performed by L2_ACCEPT_DROP_SAVE.
+       */
+      always_comb begin : L1_DROP_SAVE
+
+        l1_ar_drop[i]       = 1'b0; // defaults: no drop or save request
+        l1_ar_save[i]       = 1'b0;
+        l1_xw_drop[i]       = 1'b0;
+        l1_xw_save[i]       = 1'b0;
+
+        l1_id_drop[i]       = L1OutId_D[i];    // drop metadata always mirrors the current L1 output
+        l1_len_drop[i]      = L1OutLen_D[i];
+        l1_prefetch_drop[i] = rab_prefetch[i];
+        l1_hit_drop[i]      = 1'b1; // there are no drops for L1 misses
+
+        L1DropEn_S[i]       = 1'b0; // defaults: neither buffer is written
+        L2InEn_S[i]         = 1'b0;
+
+        if ( rab_prot[i] | rab_multi[i] | rab_prefetch[i] ) begin // checked first, wins over rab_miss
+          // 1. Drop - held back while L1_DROP_BUF still holds a valid entry
+          l1_ar_drop[i] = int_rtrans_drop[i] & ~L1DropValid_SP[i];
+          l1_xw_drop[i] = int_wtrans_drop[i] & ~L1DropValid_SP[i];
+
+          // Store to L1_DROP_BUF upon handshake
+          L1DropEn_S[i] = (l1_ar_drop[i] & l1_ar_done[i]) |
+                          (l1_xw_drop[i] & l1_xw_done[i]);
+
+        end else if ( rab_miss[i] ) begin
+          // 2. Save - Make sure L2 is really available.
+          l1_ar_save[i] = int_rtrans_drop[i] & ~L2Busy_S[i];
+          l1_xw_save[i] = int_wtrans_drop[i] & ~L2Busy_S[i];
+
+          // Store to L2_IN_BUF upon handshake - triggers the L2 TLB
+          L2InEn_S[i]   = (l1_ar_save[i] & l1_ar_done[i]) |
+                          (l1_xw_save[i] & l1_xw_done[i]);
+        end
+      end
+
+      /*
+       * L2 output control + L2_OUT_BUF management + R/B sender control + W buffer control
+       *
+       * Perform L1 R transaction drops unless the L2 output buffer holds valid data. The AXI specs
+       * require the B response to be sent only after consuming/discarding the corresponding data
+       * in the W channel. Thus, we only send L2 drop request to the W buffer here. The drop
+       * request to the B sender is then sent by the W buffer autonomously.
+       *
+       * L1 AW/W drop requests are managed by L1_DROP_SAVE.
+       */
+      always_comb begin : L2_ACCEPT_DROP_SAVE
+
+        l2_ar_addr[i]       =  'b0;
+        l2_aw_addr[i]       =  'b0;
+        l2_ar_accept[i]     = 1'b0;
+        l2_xr_drop[i]       = 1'b0;
+        l2_xw_accept[i]     = 1'b0;
+        l2_xw_drop[i]       = 1'b0;
+
+        l1_r_drop[i]        = 1'b0;
+
+        lx_id_drop[i]       =  'b0;
+        lx_len_drop[i]      =  'b0;
+        lx_prefetch_drop[i] = 1'b0;
+        lx_hit_drop[i]      = 1'b0;
+
+        L1DropValid_SN[i]   = L1DropValid_SP[i] | L1DropEn_S[i];
+        L2OutValid_SN[i]    = L2OutValid_SP[i];
+        L2OutReady_S[i]     = 1'b0;
+        L2OutEn_S[i]        = 1'b0;
+
+        L2Miss_S[i]         = 1'b0;
+        int_multi[i]        = 1'b0;
+        int_prot[i]         = 1'b0;
+
+        if (L2OutValid_SP[i] == 1'b0) begin
+
+          // Drop L1 from R senders
+          if (L1DropValid_SP[i] == 1'b1) begin
+
+            // Only perform the R sender drop here.
+            if (~L1DropRwType_DP[i]) begin
+
+              l1_r_drop[i]        = 1'b1;
+              lx_id_drop[i]       = L1DropId_DP[i];
+              lx_len_drop[i]      = L1DropLen_DP[i];
+              lx_prefetch_drop[i] = L1DropPrefetch_S[i];
+              lx_hit_drop[i]      = 1'b1; // there are no drops for L1 misses
+
+              // Invalidate L1_DROP_BUF upon handshake
+              if ( l1_r_drop[i] & l1_r_done[i] ) begin
+
+                L1DropValid_SN[i] = 1'b0;
+                int_prot[i]       = L1DropProt_DP[i];
+                int_multi[i]      = L1DropMulti_DP[i];
+              end
+
+            end else begin
+              // Invalidate L1_DROP_BUF
+              L1DropValid_SN[i]   = 1'b0;
+              int_prot[i]         = L1DropProt_DP[i];
+              int_multi[i]        = L1DropMulti_DP[i];
+            end
+          end
+
+        end else begin // L2_OUT_BUF has valid data
+
+          if ( L2OutHit_SP[i] & ~(L2OutPrefetch_S[i] | L2OutProt_SP[i] | L2OutMulti_SP[i]) ) begin
+
+            l2_ar_addr[i]       = L2OutAddr_DP[i];
+            l2_aw_addr[i]       = L2OutAddr_DP[i];
+
+            l2_ar_accept[i]     = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
+            l2_xw_accept[i]     = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
+
+            // Invalidate L2_OUT_BUF upon handshake
+            L2OutValid_SN[i] = ~( (l2_ar_accept[i] & l2_ar_done[i]) |
+                                  (l2_xw_accept[i] & l2_xw_done[i]) );
+          end else begin
+
+            lx_id_drop[i]       = L2OutId_DP[i];
+            lx_len_drop[i]      = L2OutLen_DP[i];
+            lx_prefetch_drop[i] = L2OutPrefetch_S[i];
+            lx_hit_drop[i]      = L2OutHit_SP[i];
+
+            // The l2_xr_drop will also perform the handshake with the R sender
+            l2_xr_drop[i]       = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
+            l2_xw_drop[i]       = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
+
+            // Invalidate L2_OUT_BUF upon handshake
+            if ( (l2_xr_drop[i] & l2_xr_done[i]) | (l2_xw_drop[i] & l2_xw_done[i]) ) begin
+
+              L2OutValid_SN[i]  = 1'b0;
+              L2Miss_S[i]       = ~L2OutHit_SP[i];
+              int_prot[i]       = L2OutProt_SP[i];
+              int_multi[i]      = L2OutMulti_SP[i];
+            end
+          end
+        end
+
+        // Only accept new L2 output after ongoing drops have finished.
+        if ( (l2_xr_drop[i] == l2_xr_done[i]) &
+             (l2_xw_drop[i] == l2_xw_done[i]) &
+             (l1_r_drop[i]  == l1_r_done[i] ) ) begin
+          // Store to L2_OUT_BUF upon handshake with L2 TLB module
+          if ( (L2OutValid_SP[i] == 1'b0) && (L2OutValid_S[i] == 1'b1) ) begin
+            L2OutValid_SN[i]   = 1'b1;
+            L2OutReady_S[i]    = 1'b1;
+            L2OutEn_S[i]       = 1'b1;
+          end
+        end
+      end
+
+      /*
+       * L1 drop buffer
+       *
+       * Used in case of multi, prot and prefetch hits in the L1 TLB.
+       */
+      always_ff @(posedge Clk_CI) begin : L1_DROP_BUF
+         if (Rst_RBI == 0) begin
+            L1DropProt_DP[i]   <= 1'b0;
+            L1DropMulti_DP[i]  <= 1'b0;
+            L1DropRwType_DP[i] <= 1'b0;
+            L1DropUser_DP[i]   <=  'b0;
+            L1DropId_DP[i]     <=  'b0;
+            L1DropLen_DP[i]    <=  'b0;
+            L1DropAddr_DP[i]   <=  'b0;
+         end else if (L1DropEn_S[i] == 1'b1) begin
+            L1DropProt_DP[i]   <= L1OutProt_D[i]  ;
+            L1DropMulti_DP[i]  <= L1OutMulti_D[i] ;
+            L1DropRwType_DP[i] <= L1OutRwType_D[i];
+            L1DropUser_DP[i]   <= L1OutUser_D[i]  ;
+            L1DropId_DP[i]     <= L1OutId_D[i]    ;
+            L1DropLen_DP[i]    <= L1OutLen_D[i]   ;
+            L1DropAddr_DP[i]   <= L1OutAddr_D[i]  ;
+         end
+      end // always_ff @ (posedge Clk_CI)
+
+      /*
+       * L2 input buffer
+       *
+       * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
+       */
+      always_ff @(posedge Clk_CI) begin : L2_IN_BUF
+         if (Rst_RBI == 0) begin
+            L2InRwType_DP[i] <= 1'b0;
+            L2InUser_DP[i]   <=  'b0;
+            L2InId_DP[i]     <=  'b0;
+            L2InLen_DP[i]    <=  'b0;
+            L2InAddr_DP[i]   <=  'b0;
+         end else if (L2InEn_S[i] == 1'b1) begin
+            L2InRwType_DP[i] <= L1OutRwType_D[i];
+            L2InUser_DP[i]   <= L1OutUser_D[i]  ;
+            L2InId_DP[i]     <= L1OutId_D[i]    ;
+            L2InLen_DP[i]    <= L1OutLen_D[i]   ;
+            L2InAddr_DP[i]   <= L1OutAddr_D[i]  ;
+         end
+      end // always_ff @ (posedge Clk_CI)
+
+      l2_tlb
+        #(
+          .AXI_S_ADDR_WIDTH       ( AXI_S_ADDR_WIDTH                                    ),
+          .AXI_M_ADDR_WIDTH       ( AXI_M_ADDR_WIDTH                                    ),
+          .AXI_LITE_DATA_WIDTH    ( AXI_LITE_DATA_WIDTH                                 ),
+          .AXI_LITE_ADDR_WIDTH    ( AXI_LITE_ADDR_WIDTH                                 ),
+          .N_SETS                 ( `RAB_L2_N_SETS                                      ),
+          .N_OFFSETS              ( `RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS       ),
+          .N_PAR_VA_RAMS          ( `RAB_L2_N_PAR_VA_RAMS                               ),
+          .HIT_OFFSET_STORE_WIDTH ( log2(`RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS) )
+          )
+      u_l2_tlb
+        (
+          .clk_i              ( Clk_CI           ),
+          .rst_ni             ( Rst_RBI          ),
+
+          // Config inputs
+          .we_i               ( L2CfgWE_S[i]     ),
+          .waddr_i            ( L2CfgWAddr_D[i]  ),
+          .wdata_i            ( L2CfgWData_D[i]  ),
+
+          // Request input
+          .start_i            ( L2InEn_S[i]      ),
+          .busy_o             ( L2Busy_S[i]      ),
+          .rw_type_i          ( L2InRwType_DP[i] ),
+          .in_addr_i          ( L2InAddr_DP[i]   ),
+
+          // Response output
+          .out_ready_i        ( L2OutReady_S[i]  ),
+          .out_valid_o        ( L2OutValid_S[i]  ),
+          .hit_o              ( L2OutHit_SN[i]   ),
+          .miss_o             ( L2OutMiss_SN[i]  ),
+          .prot_o             ( L2OutProt_SN[i]  ),
+          .multi_o            ( L2OutMulti_SN[i] ),
+          .cache_coherent_o   ( L2OutCC_SN[i]    ),
+          .out_addr_o         ( L2OutAddr_DN[i]  )
+        );
+
+      /*
+       * L2 output buffer
+       *
+       * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
+       */
+      always_ff @(posedge Clk_CI) begin : L2_OUT_BUF
+         if (Rst_RBI == 0) begin
+            L2OutRwType_DP[i] <= 1'b0;
+            L2OutUser_DP[i]   <=  'b0;
+            L2OutLen_DP[i]    <=  'b0;
+            L2OutId_DP[i]     <=  'b0;
+            L2OutInAddr_DP[i] <=  'b0;
+
+            L2OutHit_SP[i]    <= 1'b0;
+            L2OutMiss_SP[i]   <= 1'b0;
+            L2OutProt_SP[i]   <= 1'b0;
+            L2OutMulti_SP[i]  <= 1'b0;
+            L2OutCC_SP[i]     <= 1'b0;
+            L2OutAddr_DP[i]   <=  'b0;
+         end else if (L2OutEn_S[i] == 1'b1) begin
+            L2OutRwType_DP[i] <= L2InRwType_DP[i];
+            L2OutUser_DP[i]   <= L2InUser_DP[i]  ;
+            L2OutLen_DP[i]    <= L2InLen_DP[i]   ;
+            L2OutId_DP[i]     <= L2InId_DP[i]    ;
+            L2OutInAddr_DP[i] <= L2InAddr_DP[i]  ;
+
+            L2OutHit_SP[i]    <= L2OutHit_SN[i]  ;
+            L2OutMiss_SP[i]   <= L2OutMiss_SN[i] ;
+            L2OutProt_SP[i]   <= L2OutProt_SN[i] ;
+            L2OutMulti_SP[i]  <= L2OutMulti_SN[i];
+            L2OutCC_SP[i]     <= L2OutCC_SN[i]   ;
+            L2OutAddr_DP[i]   <= L2OutAddr_DN[i] ;
+         end
+      end // always_ff @ (posedge Clk_CI)
+
+      always_ff @(posedge Clk_CI) begin : BUF_VALID
+        if (Rst_RBI == 0) begin
+          L1DropValid_SP[i] = 1'b0;
+          L2OutValid_SP[i]  = 1'b0;
+        end else begin
+          L1DropValid_SP[i] = L1DropValid_SN[i];
+          L2OutValid_SP[i]  = L2OutValid_SN[i];
+        end
+      end
+
+      always_comb begin : BUF_TO_PREFETCH
+        // L1 Drop Buf
+        if (L1DropUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
+          L1DropPrefetch_S[i] = 1'b1;
+        else
+          L1DropPrefetch_S[i] = 1'b0;
+
+        // L2 Out Buf
+        if (L2OutUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
+          L2OutPrefetch_S[i]  = 1'b1;
+        else
+          L2OutPrefetch_S[i]  = 1'b0;
+      end
+
+      assign l2_cache_coherent[i] = L2OutCC_SP[i];
+      assign int_miss[i]          = L2Miss_S[i];
+
+    end else begin : L2_TLB_STUB // if (ENABLE_L2TLB[i] == 1)
+
+      assign l1_ar_drop[i]        = int_rtrans_drop[i];
+      assign l1_r_drop[i]         = int_rtrans_drop[i];
+      assign l1_xw_drop[i]        = int_wtrans_drop[i];
+
+      assign l1_ar_save[i]        = 1'b0;
+      assign l1_xw_save[i]        = 1'b0;
+      assign l2_xw_accept[i]      = 1'b0;
+      assign l2_xr_drop[i]        = 1'b0;
+      assign l2_xw_drop[i]        = 1'b0;
+
+      assign l2_ar_addr[i]        =  'b0;
+      assign l2_aw_addr[i]        =  'b0;
+
+      assign l1_id_drop[i]        = int_wtrans_drop[i] ? int_awid[i] :
+                                    int_rtrans_drop[i] ? int_arid[i] :
+                                    '0;
+      assign l1_len_drop[i]       = int_wtrans_drop[i] ? int_awlen[i] :
+                                    int_rtrans_drop[i] ? int_arlen[i] :
+                                    '0;
+      assign l1_prefetch_drop[i]  = rab_prefetch[i];
+      assign l1_hit_drop[i]       = ~rab_miss[i];
+
+      assign lx_id_drop[i]        = int_wtrans_drop[i] ? int_awid[i] :
+                                    int_rtrans_drop[i] ? int_arid[i] :
+                                    '0;
+      assign lx_len_drop[i]       = int_wtrans_drop[i] ? int_awlen[i] :
+                                    int_rtrans_drop[i] ? int_arlen[i] :
+                                    '0;
+      assign lx_prefetch_drop[i]  = rab_prefetch[i];
+      assign lx_hit_drop[i]       = ~rab_miss[i];
+
+      assign l2_cache_coherent[i] = 1'b0;
+
+      assign int_miss[i]          = rab_miss[i];
+      assign int_prot[i]          = rab_prot[i];
+      assign int_multi[i]         = rab_multi[i];
+
+      // unused signals
+      assign L2Miss_S[i]          = 1'b0;
+
+      assign L1OutRwType_D[i]     = 1'b0;
+      assign L1OutProt_D[i]       = 1'b0;
+      assign L1OutMulti_D[i]      = 1'b0;
+
+      assign L1DropRwType_DP[i]   = 1'b0;
+      assign L1DropUser_DP[i]     =  'b0;
+      assign L1DropId_DP[i]       =  'b0;
+      assign L1DropLen_DP[i]      =  'b0;
+      assign L1DropAddr_DP[i]     =  'b0;
+      assign L1DropProt_DP[i]     = 1'b0;
+      assign L1DropMulti_DP[i]    = 1'b0;
+
+      assign L1DropEn_S[i]        = 1'b0;
+      assign L1DropPrefetch_S[i]  = 1'b0;
+      assign L1DropValid_SN[i]    = 1'b0;
+      assign L1DropValid_SP[i]    = 1'b0;
+
+      assign L2InRwType_DP[i]     = 1'b0;
+      assign L2InUser_DP[i]       =  'b0;
+      assign L2InId_DP[i]         =  'b0;
+      assign L2InLen_DP[i]        =  'b0;
+      assign L2InAddr_DP[i]       =  'b0;
+
+      assign L2InEn_S[i]          = 1'b0;
+
+      assign L2OutHit_SN[i]       = 1'b0;
+      assign L2OutMiss_SN[i]      = 1'b0;
+      assign L2OutProt_SN[i]      = 1'b0;
+      assign L2OutMulti_SN[i]     = 1'b0;
+      assign L2OutCC_SN[i]        = 1'b0;
+      assign L2OutAddr_DN[i]      =  'b0;
+
+      assign L2OutRwType_DP[i]    = 1'b0;
+      assign L2OutUser_DP[i]      =  'b0;
+      assign L2OutId_DP[i]        =  'b0;
+      assign L2OutLen_DP[i]       =  'b0;
+      assign L2OutInAddr_DP[i]    =  'b0;
+      assign L2OutHit_SP[i]       = 1'b0;
+      assign L2OutMiss_SP[i]      = 1'b0;
+      assign L2OutProt_SP[i]      = 1'b0;
+      assign L2OutMulti_SP[i]     = 1'b0;
+      assign L2OutCC_SP[i]        = 1'b0;
+      assign L2OutAddr_DP[i]      =  'b0;
+
+      assign L2OutEn_S[i]         = 1'b0;
+      assign L2OutPrefetch_S[i]   = 1'b0;
+      assign L2Busy_S[i]          = 1'b0;
+      assign L2OutValid_S[i]      = 1'b0;
+      assign L2OutValid_SN[i]     = 1'b0;
+      assign L2OutValid_SP[i]     = 1'b0;
+      assign L2OutReady_S[i]      = 1'b0;
+
+    end // !`ifdef ENABLE_L2TLB
+  end // for (i = 0; i < N_PORTS; i++)
+  endgenerate
+
+// }}}
+"""
+# endmodule
+#
+#
+# // vim: ts=2 sw=2 sts=2 et nosmartindent autoindent foldmethod=marker
+#
+#
diff --git a/src/soc/iommu/axi_rab/check_ram.py b/src/soc/iommu/axi_rab/check_ram.py
new file mode 100644
index 00000000..31bf32ea
--- /dev/null
+++ b/src/soc/iommu/axi_rab/check_ram.py
@@ -0,0 +1,259 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class check_ram(Elaboratable):
+    """Port-level nmigen skeleton of the SystemVerilog check_ram module.
+
+    Only the ports are declared so far: elaborate() returns an empty
+    Module.  The SystemVerilog original is kept below for reference.
+    """
+
+    def __init__(self, ADDR_WIDTH=32, RAM_DATA_WIDTH=32,
+                 SET_WIDTH=5, OFFSET_WIDTH=4):
+        """ Arguments (defaults follow the commented-out SystemVerilog
+            parameter list below):
+            * ADDR_WIDTH: width of in_addr
+            * RAM_DATA_WIDTH: width of ram_wdata
+            * SET_WIDTH: with OFFSET_WIDTH, sizes the RAM port addresses
+              and hit_addr (SET_WIDTH+OFFSET_WIDTH+1 bits each)
+            * OFFSET_WIDTH: width of offset_addr_d
+        """
+        # sv2nmigen could not translate [SET_WIDTH+OFFSET_WIDTH+1-1:0] and
+        # emitted "1+ERROR p_expression_25"; spell the width out instead.
+        ram_addr_width = SET_WIDTH + OFFSET_WIDTH + 1
+
+        self.clk_i = Signal()  # input
+        self.rst_ni = Signal()  # input
+        self.in_addr = Signal(ADDR_WIDTH)  # input
+        self.rw_type = Signal()  # input
+        self.ram_we = Signal()  # input
+        self.port0_addr = Signal(ram_addr_width)  # input
+        self.port1_addr = Signal(ram_addr_width)  # input
+        self.ram_wdata = Signal(RAM_DATA_WIDTH)  # input
+        self.output_sent = Signal()  # input
+        self.output_valid = Signal()  # input
+        self.offset_addr_d = Signal(OFFSET_WIDTH)  # input
+        self.hit_addr = Signal(ram_addr_width)  # output
+        self.master = Signal()  # output
+        self.hit = Signal()  # output
+        self.multi_hit = Signal()  # output
+        self.prot = Signal()  # output
+
+    def elaborate(self, platform=None):
+        # no logic has been translated yet
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //import CfMath::log2;
+#
+# //`define MULTI_HIT_FULL_SET
+#
+# module check_ram
+#  //#(
+#  //  parameter ADDR_WIDTH     = 32,
+#   // parameter RAM_DATA_WIDTH = 32,
+#   // parameter PAGE_SIZE      = 4096, // 4kB
+#  //  parameter SET_WIDTH      = 5,
+#   // parameter OFFSET_WIDTH   = 4
+#   // )
+#  (
+#   input  logic                                clk_i,
+#   input  logic                                rst_ni,
+#   input  logic [ADDR_WIDTH-1:0]               in_addr,
+#   input  logic                                rw_type, // 1 => write, 0=> read
+#   input  logic                                ram_we,
+#   input  logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr,
+#   input  logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr,
+#   input  logic [RAM_DATA_WIDTH-1:0]           ram_wdata,
+#   input  logic                                output_sent,
+#   input  logic                                output_valid,
+#   input  logic [OFFSET_WIDTH-1:0]             offset_addr_d,
+#   output logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr,
+#   output logic                                master,
+#   output logic                                hit,
+#   output logic                                multi_hit,
+#   output logic                                prot
+#   );
+#
+"""   #docstring_begin
+
+   localparam IGNORE_LSB = log2(PAGE_SIZE); // 12
+
+   logic [RAM_DATA_WIDTH-1:0]           port0_data_o, port1_data_o; // RAM read data outputs
+   logic                                port0_hit, port1_hit; // Ram output matches in_addr
+
+    logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr_saved, port1_addr_saved;
+
+   // Hit FSM Signals
+   typedef enum                         logic {SEARCH, HIT} hit_state_t;
+   hit_state_t                          hit_SP; // Hit FSM state
+   hit_state_t                          hit_SN; // Hit FSM next state
+
+   // Multi Hit FSM signals
+`ifdef MULTI_HIT_FULL_SET
+   typedef enum                         logic[1:0] {NO_HITS, ONE_HIT, MULTI_HIT} multi_state_t;
+   multi_state_t                        multi_SP; // Multi Hit FSM state
+   multi_state_t                        multi_SN; // Multi Hit FSM next state
+
+   logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_saved;
+   logic                                master_saved;
+`endif
+
+  //// --------------- Block RAM (Dual Port) -------------- ////
+
+  // The outputs of the BRAMs are only valid if in the previous cycle:
+  // 1. the inputs were valid, and
+  // 2. the BRAM was not written to.
+  // Otherwise, the outputs must be ignored which is controlled by the output_valid signal.
+  // This signal is driven by the upper level L2 TLB module.
+  ram_tp_no_change #(
+      .ADDR_WIDTH( SET_WIDTH+OFFSET_WIDTH+1 ),
+      .DATA_WIDTH( RAM_DATA_WIDTH           )
+    )
+    ram_tp_no_change_0
+    (
+      .clk   ( clk_i         ),
+      .we    ( ram_we        ),
+      .addr0 ( port0_addr    ),
+      .addr1 ( port1_addr    ),
+      .d_i   ( ram_wdata     ),
+      .d0_o  ( port0_data_o  ),
+      .d1_o  ( port1_data_o  )
+    );
+
+   //// Check Ram Outputs
+   assign port0_hit = (port0_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port0_data_o[RAM_DATA_WIDTH-1:4]);
+   assign port1_hit = (port1_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port1_data_o[RAM_DATA_WIDTH-1:4]);
+   //// ----------------------------------------------------- /////
+
+   //// ------------------- Check if Hit ------------------------ ////
+   // FSM
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         hit_SP <= SEARCH;
+      end else begin
+         hit_SP <= hit_SN;
+      end
+   end
+
+   always_ff @(posedge clk_i, negedge rst_ni) begin
+       if (!rst_ni) begin
+           port0_addr_saved <= '0;
+           port1_addr_saved <= '0;
+       end else begin
+           port0_addr_saved <= port0_addr;
+           port1_addr_saved <= port1_addr;
+       end
+   end
+
+   always_comb begin
+      hit_SN   = hit_SP;
+      hit      = 1'b0;
+      hit_addr = 0;
+      master   = 1'b0;
+      unique case(hit_SP)
+        SEARCH :
+          if (output_valid)
+            if (port0_hit || port1_hit) begin
+               hit_SN   = HIT;
+               hit      = 1'b1;
+               hit_addr = port0_hit ? {port0_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
+                          port1_hit ? {port1_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
+                          0;
+               master   = port0_hit ? port0_data_o[3] :
+                          port1_hit ? port1_data_o[3] :
+                          1'b0;
+            end
+
+        HIT : begin
+`ifdef MULTI_HIT_FULL_SET // Since the search continues after the first hit, it needs to be saved to be accessed later.
+           hit      = 1'b1;
+           hit_addr = hit_addr_saved;
+           master   = master_saved;
+`endif
+           if (output_sent)
+             hit_SN = SEARCH;
+        end
+
+        default : begin
+           hit_SN = SEARCH;
+        end
+      endcase // case (hit_SP)
+   end // always_comb begin
+
+   //// ------------------------------------------- ////
+
+   assign prot = output_valid && port0_hit ? ((~port0_data_o[2] && rw_type) || (~port0_data_o[1] && ~rw_type)) :
+                 output_valid && port1_hit ? ((~port1_data_o[2] && rw_type) || (~port1_data_o[1] && ~rw_type)) :
+                 1'b0;
+
+   //// ------------------- Multi ------------------- ////
+`ifdef MULTI_HIT_FULL_SET
+
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         hit_addr_saved <= 0;
+         master_saved   <= 1'b0;
+      end else if (output_valid) begin
+         hit_addr_saved <= hit_addr;
+         master_saved   <= master;
+      end
+   end
+
+   // FSM
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         multi_SP <= NO_HITS;
+      end else begin
+         multi_SP <= multi_SN;
+      end
+   end
+
+   always_comb begin
+      multi_SN  = multi_SP;
+      multi_hit = 1'b0;
+      unique case(multi_SP)
+        NO_HITS :
+          if(output_valid && (port0_hit && port1_hit)) begin
+             multi_SN  = MULTI_HIT;
+             multi_hit = 1'b1;
+          end else if(output_valid && (port0_hit || port1_hit))
+            multi_SN = ONE_HIT;
+
+        ONE_HIT :
+          if(output_valid && (port0_hit || port1_hit)) begin
+             multi_SN  = MULTI_HIT;
+             multi_hit = 1'b1;
+          end else if (output_sent)
+            multi_SN = NO_HITS;
+
+        MULTI_HIT : begin
+          multi_hit = 1'b1;
+           if (output_sent)
+             multi_SN = NO_HITS;
+        end
+
+      endcase // case (multi_SP)
+   end // always_comb begin
+
+`else // !`ifdef MULTI_HIT_FULL_SET
+   assign multi_hit = output_valid && port0_hit && port1_hit;
+`endif // !`ifdef MULTI_HIT_FULL_SET
+   //// ------------------------------------------- ////
+"""
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/coreconfig.py b/src/soc/iommu/axi_rab/coreconfig.py
new file mode 100644
index 00000000..247d0ce3
--- /dev/null
+++ b/src/soc/iommu/axi_rab/coreconfig.py
@@ -0,0 +1,6 @@
+class CoreConfig:
+    """Fixed size and address-width constants, set per instance."""
+    def __init__(self):
+        slice_count = 16
+        self.N_SLICES, self.N_REGS = slice_count, 4 * slice_count
+        self.ADDR_WIDTH_PHYS, self.ADDR_WIDTH_VIRT = 40, 32
diff --git a/src/soc/iommu/axi_rab/fsm.py b/src/soc/iommu/axi_rab/fsm.py
new file mode 100644
index 00000000..d64b1cb4
--- /dev/null
+++ b/src/soc/iommu/axi_rab/fsm.py
@@ -0,0 +1,257 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class fsm(Elaboratable):
+    """Port-level nmigen skeleton of the SystemVerilog fsm module.
+
+    Only the ports are declared so far: elaborate() returns an empty
+    Module.  The SystemVerilog original is kept below for reference.
+    """
+
+    def __init__(self, AXI_M_ADDR_WIDTH=40, AXI_S_ADDR_WIDTH=32,
+                 AXI_ID_WIDTH=8, AXI_USER_WIDTH=6):
+        """ Arguments (defaults are those of the SystemVerilog
+            parameters of the same name, see below):
+            * AXI_M_ADDR_WIDTH: width of out_addr_i / out_addr_o
+            * AXI_S_ADDR_WIDTH: width of in_addr_i / in_addr_o
+            * AXI_ID_WIDTH: width of in_id_i / in_id_o
+            * AXI_USER_WIDTH: width of in_user_i / in_user_o
+        """
+        self.Clk_CI = Signal()  # input
+        self.Rst_RBI = Signal()  # input
+        self.port1_addr_valid_i = Signal()  # input
+        self.port2_addr_valid_i = Signal()  # input
+        self.port1_sent_i = Signal()  # input
+        self.port2_sent_i = Signal()  # input
+        self.select_i = Signal()  # input
+        self.no_hit_i = Signal()  # input
+        self.multi_hit_i = Signal()  # input
+        self.no_prot_i = Signal()  # input
+        self.prefetch_i = Signal()  # input
+        self.out_addr_i = Signal(AXI_M_ADDR_WIDTH)  # input
+        self.cache_coherent_i = Signal()  # input
+        self.port1_accept_o = Signal()  # output
+        self.port1_drop_o = Signal()  # output
+        self.port1_miss_o = Signal()  # output
+        self.port2_accept_o = Signal()  # output
+        self.port2_drop_o = Signal()  # output
+        self.port2_miss_o = Signal()  # output
+        self.out_addr_o = Signal(AXI_M_ADDR_WIDTH)  # output
+        self.cache_coherent_o = Signal()  # output
+        self.miss_o = Signal()  # output
+        self.multi_o = Signal()  # output
+        self.prot_o = Signal()  # output
+        self.prefetch_o = Signal()  # output
+        self.in_addr_i = Signal(AXI_S_ADDR_WIDTH)  # input
+        self.in_id_i = Signal(AXI_ID_WIDTH)  # input
+        self.in_len_i = Signal(8)  # input
+        self.in_user_i = Signal(AXI_USER_WIDTH)  # input
+        self.in_addr_o = Signal(AXI_S_ADDR_WIDTH)  # output
+        self.in_id_o = Signal(AXI_ID_WIDTH)  # output
+        self.in_len_o = Signal(8)  # output
+        self.in_user_o = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        # no logic has been translated yet
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //`timescale 1ns / 1ps
+#
+# module fsm
+#  #(
+#    parameter AXI_M_ADDR_WIDTH = 40,
+#    parameter AXI_S_ADDR_WIDTH = 32,
+#    parameter AXI_ID_WIDTH     = 8,
+#    parameter AXI_USER_WIDTH   = 6
+#  )
+#  (
+#    input  logic                        Clk_CI,
+#    input  logic                        Rst_RBI,
+#
+#    input  logic                        port1_addr_valid_i,
+#    input  logic                        port2_addr_valid_i,
+#    input  logic                        port1_sent_i,
+#    input  logic                        port2_sent_i,
+#    input  logic                        select_i,
+#    input  logic                        no_hit_i,
+#    input  logic                        multi_hit_i,
+#    input  logic                        no_prot_i,
+#    input  logic                        prefetch_i,
+#    input  logic [AXI_M_ADDR_WIDTH-1:0] out_addr_i,
+#    input  logic                        cache_coherent_i,
+#    output logic                        port1_accept_o,
+#    output logic                        port1_drop_o,
+#    output logic                        port1_miss_o,
+#    output logic                        port2_accept_o,
+#    output logic                        port2_drop_o,
+#    output logic                        port2_miss_o,
+#    output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o,
+#    output logic                        cache_coherent_o,
+#    output logic                        miss_o,
+#    output logic                        multi_o,
+#    output logic                        prot_o,
+#    output logic                        prefetch_o,
+#    input  logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
+#    input  logic     [AXI_ID_WIDTH-1:0] in_id_i,
+#    input  logic                  [7:0] in_len_i,
+#    input  logic   [AXI_USER_WIDTH-1:0] in_user_i,
+#    output logic [AXI_S_ADDR_WIDTH-1:0] in_addr_o,
+#    output logic     [AXI_ID_WIDTH-1:0] in_id_o,
+#    output logic                  [7:0] in_len_o,
+#    output logic   [AXI_USER_WIDTH-1:0] in_user_o
+#  );
+#
+"""  #docstring_begin
+
+  //-------------Internal Signals----------------------
+
+  typedef enum logic           {IDLE, WAIT} state_t;
+  logic                        state_SP; // Present state
+  logic                        state_SN; // Next State
+
+  logic                        port1_accept_SN;
+  logic                        port1_drop_SN;
+  logic                        port1_miss_SN;
+  logic                        port2_accept_SN;
+  logic                        port2_drop_SN;
+  logic                        port2_miss_SN;
+  logic                        miss_SN;
+  logic                        multi_SN;
+  logic                        prot_SN;
+  logic                        prefetch_SN;
+  logic                        cache_coherent_SN;
+  logic [AXI_M_ADDR_WIDTH-1:0] out_addr_DN;
+
+  logic                        out_reg_en_S;
+
+  //----------FSM comb------------------------------
+
+  always_comb begin: FSM_COMBO
+    state_SN          = state_SP;
+
+    port1_accept_SN   = 1'b0;
+    port1_drop_SN     = 1'b0;
+    port1_miss_SN     = 1'b0;
+    port2_accept_SN   = 1'b0;
+    port2_drop_SN     = 1'b0;
+    port2_miss_SN     = 1'b0;
+    miss_SN           = 1'b0;
+    multi_SN          = 1'b0;
+    prot_SN           = 1'b0;
+    prefetch_SN       = 1'b0;
+    cache_coherent_SN = 1'b0;
+    out_addr_DN       =   '0;
+
+    out_reg_en_S      = 1'b0; // by default hold register output
+
+    unique case(state_SP)
+        IDLE :
+          if ( (port1_addr_valid_i & select_i) | (port2_addr_valid_i & ~select_i) ) begin
+            out_reg_en_S = 1'b1;
+            state_SN     = WAIT;
+
+            // Select inputs for output registers
+            if          (port1_addr_valid_i & select_i) begin
+              port1_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+              port1_drop_SN   =  (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+              port1_miss_SN   =   no_hit_i;
+              port2_accept_SN = 1'b0;
+              port2_drop_SN   = 1'b0;
+              port2_miss_SN   = 1'b0;
+            end else if (port2_addr_valid_i & ~select_i) begin
+              port1_accept_SN = 1'b0;
+              port1_drop_SN   = 1'b0;
+              port1_miss_SN   = 1'b0;
+              port2_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+              port2_drop_SN   =  (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+              port2_miss_SN   =   no_hit_i;
+            end
+
+            miss_SN           = port1_miss_SN | port2_miss_SN;
+            multi_SN          = multi_hit_i;
+            prot_SN           = ~no_prot_i;
+            prefetch_SN       = ~no_hit_i & prefetch_i;
+
+            cache_coherent_SN = cache_coherent_i;
+            out_addr_DN       = out_addr_i;
+          end
+
+        WAIT :
+          if ( port1_sent_i | port2_sent_i ) begin
+            out_reg_en_S = 1'b1; // "clear" the register
+            state_SN     = IDLE;
+          end
+
+        default : begin
+           state_SN      = IDLE;
+        end
+      endcase
+    end
+
+  //----------FSM seq-------------------------------
+
+  always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: FSM_SEQ
+    if (Rst_RBI == 1'b0)
+      state_SP <= IDLE;
+    else
+      state_SP <= state_SN;
+  end
+
+  //----------Output seq--------------------------
+
+  always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: OUTPUT_SEQ
+    if (Rst_RBI == 1'b0) begin
+      port1_accept_o   = 1'b0;
+      port1_drop_o     = 1'b0;
+      port1_miss_o     = 1'b0;
+      port2_accept_o   = 1'b0;
+      port2_drop_o     = 1'b0;
+      port2_miss_o     = 1'b0;
+      miss_o           = 1'b0;
+      multi_o          = 1'b0;
+      prot_o           = 1'b0;
+      prefetch_o       = 1'b0;
+      cache_coherent_o = 1'b0;
+      out_addr_o       =   '0;
+      in_addr_o        =   '0;
+      in_id_o          =   '0;
+      in_len_o         =   '0;
+      in_user_o        =   '0;
+    end else if (out_reg_en_S == 1'b1) begin
+      port1_accept_o   = port1_accept_SN;
+      port1_drop_o     = port1_drop_SN;
+      port1_miss_o     = port1_miss_SN;
+      port2_accept_o   = port2_accept_SN;
+      port2_drop_o     = port2_drop_SN;
+      port2_miss_o     = port2_miss_SN;
+      miss_o           = miss_SN;
+      multi_o          = multi_SN;
+      prot_o           = prot_SN;
+      prefetch_o       = prefetch_SN;
+      cache_coherent_o = cache_coherent_SN;
+      out_addr_o       = out_addr_DN;
+      in_addr_o        = in_addr_i;
+      in_id_o          = in_id_i;
+      in_len_o         = in_len_i;
+      in_user_o        = in_user_i;
+    end
+  end // block: OUTPUT_SEQ
+"""
+#
+# endmodule
+#
+#
diff --git a/src/soc/iommu/axi_rab/l2_tlb.py b/src/soc/iommu/axi_rab/l2_tlb.py
new file mode 100644
index 00000000..11983f64
--- /dev/null
+++ b/src/soc/iommu/axi_rab/l2_tlb.py
@@ -0,0 +1,550 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class l2_tlb(Elaboratable):  # port-only stub of the SystemVerilog l2_tlb module quoted below in this file
+
+    def __init__(self):  # NOTE(review): the AXI_*_WIDTH names used below are not defined or imported in this file - TODO define them
+        self.clk_i = Signal()  # input
+        self.rst_ni = Signal()  # input
+        self.we_i = Signal()  # input: write enable, gates the SV ram_we / pa_ram_we
+        self.waddr_i = Signal(AXI_LITE_ADDR_WIDTH)  # input: write address
+        self.wdata_i = Signal(AXI_LITE_DATA_WIDTH)  # input: write data
+        self.start_i = Signal()  # input: moves the SV search FSM from IDLE to SEARCH
+        self.busy_o = Signal()  # output: set in the SV SEARCH and DONE states
+        self.in_addr_i = Signal(AXI_S_ADDR_WIDTH)  # input: address to look up
+        self.rw_type_i = Signal()  # input: 1 => write, 0 => read
+        self.out_ready_i = Signal()  # input: consumer ready, paired with out_valid_o
+        self.out_valid_o = Signal()  # output: set in the SV SEND_OUTPUT state
+        self.hit_o = Signal()  # output
+        self.miss_o = Signal()  # output
+        self.prot_o = Signal()  # output
+        self.multi_o = Signal()  # output
+        self.cache_coherent_o = Signal()  # output
+        self.out_addr_o = Signal(AXI_M_ADDR_WIDTH)  # output
+
+    def elaborate(self, platform=None):  # returns an empty Module: none of the SV logic is translated yet
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //`include "pulp_soc_defines.sv"
+#
+# ////import CfMath::log2;
+#
+# //`define MULTI_HIT_FULL_SET  // Enable full multi hit detection. Always the entire set is searched.
+# //`define MULTI_HIT_CUR_CYCLE // Enable partial multi hit detection. Only multi hits in the same search cycle are detected.
+#
+# //`ifdef MULTI_HIT_FULL_SET
+# //  `ifndef MULTI_HIT_CUR_CYCLE
+# //    `define MULTI_HIT_CUR_CYCLE
+# //  `endif
+# //`endif
+#
+# module l2_tlb
+#  //#(
+#  //  parameter AXI_S_ADDR_WIDTH       = 32,
+#   // parameter AXI_M_ADDR_WIDTH       = 40,
+#  //  parameter AXI_LITE_DATA_WIDTH    = 64,
+#   // parameter AXI_LITE_ADDR_WIDTH    = 32,
+#   // parameter N_SETS                 = 32,
+#   // parameter N_OFFSETS              = 4, //per port. There are 2 ports.
+#  //  parameter PAGE_SIZE              = 4096, // 4kB
+#  //  parameter N_PAR_VA_RAMS          = 4,
+#  //  parameter HIT_OFFSET_STORE_WIDTH = 2 // Num of bits of VA RAM offset stored. This should not be greater than OFFSET_WIDTH
+#  //  )
+#   (
+#    input  logic                           clk_i,
+#    input  logic                           rst_ni,
+#
+#    input  logic                           we_i,
+#    input  logic [AXI_LITE_ADDR_WIDTH-1:0] waddr_i,
+#    input  logic [AXI_LITE_DATA_WIDTH-1:0] wdata_i,
+#
+#    input  logic                           start_i,
+#    output logic                           busy_o,
+#    input  logic    [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
+#    input  logic                           rw_type_i, //1 => write, 0=> read
+#
+#    input  logic                           out_ready_i,
+#    output logic                           out_valid_o,
+#    output logic                           hit_o,
+#    output logic                           miss_o,
+#    output logic                           prot_o,
+#    output logic                           multi_o,
+#    output logic                           cache_coherent_o,
+#    output logic    [AXI_M_ADDR_WIDTH-1:0] out_addr_o
+#    );
+#
+"""    #docstring_begin
+
+   localparam VA_RAM_DEPTH      = N_SETS * N_OFFSETS * 2;
+   localparam PA_RAM_DEPTH      = VA_RAM_DEPTH * N_PAR_VA_RAMS;
+   localparam VA_RAM_ADDR_WIDTH = log2(VA_RAM_DEPTH);
+   localparam PA_RAM_ADDR_WIDTH = log2(PA_RAM_DEPTH);
+   localparam SET_WIDTH         = log2(N_SETS);
+   localparam OFFSET_WIDTH      = log2(N_OFFSETS);
+   localparam LL_WIDTH          = log2(N_PAR_VA_RAMS);
+   localparam IGNORE_LSB        = log2(PAGE_SIZE);
+
+   localparam VA_RAM_DATA_WIDTH = AXI_S_ADDR_WIDTH - IGNORE_LSB + 4;
+   localparam PA_RAM_DATA_WIDTH = AXI_M_ADDR_WIDTH - IGNORE_LSB;
+
+   logic                               [N_PAR_VA_RAMS-1:0] hit, prot, multi_hit, cache_coherent;
+   logic                               [N_PAR_VA_RAMS-1:0] ram_we;
+   logic                                                   last_search, last_search_next;
+   logic                                                   first_search, first_search_next;
+   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] ram_waddr;
+   logic [N_PAR_VA_RAMS-1:0][SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr;
+   logic                                                   pa_ram_we;
+   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr, pa_port0_waddr; // PA RAM read, Write addr;
+   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr_reg_SN, pa_port0_raddr_reg_SP; // registered addresses, needed for WAIT_ON_WRITE;
+   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_addr; // PA RAM addr
+   logic                           [PA_RAM_DATA_WIDTH-1:0] pa_port0_data, pa_data, pa_port0_data_reg; // PA RAM data
+   logic                                                   pa_ram_store_data_SN, pa_ram_store_data_SP;
+   logic                                                   hit_top, prot_top, multi_hit_top, first_hit_top;
+   logic                                                   output_sent;
+   int                                                     hit_block_num;
+
+   logic                                                   searching, search_done;
+   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr, port0_raddr; // VA RAM port0 addr
+   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr; // VA RAM port1 addr
+   logic                                [OFFSET_WIDTH-1:0] offset_addr, offset_addr_d;
+   logic                                [OFFSET_WIDTH-1:0] offset_start_addr, offset_end_addr;
+   logic                                   [SET_WIDTH-1:0] set_num;
+
+   logic                                                   va_output_valid;
+   logic                                                   searching_q;
+
+   genvar                                                  z;
+
+   // Search FSM
+   typedef enum logic                                [1:0] {IDLE, SEARCH, DONE} search_state_t;
+   search_state_t                                          search_SP; // Present state
+   search_state_t                                          search_SN; // Next State
+
+   // Output FSM
+   typedef enum logic                                [1:0] {OUT_IDLE, SEND_OUTPUT, WAIT_ON_WRITE} out_state_t;
+   out_state_t                                             out_SP; // Present state
+   out_state_t                                             out_SN; // Next State
+
+   logic                                                   miss_next;
+   logic                                                   hit_next;
+   logic                                                   prot_next;
+   logic                                                   multi_next;
+   logic                                                   cache_coherent_next;
+
+   // Generate the VA Block rams and their surrounding logic
+   generate
+      for (z = 0; z < N_PAR_VA_RAMS; z++) begin : VA_RAMS
+         check_ram
+           #(
+             .ADDR_WIDTH     ( AXI_S_ADDR_WIDTH  ),
+             .RAM_DATA_WIDTH ( VA_RAM_DATA_WIDTH ),
+             .PAGE_SIZE      ( PAGE_SIZE         ),
+             .SET_WIDTH      ( SET_WIDTH         ),
+             .OFFSET_WIDTH   ( OFFSET_WIDTH      )
+             )
+         u_check_ram
+             (
+              .clk_i         ( clk_i                          ),
+              .rst_ni        ( rst_ni                         ),
+              .in_addr       ( in_addr_i                      ),
+              .rw_type       ( rw_type_i                      ),
+              .ram_we        ( ram_we[z]                      ),
+              .port0_addr    ( port0_addr                     ),
+              .port1_addr    ( port1_addr                     ),
+              .ram_wdata     ( wdata_i[VA_RAM_DATA_WIDTH-1:0] ),
+              .output_sent   ( output_sent                    ),
+              .output_valid  ( va_output_valid                ),
+              .offset_addr_d ( offset_addr_d                  ),
+              .hit_addr      ( hit_addr[z]                    ),
+              .master        ( cache_coherent[z]              ),
+              .hit           ( hit[z]                         ),
+              .multi_hit     ( multi_hit[z]                   ),
+              .prot          ( prot[z]                        )
+              );
+      end // for (z = 0; z < N_PAR_VA_RAMS; z++)
+   endgenerate
+
+   ////////////////// ---------------- Control and Address --------------- ////////////////////////
+   // FSM
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         search_SP <= IDLE;
+      end else begin
+         search_SP <= search_SN;
+      end
+   end
+
+   always_comb begin : SEARCH_FSM
+      search_SN         = search_SP;
+      busy_o            = 1'b0;
+      searching         = 1'b0;
+      search_done       = 1'b0;
+      last_search_next  = 1'b0;
+      first_search_next = first_search;
+
+      unique case (search_SP)
+        IDLE : begin
+          if (start_i) begin
+            search_SN         = SEARCH;
+            first_search_next = 1'b1;
+          end
+        end
+
+        SEARCH : begin
+          busy_o = 1'b1;
+
+          // detect last search cycle
+          if ( (first_search == 1'b0) && (offset_addr == offset_end_addr) )
+             last_search_next  = 1'b1;
+
+          // pause search during VA RAM reconfiguration
+          if (|ram_we) begin
+             searching         = 1'b0;
+          end else begin
+             searching         = 1'b1;
+             first_search_next = 1'b0;
+          end
+
+          if (va_output_valid) begin
+            // stop search
+`ifdef MULTI_HIT_FULL_SET
+            if (last_search | prot_top | multi_hit_top) begin
+`else
+            if (last_search | prot_top | multi_hit_top | hit_top ) begin
+`endif
+              search_SN      = DONE;
+              search_done    = 1'b1;
+            end
+          end
+        end
+
+        DONE : begin
+          busy_o = 1'b1;
+          if (out_valid_o & out_ready_i)
+            search_SN = IDLE;
+        end
+
+        default : begin
+          search_SN = IDLE;
+        end
+      endcase // case (search_SP)
+   end // always_comb begin
+
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         last_search  <= 1'b0;
+         first_search <= 1'b0;
+      end else begin
+         last_search  <= last_search_next;
+         first_search <= first_search_next;
+      end
+   end
+
+   /*
+    * VA RAM address generation
+    *
+    * The input address and set number, and thus the offset start address, are available in the
+    * cycle after the start signal. The buffered offset_addr becomes available one cycle later.
+    * During the first search cycle, we therefore directly use offset_start_addr for the lookup.
+    */
+   assign set_num = in_addr_i[SET_WIDTH+IGNORE_LSB -1 : IGNORE_LSB];
+
+   assign port0_raddr[OFFSET_WIDTH] = 1'b0;
+   assign port1_addr [OFFSET_WIDTH] = 1'b1;
+
+   assign port0_raddr[OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
+   assign port1_addr [OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
+
+   assign port0_raddr[SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
+   assign port1_addr [SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
+
+   assign port0_addr = ram_we ? ram_waddr : port0_raddr;
+
+   // The outputs of the BRAMs are only valid if in the previous cycle:
+   // 1. the inputs were valid, and
+   // 2. the BRAMs were not written to.
+   // Otherwise, the outputs must be ignored.
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         searching_q <= 1'b0;
+      end else begin
+         searching_q <= searching;
+      end
+   end
+   assign va_output_valid = searching_q;
+
+   // Address offset for looking up the VA RAMs
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         offset_addr   <= 0;
+      end else if (first_search) begin
+         offset_addr <= offset_start_addr + 1'b1;
+      end else if (searching) begin
+         offset_addr <= offset_addr + 1'b1;
+      end
+   end
+
+   // Delayed address offset for looking up the PA RAM upon a hit in the VA RAMs
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         offset_addr_d <= 0;
+      end else if (first_search) begin
+         offset_addr_d <= offset_start_addr;
+      end else if (searching) begin
+         offset_addr_d <= offset_addr_d + 1'b1;
+      end
+   end
+
+   // Store the offset addr for hit to reduce latency for next search.
+   generate
+      if (HIT_OFFSET_STORE_WIDTH > 0) begin : OFFSET_STORE
+`ifndef MULTI_HIT_FULL_SET
+         logic [N_SETS-1:0][HIT_OFFSET_STORE_WIDTH-1:0] hit_offset_addr; // Contains offset addr for previous hit for every SET.
+         logic [SET_WIDTH+OFFSET_WIDTH+1-1:0]           hit_addr_reg;
+
+         assign offset_start_addr = { hit_offset_addr[set_num] , {{OFFSET_WIDTH-HIT_OFFSET_STORE_WIDTH}{1'b0}} };
+         assign offset_end_addr   =   hit_offset_addr[set_num]-1'b1;
+
+         // Register the hit addr
+         always_ff @(posedge clk_i) begin
+            if (rst_ni == 0) begin
+               hit_addr_reg <= 0;
+            end else if (hit_top) begin
+               hit_addr_reg <= hit_addr[hit_block_num];
+            end
+         end
+
+         // Store hit addr for each set. The next search in the same set will start from the saved addr.
+         always_ff @(posedge clk_i) begin
+            if (rst_ni == 0) begin
+               hit_offset_addr <= 0;
+            end else if (hit_o) begin
+               hit_offset_addr[set_num][HIT_OFFSET_STORE_WIDTH-1:0] <= hit_addr_reg[OFFSET_WIDTH-1 : (OFFSET_WIDTH - HIT_OFFSET_STORE_WIDTH)];
+            end
+         end
+`else // No need to store offset if full multi hit detection is enabled because the entire SET is searched.
+         assign offset_start_addr = 0;
+         assign offset_end_addr   = {OFFSET_WIDTH{1'b1}};
+`endif
+      end else begin // if (HIT_OFFSET_STORE_WIDTH > 0)
+         assign offset_start_addr = 0;
+         assign offset_end_addr   = {OFFSET_WIDTH{1'b1}};
+      end
+   endgenerate
+
+   assign prot_top = |prot;
+
+   //////////////////////////////////////////////////////////////////////////////////////
+   // check for hit, multi hit
+   // In case of a multi hit, the hit_block_num indicates the lowest VA RAM with a hit.
+   // In case of a multi hit in the same VA RAM, Port 0 is given priority.
+   always_comb begin : HIT_CHECK
+      hit_top       = |hit;
+      hit_block_num = 0;
+      first_hit_top = 1'b0;
+      multi_hit_top = 1'b0;
+      for (int i=N_PAR_VA_RAMS-1; i>=0; i--) begin
+        if (hit[i] == 1'b1) begin
+`ifdef MULTI_HIT_CUR_CYCLE
+          if (multi_hit[i] | first_hit_top ) begin
+            multi_hit_top = 1'b1;
+          end
+`endif
+          first_hit_top = 1'b1;
+          hit_block_num = i;
+        end
+      end // for (int i=N_PAR_VA_RAMS-1; i>=0; i--)
+   end // always_comb begin
+
+   ///////////////////// ------------- Outputs ------------ //////////////////////////////////
+   //// FSM
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         out_SP                     <= OUT_IDLE;
+         pa_ram_store_data_SP       <= 1'b0;
+         pa_port0_raddr_reg_SP      <=  'b0;
+      end else begin
+         out_SP                     <= out_SN;
+         pa_ram_store_data_SP       <= pa_ram_store_data_SN;
+         pa_port0_raddr_reg_SP      <= pa_port0_raddr_reg_SN;
+      end
+   end
+
+   always_comb begin : OUTPUT_FSM
+      out_SN                   = out_SP;
+
+      miss_next                = miss_o;
+      prot_next                = prot_o;
+      multi_next               = multi_o;
+      hit_next                 = hit_o;
+      cache_coherent_next      = cache_coherent_o;
+      pa_port0_raddr_reg_SN    = pa_port0_raddr_reg_SP;
+
+      pa_port0_raddr           =  'b0;
+      pa_ram_store_data_SN     = 1'b0;
+
+      out_valid_o              = 1'b0;
+      output_sent              = 1'b0;
+
+      unique case (out_SP)
+        OUT_IDLE : begin
+           hit_next            = 1'b0;
+           miss_next           = 1'b0;
+           prot_next           = 1'b0;
+           multi_next          = 1'b0;
+           cache_coherent_next = 1'b0;
+
+          // abort transaction
+          if         ((search_done & ~hit_top) | prot_top | multi_hit_top) begin
+             out_SN = SEND_OUTPUT;
+
+             if (search_done & ~hit_top) begin
+                miss_next  = 1'b1;
+             end
+             if (prot_top) begin
+                prot_next  = 1'b1;
+                hit_next   = 1'b1;
+             end
+             if (multi_hit_top) begin
+                multi_next = 1'b1;
+                hit_next   = 1'b1;
+             end
+
+          // read PA RAM
+          end else if (search_done & hit_top) begin
+             hit_next              = 1'b1;
+             cache_coherent_next   = cache_coherent[hit_block_num];
+             pa_port0_raddr        = (N_PAR_VA_RAMS * hit_addr[hit_block_num]) + hit_block_num;
+             pa_port0_raddr_reg_SN = pa_port0_raddr;
+
+             // read PA RAM now
+             if (~pa_ram_we) begin
+                out_SN               = SEND_OUTPUT;
+                pa_ram_store_data_SN = 1'b1;
+
+             // read PA RAM after PA RAM reconfiguration
+             end else begin // pa_ram_we
+                out_SN               = WAIT_ON_WRITE;
+
+             end
+          end
+        end
+
+        WAIT_ON_WRITE : begin
+          if ( ~pa_ram_we ) begin
+             out_SN               = SEND_OUTPUT;
+             pa_port0_raddr       = pa_port0_raddr_reg_SP;
+             pa_ram_store_data_SN = 1'b1;
+          end
+        end
+
+        SEND_OUTPUT : begin
+           out_valid_o  = 1'b1;
+           if (out_ready_i) begin
+              out_SN      = OUT_IDLE;
+              output_sent = 1'b1;
+           end
+        end
+
+        default : begin
+           out_SN = OUT_IDLE;
+        end
+
+      endcase // case (out_SP)
+   end // always_comb begin
+
+   //// Output signals
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         miss_o           <= 1'b0;
+         prot_o           <= 1'b0;
+         multi_o          <= 1'b0;
+         hit_o            <= 1'b0;
+         cache_coherent_o <= 1'b0;
+      end else begin
+         miss_o           <= miss_next;
+         prot_o           <= prot_next;
+         multi_o          <= multi_next;
+         hit_o            <= hit_next;
+         cache_coherent_o <= cache_coherent_next;
+      end
+   end
+
+   ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+  ///////////////////// --------------- Physical Address -------------- ////////////////////////////
+
+  /// PA Block RAM
+  ram_tp_no_change #(
+        .ADDR_WIDTH( PA_RAM_ADDR_WIDTH ),
+        .DATA_WIDTH( PA_RAM_DATA_WIDTH )
+        )
+  pa_ram
+    (
+      .clk   ( clk_i                          ),
+      .we    ( pa_ram_we                      ),
+      .addr0 ( pa_port0_addr                  ),
+      .addr1 ( '0                             ),
+      .d_i   ( wdata_i[PA_RAM_DATA_WIDTH-1:0] ),
+      .d0_o  ( pa_port0_data                  ),
+      .d1_o  (                                )
+    );
+
+   assign out_addr_o[IGNORE_LSB-1:0]                = in_addr_i[IGNORE_LSB-1:0];
+   assign out_addr_o[AXI_M_ADDR_WIDTH-1:IGNORE_LSB] = pa_data;
+
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         pa_port0_data_reg <= 0;
+      end else if (pa_ram_store_data_SP) begin
+         pa_port0_data_reg <= pa_port0_data;
+      end
+   end
+
+   assign pa_data = pa_ram_store_data_SP ? pa_port0_data : pa_port0_data_reg;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+///// Write enable for all block rams
+generate if (LL_WIDTH != 0) begin
+   always_comb begin
+      var reg[LL_WIDTH:0] para;
+      var int             para_int;
+      for (para = 0; para < N_PAR_VA_RAMS; para=para+1'b1) begin
+        para_int         = int'(para);
+        ram_we[para_int] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0) && (waddr_i[LL_WIDTH-1:0] == para);
+      end
+   end
+end else begin
+   assign ram_we[0] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0);
+end
+
+endgenerate
+
+// Addresses are word, not byte addresses
+assign pa_ram_we      = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b1); //waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] will be 0 for all VA writes and 1 for all PA writes
+assign ram_waddr      = waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH-1:LL_WIDTH];
+assign pa_port0_waddr = waddr_i[PA_RAM_ADDR_WIDTH-1:0];
+assign pa_port0_addr  = pa_ram_we ? pa_port0_waddr : pa_port0_raddr;
+
+"""
+# endmodule
+#
+# // vim: ts=3 sw=3 sts=3 et nosmartindent autoindent foldmethod=marker tw=100
+#
+#
diff --git a/src/soc/iommu/axi_rab/rab_core.py b/src/soc/iommu/axi_rab/rab_core.py
new file mode 100644
index 00000000..7d7494aa
--- /dev/null
+++ b/src/soc/iommu/axi_rab/rab_core.py
@@ -0,0 +1,539 @@
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+
+# this file has been generated by sv2nmigen
+
+#
+# //`include "pulp_soc_defines.sv"
+#
+# ////import CfMath::log2;
+#
+# //`define MY_ARRAY_SUM(MY_ARRAY,ARRAY_SIZE) ( (ARRAY_SIZE==1) ? MY_ARRAY[0] : (ARRAY_SIZE==2) ? MY_ARRAY[0] + MY_ARRAY[1] : (ARRAY_SIZE==3) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] : (ARRAY_SIZE==4) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] + MY_ARRAY[3] : 0 )
+#
+
+# module rab_core
+#  #(
+#    parameter N_PORTS             =  3,
+#    parameter N_L2_SETS           = 32,
+#    parameter N_L2_SET_ENTRIES    = 32,
+#    parameter AXI_DATA_WIDTH      = 64,
+#    parameter AXI_S_ADDR_WIDTH    = 32,
+#    parameter AXI_M_ADDR_WIDTH    = 40,
+#    parameter AXI_LITE_DATA_WIDTH = 64,
+#    parameter AXI_LITE_ADDR_WIDTH = 32,
+#    parameter AXI_ID_WIDTH        =  8,
+#    parameter AXI_USER_WIDTH      =  6,
+#    parameter MH_FIFO_DEPTH       = 16
+#    )
+#   (
+#    input  logic                                         Clk_CI,
+#    input  logic                                         Rst_RBI,
+#
+#    input  logic               [AXI_LITE_ADDR_WIDTH-1:0] s_axi_awaddr,
+#    input  logic                                         s_axi_awvalid,
+#    output logic                                         s_axi_awready,
+#
+#    input  logic               [AXI_LITE_DATA_WIDTH-1:0] s_axi_wdata,
+#    input  logic             [AXI_LITE_DATA_WIDTH/8-1:0] s_axi_wstrb,
+#    input  logic                                         s_axi_wvalid,
+#    output logic                                         s_axi_wready,
+#
+#    input  logic               [AXI_LITE_ADDR_WIDTH-1:0] s_axi_araddr,
+#    input  logic                                         s_axi_arvalid,
+#    output logic                                         s_axi_arready,
+#
+#    input  logic                                         s_axi_rready,
+#    output logic               [AXI_LITE_DATA_WIDTH-1:0] s_axi_rdata,
+#    output logic                                   [1:0] s_axi_rresp,
+#    output logic                                         s_axi_rvalid,
+#
+#    output logic                                   [1:0] s_axi_bresp,
+#    output logic                                         s_axi_bvalid,
+#    input  logic                                         s_axi_bready,
+#
+#    output logic [N_PORTS-1:0]                           int_miss,
+#    output logic [N_PORTS-1:0]                           int_prot,
+#    output logic [N_PORTS-1:0]                           int_multi,
+#    output logic [N_PORTS-1:0]                           int_prefetch,
+#    output logic                                         int_mhf_full,
+#
+#    output logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] int_axaddr_o,
+#    output logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] int_axid_o,
+#    output logic [N_PORTS-1:0]                     [7:0] int_axlen_o,
+#    output logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] int_axuser_o,
+#
+#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] port1_addr,
+#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] port1_id,
+#    input  logic [N_PORTS-1:0]                     [7:0] port1_len,
+#    input  logic [N_PORTS-1:0]                     [2:0] port1_size,
+#    input  logic [N_PORTS-1:0]                           port1_addr_valid,
+#    input  logic [N_PORTS-1:0]                           port1_type,
+#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] port1_user,
+#    input  logic [N_PORTS-1:0]                           port1_sent,
+#    output logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] port1_out_addr,
+#    output logic [N_PORTS-1:0]                           port1_cache_coherent,
+#    output logic [N_PORTS-1:0]                           port1_accept,
+#    output logic [N_PORTS-1:0]                           port1_drop,
+#    output logic [N_PORTS-1:0]                           port1_miss,
+#
+#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] port2_addr,
+#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] port2_id,
+#    input  logic [N_PORTS-1:0]                     [7:0] port2_len,
+#    input  logic [N_PORTS-1:0]                     [2:0] port2_size,
+#    input  logic [N_PORTS-1:0]                           port2_addr_valid,
+#    input  logic [N_PORTS-1:0]                           port2_type,
+#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] port2_user,
+#    input  logic [N_PORTS-1:0]                           port2_sent,
+#    output logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] port2_out_addr,
+#    output logic [N_PORTS-1:0]                           port2_cache_coherent,
+#    output logic [N_PORTS-1:0]                           port2_accept,
+#    output logic [N_PORTS-1:0]                           port2_drop,
+#    output logic [N_PORTS-1:0]                           port2_miss,
+#
+#    input  logic [N_PORTS-1:0]                           miss_l2_i,
+#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] miss_l2_addr_i,
+#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] miss_l2_id_i,
+#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] miss_l2_user_i,
+#
+#    output logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] wdata_l2_o,
+#    output logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] waddr_l2_o,
+#    output logic [N_PORTS-1:0]                           wren_l2_o
+#    );
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class rab_core(Elaboratable):  # port-only stub of the SystemVerilog rab_core module header quoted above
+
+    def __init__(self):  # NOTE(review): AXI_* widths, N_PORTS and FIXME are not defined in the visible part of this file - TODO confirm/define
+        self.s_axi_awaddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
+        self.s_axi_awvalid = Signal()  # input
+        self.s_axi_awready = Signal()  # output
+        self.s_axi_wdata = Signal(AXI_LITE_DATA_WIDTH)  # input
+        self.s_axi_wstrb = Signal(FIXME)  # input; FIXME is a placeholder - SV width is AXI_LITE_DATA_WIDTH/8
+        self.s_axi_wvalid = Signal()  # input
+        self.s_axi_wready = Signal()  # output
+        self.s_axi_araddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
+        self.s_axi_arvalid = Signal()  # input
+        self.s_axi_arready = Signal()  # output
+        self.s_axi_rready = Signal()  # input
+        self.s_axi_rdata = Signal(AXI_LITE_DATA_WIDTH)  # output
+        self.s_axi_rresp = Signal(2)  # output
+        self.s_axi_rvalid = Signal()  # output
+        self.s_axi_bresp = Signal(2)  # output
+        self.s_axi_bvalid = Signal()  # output
+        self.s_axi_bready = Signal()  # input
+        self.int_miss = Signal(N_PORTS)  # output
+        self.int_prot = Signal(N_PORTS)  # output
+        self.int_multi = Signal(N_PORTS)  # output
+        self.int_prefetch = Signal(N_PORTS)  # output
+        self.int_mhf_full = Signal()  # output
+        self.int_axaddr_o = Signal()  # output; SV: [N_PORTS-1:0][AXI_S_ADDR_WIDTH-1:0] - width TODO
+        self.int_axid_o = Signal()  # output; SV: [N_PORTS-1:0][AXI_ID_WIDTH-1:0] - width TODO
+        self.int_axlen_o = Signal()  # output; SV: [N_PORTS-1:0][7:0] - width TODO
+        self.int_axuser_o = Signal()  # output; SV: [N_PORTS-1:0][AXI_USER_WIDTH-1:0] - width TODO
+        self.port1_addr = Signal()  # input; SV: [N_PORTS-1:0][AXI_S_ADDR_WIDTH-1:0] - width TODO
+        self.port1_id = Signal()  # input; SV: [N_PORTS-1:0][AXI_ID_WIDTH-1:0] - width TODO
+        self.port1_len = Signal()  # input; SV: [N_PORTS-1:0][7:0] - width TODO
+        self.port1_size = Signal()  # input; SV: [N_PORTS-1:0][2:0] - width TODO
+        self.port1_addr_valid = Signal(N_PORTS)  # input
+        self.port1_type = Signal(N_PORTS)  # input
+        self.port1_user = Signal()  # input; SV: [N_PORTS-1:0][AXI_USER_WIDTH-1:0] - width TODO
+        self.port1_sent = Signal(N_PORTS)  # input
+        self.port1_out_addr = Signal()  # output; SV: [N_PORTS-1:0][AXI_M_ADDR_WIDTH-1:0] - width TODO
+        self.port1_cache_coherent = Signal(N_PORTS)  # output
+        self.port1_accept = Signal(N_PORTS)  # output
+        self.port1_drop = Signal(N_PORTS)  # output
+        self.port1_miss = Signal(N_PORTS)  # output
+        self.port2_addr = Signal()  # input; SV: [N_PORTS-1:0][AXI_S_ADDR_WIDTH-1:0] - width TODO
+        self.port2_id = Signal()  # input; SV: [N_PORTS-1:0][AXI_ID_WIDTH-1:0] - width TODO
+        self.port2_len = Signal()  # input; SV: [N_PORTS-1:0][7:0] - width TODO
+        self.port2_size = Signal()  # input; SV: [N_PORTS-1:0][2:0] - width TODO
+        self.port2_addr_valid = Signal(N_PORTS)  # input
+        self.port2_type = Signal(N_PORTS)  # input
+        self.port2_user = Signal()  # input; SV: [N_PORTS-1:0][AXI_USER_WIDTH-1:0] - width TODO
+        self.port2_sent = Signal(N_PORTS)  # input
+        self.port2_out_addr = Signal()  # output; SV: [N_PORTS-1:0][AXI_M_ADDR_WIDTH-1:0] - width TODO
+        self.port2_cache_coherent = Signal(N_PORTS)  # output
+        self.port2_accept = Signal(N_PORTS)  # output
+        self.port2_drop = Signal(N_PORTS)  # output
+        self.port2_miss = Signal(N_PORTS)  # output
+        self.miss_l2_i = Signal(N_PORTS)  # input
+        self.miss_l2_addr_i = Signal()  # input; SV: [N_PORTS-1:0][AXI_S_ADDR_WIDTH-1:0] - width TODO
+        self.miss_l2_id_i = Signal()  # input; SV: [N_PORTS-1:0][AXI_ID_WIDTH-1:0] - width TODO
+        self.miss_l2_user_i = Signal()  # input; SV: [N_PORTS-1:0][AXI_USER_WIDTH-1:0] - width TODO
+        self.wdata_l2_o = Signal()  # output; SV: [N_PORTS-1:0][AXI_LITE_DATA_WIDTH-1:0] - width TODO
+        self.waddr_l2_o = Signal()  # output; SV: [N_PORTS-1:0][AXI_LITE_ADDR_WIDTH-1:0] - width TODO
+        self.wren_l2_o = Signal(N_PORTS)  # output
+
+    def elaborate(self, platform=None):  # returns an empty Module: none of the SV logic is translated yet
+        m = Module()
+        return m
+
+
+""" 
+
+
+    // ███████╗██╗ ██████╗ ███╗   ██╗ █████╗ ██╗     ███████╗
+    // ██╔════╝██║██╔════╝ ████╗  ██║██╔══██╗██║     ██╔════╝
+    // ███████╗██║██║  ███╗██╔██╗ ██║███████║██║     ███████╗
+    // ╚════██║██║██║   ██║██║╚██╗██║██╔══██║██║     ╚════██║
+    // ███████║██║╚██████╔╝██║ ╚████║██║  ██║███████╗███████║
+    // ╚══════╝╚═╝ ╚═════╝ ╚═╝  ╚═══╝╚═╝  ╚═╝╚══════╝╚══════╝
+    // signals
+
+  localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
+
+  localparam integer N_SLICES[N_PORTS-1:0]     = `N_SLICES_ARRAY;
+  localparam         N_SLICES_TOT              = `MY_ARRAY_SUM(N_SLICES,N_PORTS);
+  localparam         N_SLICES_MAX              = `N_SLICES_MAX;
+
+  localparam N_REGS                            = 4*N_SLICES_TOT + 4;
+  localparam AXI_SIZE_WIDTH                    = log2(AXI_DATA_WIDTH/8);
+
+  localparam PORT_ID_WIDTH                     = (N_PORTS < 2) ? 1 : log2(N_PORTS);
+  localparam MISS_META_WIDTH                   = PORT_ID_WIDTH + AXI_USER_WIDTH + AXI_ID_WIDTH;
+
+  logic [N_PORTS-1:0]                      [15:0] p1_burst_size;
+  logic [N_PORTS-1:0]                      [15:0] p2_burst_size;
+
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p1_align_addr;
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p2_align_addr;
+
+  logic [N_PORTS-1:0]        [AXI_SIZE_WIDTH-1:0] p1_mask;
+  logic [N_PORTS-1:0]        [AXI_SIZE_WIDTH-1:0] p2_mask;
+
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p1_max_addr;
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p2_max_addr;
+
+  logic [N_PORTS-1:0]                             p1_prefetch;
+  logic [N_PORTS-1:0]                             p2_prefetch;
+
+  logic [N_PORTS-1:0]                             int_rw;
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] int_addr_min;
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] int_addr_max;
+  logic [N_PORTS-1:0]          [AXI_ID_WIDTH-1:0] int_id;
+  logic [N_PORTS-1:0]                       [7:0] int_len;
+  logic [N_PORTS-1:0]        [AXI_USER_WIDTH-1:0] int_user;
+
+  logic [N_PORTS-1:0]                             hit;
+  logic [N_PORTS-1:0]                             prot;
+  logic [N_PORTS-1:0]                             prefetch;
+
+  logic [N_PORTS-1:0]                             no_hit;
+  logic [N_PORTS-1:0]                             no_prot;
+
+  logic [N_PORTS-1:0]          [N_SLICES_MAX-1:0] hit_slices;
+  logic [N_PORTS-1:0]          [N_SLICES_MAX-1:0] prot_slices;
+
+  logic [N_PORTS-1:0]      [AXI_M_ADDR_WIDTH-1:0] out_addr;
+  logic [N_PORTS-1:0]      [AXI_M_ADDR_WIDTH-1:0] out_addr_reg;
+
+  logic [N_PORTS-1:0]                             cache_coherent;
+  logic [N_PORTS-1:0]                             cache_coherent_reg;
+
+  logic [N_PORTS-1:0]                             select;
+  reg   [N_PORTS-1:0]                             curr_priority;
+
+  reg   [N_PORTS-1:0]                             multi_hit;
+
+  logic [N_PORTS-1:0]                             miss_valid_mhf;
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] miss_addr_mhf;
+  logic [N_PORTS-1:0]       [MISS_META_WIDTH-1:0] miss_meta_mhf;
+
+  logic [N_REGS-1:0]                       [63:0] int_cfg_regs;
+  logic [N_PORTS-1:0] [4*N_SLICES_MAX-1:0] [63:0] int_cfg_regs_slices;
+
+  logic                                           L1AllowMultiHit_S;
+
+  genvar z;
+
+  // ==========================================================
+  //
+  //   ASSIGNMENTS
+  //
+  // ==========================================================
+  //
+  // assignments
+
+  always_comb
+    begin : PORT_SELECT
+      var integer idx;
+
+      for (idx=0; idx<N_PORTS; idx++) begin
+
+        // select = 1 -> port1 active
+        // select = 0 -> port2 active
+        select[idx] = (curr_priority[idx] & port1_addr_valid[idx]) | ~port2_addr_valid[idx];
+
+        p1_burst_size[idx] = (port1_len[idx] + 1) << port1_size[idx];
+        p2_burst_size[idx] = (port2_len[idx] + 1) << port2_size[idx];
+
+        // align min addr for max addr computation to allow for smart AXI bursts around the 4k boundary
+        if      (port1_size[idx] == 3'b001)
+          p1_mask[idx] = 3'b110;
+        else if (port1_size[idx] == 3'b010)
+          p1_mask[idx] = 3'b100;
+        else if (port1_size[idx] == 3'b011)
+          p1_mask[idx] = 3'b000;
+        else
+          p1_mask[idx] = 3'b111;
+
+        p1_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port1_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
+        p1_align_addr[idx][AXI_SIZE_WIDTH-1:0]                = port1_addr[idx][AXI_SIZE_WIDTH-1:0] & p1_mask[idx];
+
+        if      (port2_size[idx] == 3'b001)
+          p2_mask[idx] = 3'b110;
+        else if (port2_size[idx] == 3'b010)
+          p2_mask[idx] = 3'b100;
+        else if (port2_size[idx] == 3'b011)
+          p2_mask[idx] = 3'b000;
+        else
+          p2_mask[idx] = 3'b111;
+
+        if (port1_user[idx] == {AXI_USER_WIDTH{1'b1}})
+          p1_prefetch[idx] = 1'b1;
+        else
+          p1_prefetch[idx] = 1'b0;
+
+        if (port2_user[idx] == {AXI_USER_WIDTH{1'b1}})
+          p2_prefetch[idx] = 1'b1;
+        else
+          p2_prefetch[idx] = 1'b0;
+
+        p2_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port2_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
+        p2_align_addr[idx][AXI_SIZE_WIDTH-1:0]                = port2_addr[idx][AXI_SIZE_WIDTH-1:0] & p2_mask[idx];
+
+        p1_max_addr[idx]  = p1_align_addr[idx] + p1_burst_size[idx] - 1;
+        p2_max_addr[idx]  = p2_align_addr[idx] + p2_burst_size[idx] - 1;
+
+        int_addr_min[idx] = select[idx] ? port1_addr[idx]  : port2_addr[idx];
+        int_addr_max[idx] = select[idx] ? p1_max_addr[idx] : p2_max_addr[idx];
+        int_rw[idx]       = select[idx] ? port1_type[idx]  : port2_type[idx];
+        int_id[idx]       = select[idx] ? port1_id[idx]    : port2_id[idx];
+        int_len[idx]      = select[idx] ? port1_len[idx]   : port2_len[idx];
+        int_user[idx]     = select[idx] ? port1_user[idx]  : port2_user[idx];
+        prefetch[idx]     = select[idx] ? p1_prefetch[idx] : p2_prefetch[idx];
+
+        hit [idx]    = | hit_slices [idx];
+        prot[idx]    = | prot_slices[idx];
+
+        no_hit [idx] = ~hit [idx];
+        no_prot[idx] = ~prot[idx];
+
+        port1_out_addr[idx] = out_addr_reg[idx];
+        port2_out_addr[idx] = out_addr_reg[idx];
+
+        port1_cache_coherent[idx] = cache_coherent_reg[idx];
+        port2_cache_coherent[idx] = cache_coherent_reg[idx];
+      end
+    end
+
+  always_comb
+    begin
+      var integer idx_port, idx_slice;
+      var integer reg_num;
+      reg_num=0;
+      for ( idx_port = 0; idx_port < N_PORTS; idx_port++ ) begin
+        for ( idx_slice = 0; idx_slice < 4*N_SLICES[idx_port]; idx_slice++ ) begin
+          int_cfg_regs_slices[idx_port][idx_slice] = int_cfg_regs[4+reg_num];
+          reg_num++;
+        end
+        // int_cfg_regs_slices[idx_port][N_SLICES_MAX:N_SLICES[idx_port]] will be dangling
+        // Fix to zero. Synthesis will remove these signals.
+        // int_cfg_regs_slices[idx_port][4*N_SLICES_MAX-1:4*N_SLICES[idx_port]] = 0;
+      end
+  end
+
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin : PORT_PRIORITY
+      var integer idx;
+      if (Rst_RBI == 1'b0)
+        curr_priority = 'h0;
+      else begin
+        for (idx=0; idx<N_PORTS; idx++) begin
+          if (port1_accept[idx] || port1_drop[idx])
+            curr_priority[idx] = 1'b1;
+          else if (port2_accept[idx] || port2_drop[idx])
+            curr_priority[idx] = 1'b0;
+        end
+      end
+    end
+
+  // find port that misses
+  logic [PORT_ID_WIDTH-1:0] PortIdx_D; // index of the first missing port
+  var integer               idx_miss;
+  always_comb begin : MHF_PORT_SELECT
+    PortIdx_D = 'b0;
+    for (idx_miss = 0; idx_miss < N_PORTS; idx_miss++) begin
+      if (miss_valid_mhf[idx_miss] == 1'b1) begin
+        PortIdx_D = idx_miss;
+        break;
+      end
+    end
+  end // always_comb begin
+
+  // ==========================================================
+  //
+  //   AXI_RAB_CFG
+  //
+  // ==========================================================
+  //
+  axi_rab_cfg
+    #(
+      .N_PORTS         ( N_PORTS             ),
+      .N_REGS          ( N_REGS              ),
+      .N_L2_SETS       ( N_L2_SETS           ),
+      .N_L2_SET_ENTRIES( N_L2_SET_ENTRIES    ),
+      .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH    ),
+      .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH    ),
+      .N_FLAGS         ( 4                   ),
+      .AXI_DATA_WIDTH  ( AXI_LITE_DATA_WIDTH ),
+      .AXI_ADDR_WIDTH  ( AXI_LITE_ADDR_WIDTH ),
+      .MISS_META_WIDTH ( MISS_META_WIDTH     ),
+      .MH_FIFO_DEPTH   ( MH_FIFO_DEPTH       )
+    )
+    u_axi_rab_cfg
+    (
+      .Clk_CI             ( Clk_CI                    ),
+      .Rst_RBI            ( Rst_RBI                   ),
+      .s_axi_awaddr       ( s_axi_awaddr              ),
+      .s_axi_awvalid      ( s_axi_awvalid             ),
+      .s_axi_wdata        ( s_axi_wdata               ),
+      .s_axi_wstrb        ( s_axi_wstrb               ),
+      .s_axi_wvalid       ( s_axi_wvalid              ),
+      .s_axi_bready       ( s_axi_bready              ),
+      .s_axi_araddr       ( s_axi_araddr              ),
+      .s_axi_arvalid      ( s_axi_arvalid             ),
+      .s_axi_rready       ( s_axi_rready              ),
+      .s_axi_arready      ( s_axi_arready             ),
+      .s_axi_rdata        ( s_axi_rdata               ),
+      .s_axi_rresp        ( s_axi_rresp               ),
+      .s_axi_rvalid       ( s_axi_rvalid              ),
+      .s_axi_wready       ( s_axi_wready              ),
+      .s_axi_bresp        ( s_axi_bresp               ),
+      .s_axi_bvalid       ( s_axi_bvalid              ),
+      .s_axi_awready      ( s_axi_awready             ),
+      .L1Cfg_DO           ( int_cfg_regs              ),
+      .L1AllowMultiHit_SO ( L1AllowMultiHit_S         ),
+      .MissAddr_DI        ( miss_addr_mhf[PortIdx_D]  ),
+      .MissMeta_DI        ( miss_meta_mhf[PortIdx_D]  ),
+      .Miss_SI            ( miss_valid_mhf[PortIdx_D] ),
+      .MhFifoFull_SO      ( int_mhf_full              ),
+      .wdata_l2           ( wdata_l2_o                ),
+      .waddr_l2           ( waddr_l2_o                ),
+      .wren_l2            ( wren_l2_o                 )
+    );
+
+  generate for (z = 0; z < N_PORTS; z++) begin : MHF_TLB_SELECT
+    if (ENABLE_L2TLB[z] == 1) begin // L2 TLB is enabled
+      assign miss_valid_mhf[z] = miss_l2_i[z];
+      assign miss_addr_mhf[z]  = miss_l2_addr_i[z];
+      assign miss_meta_mhf[z]  = {miss_l2_user_i[z], PortIdx_D, miss_l2_id_i[z]};
+    end else begin// L2 TLB is disabled
+      assign miss_valid_mhf[z] = int_miss[z];
+      assign miss_addr_mhf[z]  = int_addr_min[z];
+      assign miss_meta_mhf[z]  = {int_user[z], PortIdx_D, int_id[z]};
+    end
+  end
+  endgenerate
+
+  // ==========================================================
+  //
+  //   SLICE_TOP
+  //
+  // ==========================================================
+  //
+  generate for (z = 0; z < N_PORTS; z++) begin : SLICE_TOP_GEN
+    slice_top
+      #(
+        .N_SLICES        ( N_SLICES[z]      ),
+        .N_REGS          ( 4*N_SLICES[z]    ),
+        .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH ),
+        .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH )
+      )
+      u_slice_top
+      (
+        .int_cfg_regs    ( int_cfg_regs_slices[z][4*N_SLICES[z]-1:0] ),
+        .int_rw          ( int_rw[z]                                 ),
+        .int_addr_min    ( int_addr_min[z]                           ),
+        .int_addr_max    ( int_addr_max[z]                           ),
+        .multi_hit_allow ( L1AllowMultiHit_S                         ),
+        .multi_hit       ( multi_hit[z]                              ),
+        .prot            ( prot_slices[z][N_SLICES[z]-1:0]           ),
+        .hit             ( hit_slices [z][N_SLICES[z]-1:0]           ),
+        .cache_coherent  ( cache_coherent[z]                         ),
+        .out_addr        ( out_addr[z]                               )
+      );
+    // hit_slices [N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
+    // prot_slices[N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
+    // Fix to zero. Synthesis will remove these signals.
+    if ( N_SLICES[z] < N_SLICES_MAX ) begin
+      assign hit_slices [z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
+      assign prot_slices[z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
+    end
+  end // for (z = 0; z < N_PORTS; z++)
+  endgenerate
+
+  // ==========================================================
+  //
+  //   FSM
+  //
+  // ==========================================================
+  //
+  //
+  generate for (z = 0; z < N_PORTS; z++) begin : FSM_GEN
+    fsm
+      #(
+        .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+        .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
+        .AXI_ID_WIDTH     ( AXI_ID_WIDTH     ),
+        .AXI_USER_WIDTH   ( AXI_USER_WIDTH   )
+      )
+      u_fsm
+      (
+        .Clk_CI             ( Clk_CI                ),
+        .Rst_RBI            ( Rst_RBI               ),
+        .port1_addr_valid_i ( port1_addr_valid[z]   ),
+        .port2_addr_valid_i ( port2_addr_valid[z]   ),
+        .port1_sent_i       ( port1_sent[z]         ),
+        .port2_sent_i       ( port2_sent[z]         ),
+        .select_i           ( select[z]             ),
+        .no_hit_i           ( no_hit[z]             ),
+        .multi_hit_i        ( multi_hit[z]          ),
+        .no_prot_i          ( no_prot[z]            ),
+        .prefetch_i         ( prefetch[z]           ),
+        .out_addr_i         ( out_addr[z]           ),
+        .cache_coherent_i   ( cache_coherent[z]     ),
+        .port1_accept_o     ( port1_accept[z]       ),
+        .port1_drop_o       ( port1_drop[z]         ),
+        .port1_miss_o       ( port1_miss[z]         ),
+        .port2_accept_o     ( port2_accept[z]       ),
+        .port2_drop_o       ( port2_drop[z]         ),
+        .port2_miss_o       ( port2_miss[z]         ),
+        .out_addr_o         ( out_addr_reg[z]       ),
+        .cache_coherent_o   ( cache_coherent_reg[z] ),
+        .miss_o             ( int_miss[z]           ),
+        .multi_o            ( int_multi[z]          ),
+        .prot_o             ( int_prot[z]           ),
+        .prefetch_o         ( int_prefetch[z]       ),
+        .in_addr_i          ( int_addr_min[z]       ),
+        .in_id_i            ( int_id[z]             ),
+        .in_len_i           ( int_len[z]            ),
+        .in_user_i          ( int_user[z]           ),
+        .in_addr_o          ( int_axaddr_o[z]       ),
+        .in_id_o            ( int_axid_o[z]         ),
+        .in_len_o           ( int_axlen_o[z]        ),
+        .in_user_o          ( int_axuser_o[z]       )
+      );
+  end
+  endgenerate
+  
+"""
diff --git a/src/soc/iommu/axi_rab/rab_slice.py b/src/soc/iommu/axi_rab/rab_slice.py
new file mode 100644
index 00000000..59f84e3e
--- /dev/null
+++ b/src/soc/iommu/axi_rab/rab_slice.py
@@ -0,0 +1,76 @@
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module rab_slice
+# #(
+#    parameter ADDR_WIDTH_PHYS = 40,
+#    parameter ADDR_WIDTH_VIRT = 32
+#    )
+#   (
+#    input  logic [ADDR_WIDTH_VIRT-1:0] cfg_min,
+#    input  logic [ADDR_WIDTH_VIRT-1:0] cfg_max,
+#    input  logic [ADDR_WIDTH_PHYS-1:0] cfg_offset,
+#    input  logic                       cfg_wen,
+#    input  logic                       cfg_ren,
+#    input  logic                       cfg_en,
+#    input  logic                       in_trans_type,
+#    input  logic [ADDR_WIDTH_VIRT-1:0] in_addr_min,
+#    input  logic [ADDR_WIDTH_VIRT-1:0] in_addr_max,
+#    output logic                       out_hit,
+#    output logic                       out_prot,
+#    output logic [ADDR_WIDTH_PHYS-1:0] out_addr
+#  );
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class rab_slice(Elaboratable):
+    # One address-range slice: combinational range check, permission check and address remap.
+    def __init__(self, params):  # pass config object
+        # TODO parameters: params must provide ADDR_WIDTH_VIRT and ADDR_WIDTH_PHYS (read below)
+        self.params = params
+        self.cfg_min = Signal(params.ADDR_WIDTH_VIRT)  # input: lower bound, compared with in_addr_min
+        self.cfg_max = Signal(params.ADDR_WIDTH_VIRT)  # input: upper bound, compared with both in_addr_*
+        self.cfg_offset = Signal(params.ADDR_WIDTH_PHYS)  # input: added when forming out_addr
+        self.cfg_wen = Signal()  # input: when low, a hit with in_trans_type=1 raises out_prot
+        self.cfg_ren = Signal()  # input: when low, a hit with in_trans_type=0 raises out_prot
+        self.cfg_en = Signal()  # input: slice enable; out_hit is low whenever this is low
+        self.in_trans_type = Signal()  # input: selects which of cfg_wen / cfg_ren is checked
+        self.in_addr_min = Signal(params.ADDR_WIDTH_VIRT)  # input: checked against cfg_min and cfg_max
+        self.in_addr_max = Signal(params.ADDR_WIDTH_VIRT)  # input: checked against cfg_max only
+        self.out_hit = Signal()  # output: cfg_en and all three range comparisons hold
+        self.out_prot = Signal()  # output: hit, but the selected cfg_wen / cfg_ren bit is low
+        self.out_addr = Signal(params.ADDR_WIDTH_PHYS)  # output: in_addr_min - cfg_min + cfg_offset
+
+    def elaborate(self, platform=None):
+        m = Module()
+        min_above_min = Signal()  # in_addr_min >= cfg_min
+        min_below_max = Signal()  # in_addr_min <= cfg_max
+        max_below_max = Signal()  # in_addr_max <= cfg_max
+        # Original SystemVerilog, kept for reference; the comb assignments below mirror it one-to-one.
+        #  assign min_above_min = (in_addr_min >= cfg_min) ? 1'b1 : 1'b0;
+        #  assign min_below_max = (in_addr_min <= cfg_max) ? 1'b1 : 1'b0;
+        #  assign max_below_max = (in_addr_max <= cfg_max) ? 1'b1 : 1'b0;
+        #  assign out_hit  = cfg_en & min_above_min & min_below_max & max_below_max;
+        #  assign out_prot = out_hit & ((in_trans_type & ~cfg_wen) | (~in_trans_type & ~cfg_ren));
+        #  assign out_addr = in_addr_min - cfg_min + cfg_offset;
+        m.d.comb += [
+            min_above_min.eq(self.in_addr_min >= self.cfg_min),
+            min_below_max.eq(self.in_addr_min <= self.cfg_max),
+            max_below_max.eq(self.in_addr_max <= self.cfg_max),
+            self.out_hit.eq(self.cfg_en & min_above_min &
+                            min_below_max & max_below_max),
+            self.out_prot.eq(self.out_hit & (
+                (self.in_trans_type & ~self.cfg_wen) | (~self.in_trans_type & ~self.cfg_ren))),
+            self.out_addr.eq(self.in_addr_min - self.cfg_min + self.cfg_offset)
+        ]
+
+        return m
diff --git a/src/soc/iommu/axi_rab/ram_tp_no_change.py b/src/soc/iommu/axi_rab/ram_tp_no_change.py
new file mode 100644
index 00000000..bdcd5550
--- /dev/null
+++ b/src/soc/iommu/axi_rab/ram_tp_no_change.py
@@ -0,0 +1,97 @@
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# /*
+# * ram_tp_no_change
+# *
+# * This code implements a parameterizable two-port memory. Port 0 can read and
+# * write while Port 1 can read only. The Xilinx tools will infer a BRAM with
+# * Port 0 in "no change" mode, i.e., during a write, it retains the last read
+# * value on the output. Port 1 (read-only) is in "write first" mode. Still, it
+# * outputs the old data during the write cycle. Note: Port 1 outputs invalid
+# * data in the cycle after the write when reading the same address.
+# *
+# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
+# */
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+from nmigen import Memory
+
+import math
+
+#
+# module ram_tp_no_change
+#  #(
+ADDR_WIDTH = 10
+DATA_WIDTH = 36
+#  )
+#  (
+#    input                   clk,
+#    input                   we,
+#    input  [ADDR_WIDTH-1:0] addr0,
+#    input  [ADDR_WIDTH-1:0] addr1,
+#    input  [DATA_WIDTH-1:0] d_i,
+#    output [DATA_WIDTH-1:0] d0_o,
+#    output [DATA_WIDTH-1:0] d1_o
+#  );
+
+
+class ram_tp_no_change(Elaboratable):
+    # Two-port RAM: port 0 reads and writes at addr0, port 1 only reads at addr1.
+    def __init__(self):
+        self.we = Signal()               # input: write enable for port 0
+        self.addr0 = Signal(ADDR_WIDTH)  # input: port 0 address (read and write)
+        self.addr1 = Signal(ADDR_WIDTH)  # input: port 1 address (read only)
+        self.d_i = Signal(DATA_WIDTH)    # input: data stored at addr0 when we is high
+        self.d0_o = Signal(DATA_WIDTH)   # output: registered port 0 read data (held while we is high)
+        self.d1_o = Signal(DATA_WIDTH)   # output: registered port 1 read data
+
+        DEPTH = 2**ADDR_WIDTH  # exact integer power, same as the localparam below
+        self.ram = Memory(width=DATA_WIDTH, depth=DEPTH)  # keywords: Memory takes width/depth by name
+    #
+    #  localparam DEPTH = 2**ADDR_WIDTH;
+    #
+    #  (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
+    #                            reg [DATA_WIDTH-1:0] d0;
+    #                            reg [DATA_WIDTH-1:0] d1;
+    #
+    #  always_ff @(posedge clk) begin
+    #    if(we == 1'b1) begin
+    #      ram[addr0] <= d_i;
+    #    end else begin
+    # only change data if we==false
+    #      d0 <= ram[addr0];
+    #    end
+    #    d1   <= ram[addr1];
+    #  end
+    #
+    #  assign d0_o = d0;
+    #  assign d1_o = d1;
+    #
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.submodules.read_ram0 = read_ram0 = self.ram.read_port(domain="comb")
+        m.submodules.read_ram1 = read_ram1 = self.ram.read_port(domain="comb")
+        m.submodules.write_ram = write_ram = self.ram.write_port()
+        # Read ports are asynchronous so d0_o/d1_o are the only read registers (d0/d1 in the SV above).
+        # write port
+        m.d.comb += write_ram.en.eq(self.we)
+        m.d.comb += write_ram.addr.eq(self.addr0)
+        m.d.comb += write_ram.data.eq(self.d_i)
+
+        # read ports
+        m.d.comb += read_ram0.addr.eq(self.addr0)
+        m.d.comb += read_ram1.addr.eq(self.addr1)
+        with m.If(self.we == 0):  # "no change": d0_o keeps its last value during a write
+            m.d.sync += self.d0_o.eq(read_ram0.data)
+        m.d.sync += self.d1_o.eq(read_ram1.data)  # sampled before the write lands: old data
+
+        return m
diff --git a/src/soc/iommu/axi_rab/ram_tp_write_first.py b/src/soc/iommu/axi_rab/ram_tp_write_first.py
new file mode 100644
index 00000000..7a21969c
--- /dev/null
+++ b/src/soc/iommu/axi_rab/ram_tp_write_first.py
@@ -0,0 +1,93 @@
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# /*
+# * ram_tp_write_first
+# *
+# * This code implements a parameterizable two-port memory. Port 0 can read and
+# * write while Port 1 can read only. Xilinx Vivado will infer a BRAM in
+# * "write first" mode, i.e., upon a read and write to the same address, the
+# * new value is read. Note: Port 1 outputs invalid data in the cycle after
+# * the write when reading the same address.
+# *
+# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
+# */
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+from nmigen import Memory
+
+import math
+#
+# module ram_tp_write_first
+#  #(
+ADDR_WIDTH = 10
+DATA_WIDTH = 36
+#  )
+#  (
+#    input                   clk,
+#    input                   we,
+#    input  [ADDR_WIDTH-1:0] addr0,
+#    input  [ADDR_WIDTH-1:0] addr1,
+#    input  [DATA_WIDTH-1:0] d_i,
+#    output [DATA_WIDTH-1:0] d0_o,
+#    output [DATA_WIDTH-1:0] d1_o
+#  );
+
+
+class ram_tp_write_first(Elaboratable):
+    # Two-port RAM: port 0 reads and writes at addr0, port 1 only reads at addr1.
+    def __init__(self):
+        self.we = Signal()               # input: write enable for port 0
+        self.addr0 = Signal(ADDR_WIDTH)  # input: port 0 address (read and write)
+        self.addr1 = Signal(ADDR_WIDTH)  # input: port 1 address (read only)
+        self.d_i = Signal(DATA_WIDTH)    # input: data stored at addr0 when we is high
+        self.d0_o = Signal(DATA_WIDTH)   # output: RAM word at the addr0 latched on the last clock edge
+        self.d1_o = Signal(DATA_WIDTH)   # output: RAM word at the addr1 latched on the last clock edge
+
+        DEPTH = 2**ADDR_WIDTH  # exact integer power, same as the localparam below
+        self.ram = Memory(width=DATA_WIDTH, depth=DEPTH)  # keywords: Memory takes width/depth by name
+
+    #
+    #  localparam DEPTH = 2**ADDR_WIDTH;
+    #
+    #  (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
+    #                            reg [ADDR_WIDTH-1:0] raddr0;
+    #                            reg [ADDR_WIDTH-1:0] raddr1;
+    #
+    #  always_ff @(posedge clk) begin
+    #    if(we == 1'b1) begin
+    #      ram[addr0] <= d_i;
+    #    end
+    #    raddr0 <= addr0;
+    #    raddr1 <= addr1;
+    #  end
+    #
+    #  assign d0_o = ram[raddr0];
+    #  assign d1_o = ram[raddr1];
+    #
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.submodules.read_ram0 = read_ram0 = self.ram.read_port()
+        m.submodules.read_ram1 = read_ram1 = self.ram.read_port()
+        m.submodules.write_ram = write_ram = self.ram.write_port()
+        # read_port() defaults to synchronous + transparent: the port itself latches the address (raddr0/raddr1).
+        # write port
+        m.d.comb += write_ram.en.eq(self.we)
+        m.d.comb += write_ram.addr.eq(self.addr0)
+        m.d.comb += write_ram.data.eq(self.d_i)
+
+        # read ports: .data is forwarded combinationally; a second register would add a cycle vs. the SV
+        m.d.comb += read_ram0.addr.eq(self.addr0)
+        m.d.comb += read_ram1.addr.eq(self.addr1)
+        m.d.comb += self.d0_o.eq(read_ram0.data)
+        m.d.comb += self.d1_o.eq(read_ram1.data)
+
+        return m
diff --git a/src/soc/iommu/axi_rab/slice_top.py b/src/soc/iommu/axi_rab/slice_top.py
new file mode 100644
index 00000000..6eedb1cd
--- /dev/null
+++ b/src/soc/iommu/axi_rab/slice_top.py
@@ -0,0 +1,141 @@
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+import rab_slice
+import coreconfig
+
+#
+# module slice_top
+# //#(
+#  //  parameter N_SLICES        = 16,
+#  //  parameter N_REGS          = 4*N_SLICES,
+#   // parameter ADDR_WIDTH_PHYS = 40,
+#   // parameter ADDR_WIDTH_VIRT = 32
+#  //  )
+#   (
+#    input   logic   [N_REGS-1:0] [63:0] int_cfg_regs,
+#    input   logic                       int_rw,
+#    input   logic [ADDR_WIDTH_VIRT-1:0] int_addr_min,
+#    input   logic [ADDR_WIDTH_VIRT-1:0] int_addr_max,
+#    input   logic                       multi_hit_allow,
+#    output  logic                       multi_hit,
+#    output  logic        [N_SLICES-1:0] prot,
+#    output  logic        [N_SLICES-1:0] hit,
+#    output  logic                       cache_coherent,
+#    output  logic [ADDR_WIDTH_PHYS-1:0] out_addr
+#  );
+#
+
+
+class slice_top(Elaboratable):
+    """Instantiates N_SLICES rab_slice submodules (ports not yet connected)."""
+    def __init__(self):
+        # FIXME self.int_cfg_regs = Signal()  # input
+        self.params = coreconfig.CoreConfig() # rename ?
+        self.int_rw = Signal()  # input
+        self.int_addr_min = Signal(self.params.ADDR_WIDTH_VIRT)  # input
+        self.int_addr_max = Signal(self.params.ADDR_WIDTH_VIRT)  # input
+        self.multi_hit_allow = Signal()  # input
+        self.multi_hit = Signal()  # output
+        self.prot = Signal(self.params.N_SLICES)  # output
+        self.hit = Signal(self.params.N_SLICES)  # output
+        self.cache_coherent = Signal()  # output
+        self.out_addr = Signal(self.params.ADDR_WIDTH_PHYS)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+
+        first_hit = Signal()
+
+        for i in range(self.params.N_SLICES):
+            # TODO pass params / core config here
+            u_slice = rab_slice.rab_slice(self.params)
+            setattr(m.submodules, "u_slice%d" % i, u_slice)
+            # TODO set param and connect ports
+
+        # In case of a multi hit, the lowest slice with a hit is selected.
+        # TODO always_comb begin : HIT_CHECK (defaults first, then per-slice)
+        m.d.comb += [
+            first_hit.eq(0),
+            self.multi_hit.eq(0),
+            self.out_addr.eq(0),
+            self.cache_coherent.eq(0)]
+
+        for j in range(self.params.N_SLICES):
+            with m.If(self.hit[j] == 1):
+                with m.If(first_hit == 1):
+                    with m.If(self.multi_hit_allow == 0):
+                        m.d.comb += [self.multi_hit.eq(1)]
+                with m.Elif(first_hit == 1):  # NOTE(review): same test as the If; SV has "else"
+                    m.d.comb += [first_hit.eq(1)
+                                 # only output first slice that was hit
+                                 # SV self.out_addr.eq(slice_out_addr[ADDR_WIDTH_PHYS*j + : ADDR_WIDTH_PHYS]),
+                                 # SV self.cache_coherent.eq(int_cfg_regs[4*j+3][3]),
+                                 ]
+        return m
+
+  # TODO translate generate statement
+
+
+"""
+  logic [ADDR_WIDTH_PHYS*N_SLICES-1:0]  slice_out_addr;
+
+  generate
+    for ( i=0; i<N_SLICES; i++ )
+      begin
+        rab_slice
+          #(
+            .ADDR_WIDTH_PHYS ( ADDR_WIDTH_PHYS ),
+            .ADDR_WIDTH_VIRT ( ADDR_WIDTH_VIRT )
+            )
+          u_slice
+          (
+            .cfg_min       ( int_cfg_regs[4*i]  [ADDR_WIDTH_VIRT-1:0]                              ),
+            .cfg_max       ( int_cfg_regs[4*i+1][ADDR_WIDTH_VIRT-1:0]                              ),
+            .cfg_offset    ( int_cfg_regs[4*i+2][ADDR_WIDTH_PHYS-1:0]                              ),
+            .cfg_wen       ( int_cfg_regs[4*i+3][2]                                                ),
+            .cfg_ren       ( int_cfg_regs[4*i+3][1]                                                ),
+            .cfg_en        ( int_cfg_regs[4*i+3][0]                                                ),
+            .in_trans_type ( int_rw                                                                ),
+            .in_addr_min   ( int_addr_min                                                          ),
+            .in_addr_max   ( int_addr_max                                                          ),
+            .out_addr      ( slice_out_addr[ADDR_WIDTH_PHYS*i+ADDR_WIDTH_PHYS-1:ADDR_WIDTH_PHYS*i] ),
+            .out_prot      ( prot[i]                                                               ),
+            .out_hit       ( hit[i]                                                                )
+          );
+     end
+  endgenerate
+
+  // In case of a multi hit, the lowest slice with a hit is selected.
+  always_comb begin : HIT_CHECK
+    first_hit      =  0;
+    multi_hit      =  0;
+    out_addr       = '0;
+    cache_coherent =  0;
+    for (j = 0; j < N_SLICES; j++) begin
+      if (hit[j] == 1'b1) begin
+        if (first_hit == 1'b1) begin
+          if (multi_hit_allow == 1'b0) begin
+            multi_hit = 1'b1;
+          end
+        end else begin
+          first_hit       = 1'b1;
+          out_addr        = slice_out_addr[ADDR_WIDTH_PHYS*j +: ADDR_WIDTH_PHYS];
+          cache_coherent  = int_cfg_regs[4*j+3][3];
+        end
+      end
+    end
+  end
+"""
+
+# sv 2 migen: TODO add translate code for generate statements and for loops inside always_comb
diff --git a/src/soc/iommu/axi_rab/test/test_ram_tp_no_change.py b/src/soc/iommu/axi_rab/test/test_ram_tp_no_change.py
new file mode 100644
index 00000000..8d23ef05
--- /dev/null
+++ b/src/soc/iommu/axi_rab/test/test_ram_tp_no_change.py
@@ -0,0 +1,18 @@
+import sys
+sys.path.append("../")  # must run before importing ram_tp_write_first
+from nmigen.compat.sim import run_simulation
+from ram_tp_write_first import ram_tp_write_first
+
+
+def tbench(dut):
+    yield dut.we.eq(1)
+    for i in range(0, 255):
+        yield dut.addr0.eq(i)
+        yield dut.d_i.eq(i)
+        yield
+
+
+if __name__ == "__main__":
+    dut = ram_tp_write_first()
+    run_simulation(dut, tbench(dut), vcd_name="ram_tp_write_first.vcd")
+    print("ram_tp_write_first Unit Test Success")
diff --git a/src/soc/iommu/axi_rab/test/test_slice_top.py b/src/soc/iommu/axi_rab/test/test_slice_top.py
new file mode 100644
index 00000000..c234b908
--- /dev/null
+++ b/src/soc/iommu/axi_rab/test/test_slice_top.py
@@ -0,0 +1,14 @@
+from nmigen.compat.sim import run_simulation
+import sys
+sys.path.append("../")
+# sys.path.append("../../../TestUtil")
+from slice_top import slice_top
+
+def tbench(dut):
+    yield
+
+
+if __name__ == "__main__":
+    dut = slice_top()
+    run_simulation(dut, tbench(dut), vcd_name="test_slice_top.vcd")
+    print("slice_top Unit Test Success")
diff --git a/src/soc/regfile/regfile.py b/src/soc/regfile/regfile.py
new file mode 100644
index 00000000..b1d6f1c6
--- /dev/null
+++ b/src/soc/regfile/regfile.py
@@ -0,0 +1,290 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+
+from nmigen import Cat, Const, Array, Signal, Elaboratable, Module
+from nmutil.iocontrol import RecordObject
+
+from math import log
+from functools import reduce
+import operator
+
+
+class Register(Elaboratable):
+    """A single register with any number of read and write ports."""
+    def __init__(self, width, writethru=True):
+        self.width = width
+        self.writethru = writethru
+        self._rdports = []
+        self._wrports = []
+
+    def read_port(self, name=None):
+        port = RecordObject([("ren", 1),
+                             ("data_o", self.width)],
+                            name=name)
+        self._rdports.append(port)
+        return port
+
+    def write_port(self, name=None):
+        port = RecordObject([("wen", 1),
+                             ("data_i", self.width)],
+                            name=name)
+        self._wrports.append(port)
+        return port
+
+    def elaborate(self, platform):
+        m = Module()
+        self.reg = reg = Signal(self.width, name="reg")
+
+        # read ports. has write-through detection (returns data written)
+        for rp in self._rdports:
+            with m.If(rp.ren):
+                if self.writethru:
+                    wr_detect = Signal(reset_less=False)
+                    m.d.comb += wr_detect.eq(0)
+                    for wp in self._wrports:
+                        with m.If(wp.wen):
+                            m.d.comb += rp.data_o.eq(wp.data_i)
+                            m.d.comb += wr_detect.eq(1)
+                    with m.If(~wr_detect):
+                        m.d.comb += rp.data_o.eq(reg)
+                else:
+                    m.d.comb += rp.data_o.eq(reg)
+
+        # write ports: a single register has no address, so any wen stores
+        for wp in self._wrports:
+            with m.If(wp.wen):
+                m.d.sync += reg.eq(wp.data_i)
+
+        return m
+
+    def __iter__(self):
+        """Yield the signals of every read port, then of every write port."""
+        for p in self._rdports + self._wrports:
+            yield from p
+
+    def ports(self):
+        return list(self)  # the list used to be built but never returned
+
+def treereduce(tree, attr="data_o"):
+    """OR together `attr` of every item of list `tree`, as a balanced tree.
+    A `tree` that is not a list is handed back unchanged.
+    """
+    if not isinstance(tree, list):
+        return tree
+    if len(tree) == 1:
+        return getattr(tree[0], attr)
+    mid = len(tree) // 2
+    return treereduce(tree[:mid], attr) | treereduce(tree[mid:], attr)
+
+
+class RegFileArray(Elaboratable):
+    """ an array-based register file (register having write-through capability)
+        that has no "address" decoder, instead it has individual write-en
+        and read-en signals (per port).
+    """
+    def __init__(self, width, depth):
+        self.width = width
+        self.depth = depth
+        self.regs = Array(Register(width) for _ in range(self.depth))
+        self._rdports = []
+        self._wrports = []
+
+    def read_port(self, name=None):
+        """Create a read port: one `ren` bit per register, shared `data_o`.
+
+        `name` is given to the per-register ports and to the returned record.
+        """
+        regs = Array(reg.read_port(name) for reg in self.regs)
+        port = RecordObject([("ren", self.depth),
+                             ("data_o", self.width)], name)
+        self._rdports.append((regs, port))
+        return port
+
+    def write_port(self, name=None):
+        """Create a write port: one `wen` bit per register, shared `data_i`.
+
+        `name` is given to the per-register ports and to the returned record.
+        """
+        regs = Array(reg.write_port(name) for reg in self.regs)
+        port = RecordObject([("wen", self.depth),
+                             ("data_i", self.width)], name)
+        self._wrports.append((regs, port))
+        return port
+
+    def _get_en_sig(self, port, typ):
+        """Concatenate field `typ` of every record in `port`;
+        the first record lands in the least significant bit.
+        """
+        return Cat(*[p[typ] for p in port])
+
+    def elaborate(self, platform):
+        m = Module()
+        for i, reg in enumerate(self.regs):
+            setattr(m.submodules, "reg_%d" % i, reg)
+
+        for (regs, p) in self._rdports:
+            # fan the unary enables out, OR the per-register data back in
+            m.d.comb += self._get_en_sig(regs, 'ren').eq(p.ren)
+            ror = treereduce(list(regs))
+            m.d.comb += p.data_o.eq(ror)
+        for (regs, p) in self._wrports:
+            m.d.comb += self._get_en_sig(regs, 'wen').eq(p.wen)
+            for r in regs:
+                m.d.comb += r.data_i.eq(p.data_i)
+
+        return m
+
+    def __iter__(self):
+        for r in self.regs:
+            yield from r
+
+    def ports(self):
+        return list(self)
+
+
+class RegFile(Elaboratable):
+    """Addressed register file: `depth` registers, each `width` bits wide."""
+    def __init__(self, width, depth):
+        self.width = width
+        self.depth = depth
+        self._rdports = []
+        self._wrports = []
+
+    def read_port(self):
+        bsz = max(1, (self.depth - 1).bit_length())  # address bits for depth
+        port = RecordObject([("raddr", bsz),
+                             ("ren", 1),
+                             ("data_o", self.width)])
+        self._rdports.append(port)
+        return port
+
+    def write_port(self):
+        bsz = max(1, (self.depth - 1).bit_length())  # address bits for depth
+        port = RecordObject([("waddr", bsz),
+                             ("wen", 1),
+                             ("data_i", self.width)])
+        self._wrports.append(port)
+        return port
+
+    def elaborate(self, platform):
+        m = Module()
+        bsz = max(1, (self.depth - 1).bit_length())  # same width as the ports
+        regs = Array(Signal(self.width, name="reg") for _ in range(self.depth))
+
+        # read ports. has write-through detection (returns data written)
+        for rp in self._rdports:
+            wr_detect = Signal(reset_less=False)
+            with m.If(rp.ren):
+                m.d.comb += wr_detect.eq(0)
+                for wp in self._wrports:
+                    addrmatch = Signal(reset_less=False)
+                    m.d.comb += addrmatch.eq(wp.waddr == rp.raddr)
+                    with m.If(wp.wen & addrmatch):
+                        m.d.comb += rp.data_o.eq(wp.data_i)
+                        m.d.comb += wr_detect.eq(1)
+                with m.If(~wr_detect):
+                    m.d.comb += rp.data_o.eq(regs[rp.raddr])
+
+        # write ports, don't allow write to address 0 (ignore it)
+        for wp in self._wrports:
+            with m.If(wp.wen & (wp.waddr != Const(0, bsz))):
+                m.d.sync += regs[wp.waddr].eq(wp.data_i)
+
+        return m
+
+    def __iter__(self):
+        yield from self._rdports
+        yield from self._wrports
+
+    def ports(self):
+        for r in self:
+            if isinstance(r, RecordObject):
+                yield from r
+            else:
+                yield r
+
+def regfile_sim(dut, rp, wp):
+    yield wp.waddr.eq(1)  # write the value 2 to register 1
+    yield wp.data_i.eq(2)
+    yield wp.wen.eq(1)
+    yield
+    yield wp.wen.eq(0)
+    yield rp.ren.eq(1)  # then read register 1 back
+    yield rp.raddr.eq(1)
+    yield
+    data = yield rp.data_o
+    print (data)
+    assert data == 2
+
+    yield wp.waddr.eq(5)  # write 6 to register 5 while reading register 5
+    yield rp.raddr.eq(5)
+    yield rp.ren.eq(1)
+    yield wp.wen.eq(1)
+    yield wp.data_i.eq(6)
+    data = yield rp.data_o
+    print (data)
+    yield
+    yield wp.wen.eq(0)
+    yield rp.ren.eq(0)
+    data = yield rp.data_o
+    print (data)
+    assert data == 6
+    yield
+    data = yield rp.data_o
+    print (data)
+
+def regfile_array_sim(dut, rp1, rp2, wp):
+    yield wp.data_i.eq(2)
+    yield wp.wen.eq(1<<1)  # enables are unary: bit 1 selects register 1
+    yield
+    yield wp.wen.eq(0)
+    yield rp1.ren.eq(1<<1)  # read register 1 back on the first read port
+    yield
+    data = yield rp1.data_o
+    print (data)
+    assert data == 2
+
+    yield rp1.ren.eq(1<<5)  # read register 5 while it is being written
+    yield rp2.ren.eq(1<<1)  # second port reads register 1 at the same time
+    yield wp.wen.eq(1<<5)
+    yield wp.data_i.eq(6)
+    data = yield rp1.data_o
+    print (data)
+    yield
+    yield wp.wen.eq(0)
+    yield rp1.ren.eq(0)
+    yield rp2.ren.eq(0)
+    data1 = yield rp1.data_o
+    print (data1)
+    data2 = yield rp2.data_o
+    print (data2)
+    assert data1 == 6
+    yield
+    data = yield rp1.data_o
+    print (data)
+
+def test_regfile():
+    dut = RegFile(32, 8)  # width=32, depth=8
+    rp = dut.read_port()
+    wp = dut.write_port()
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_regfile.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, regfile_sim(dut, rp, wp), vcd_name='test_regfile.vcd')
+
+    dut = RegFileArray(32, 8)  # same dimensions, unary-enable variant
+    rp1 = dut.read_port("read1")
+    rp2 = dut.read_port("read2")
+    wp = dut.write_port("write")
+    ports=dut.ports()
+    print ("ports", ports)
+    vl = rtlil.convert(dut, ports=ports)
+    with open("test_regfile_array.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, regfile_array_sim(dut, rp1, rp2, wp),
+                   vcd_name='test_regfile_array.vcd')
+
+if __name__ == '__main__':
+    test_regfile()
diff --git a/src/soc/scoreboard/__init__.py b/src/soc/scoreboard/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/soc/scoreboard/addr_match.py b/src/soc/scoreboard/addr_match.py
new file mode 100644
index 00000000..e42bbe52
--- /dev/null
+++ b/src/soc/scoreboard/addr_match.py
@@ -0,0 +1,130 @@
+""" Load / Store partial address matcher
+
+Loads and Stores do not need a full match (CAM), they need "good enough"
+avoidance.  Around 11 bits on a 64-bit address is "good enough".
+
+The simplest way to use this module is to ignore not only the top bits,
+but also the bottom bits as well: in this case (this RV64 processor),
+enough to cover a DWORD (64-bit).  that means ignore the bottom 4 bits,
+due to the possibility of 64-bit LD/ST being misaligned.
+
+To reiterate: the use of this module is an *optimisation*.  All it has
+to do is cover the cases that are *definitely* matches (by checking 11
+bits or so), and if a few opportunities for parallel LD/STs are missed
+because the top (or bottom) bits weren't checked, so what: all that
+happens is: the mis-matched addresses are LD/STd on single-cycles. Big Deal.
+
+However, if we wanted to enhance this algorithm (without using a CAM and
+without using expensive comparators) probably the best way to do so would
+be to turn the last 16 bits into a byte-level bitmap.  LD/ST on a byte
+would have 1 of the 16 bits set.  LD/ST on a DWORD would have 8 of the 16
+bits set (offset if the LD/ST was misaligned).  TODO.
+
+Notes:
+
+> I have used bits <11:6> as they are not translated (4KB pages)
+> and larger than a cache line (64 bytes).
+> I have used bits <11:4> when the L1 cache was QuadW sized and
+> the L2 cache was Line sized.
+"""
+
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Const, Array, Cat, Elaboratable
+
+from nmutil.latch import latchregister, SRLatch
+
+
+class PartialAddrMatch(Elaboratable):
+    """A partial address matcher
+    """
+    def __init__(self, n_adr, bitwid):
+        self.n_adr = n_adr
+        self.bitwid = bitwid
+        # inputs
+        self.addrs_i = Array(Signal(bitwid, name="addr") for i in range(n_adr))
+        self.addr_we_i = Signal(n_adr) # write-enable (unused in _elaborate)
+        self.addr_en_i = Signal(n_adr) # address latched in
+        self.addr_rs_i = Signal(n_adr) # address deactivated
+
+        # output
+        self.addr_nomatch_o = Signal(n_adr, name="nomatch_o")
+        self.addr_nomatch_a_o = Array(Signal(n_adr, name="nomatch_array_o") \
+                                  for i in range(n_adr))
+
+    def elaborate(self, platform):
+        m = Module()
+        return self._elaborate(m, platform)
+
+    def _elaborate(self, m, platform):
+        comb = m.d.comb
+
+        m.submodules.l = l = SRLatch(llen=self.n_adr, sync=False)
+        addrs_r = Array(Signal(self.bitwid, name="a_r") \
+                                for i in range(self.n_adr))
+
+        # latch set/reset
+        comb += l.s.eq(self.addr_en_i)
+        comb += l.r.eq(self.addr_rs_i)
+
+        # copy in addresses (and "enable" signals)
+        for i in range(self.n_adr):
+            latchregister(m, self.addrs_i[i], addrs_r[i], l.q[i])
+
+        # is there a clash, yes/no
+        matchgrp = []
+        for i in range(self.n_adr):
+            match = []
+            for j in range(self.n_adr):
+                if i == j:
+                    match.append(Const(0)) # don't match against self!
+                else:
+                    match.append(addrs_r[i] == addrs_r[j])
+            comb += self.addr_nomatch_a_o[i].eq(~Cat(*match) & l.q)
+            matchgrp.append(self.addr_nomatch_a_o[i] == l.q)
+        comb += self.addr_nomatch_o.eq(Cat(*matchgrp) & l.q)
+
+        return m
+
+    def __iter__(self):
+        yield from self.addrs_i
+        yield self.addr_we_i
+        yield self.addr_en_i
+        yield self.addr_rs_i  # input read by _elaborate, so it must be a port
+        yield from self.addr_nomatch_a_o
+        yield self.addr_nomatch_o
+
+    def ports(self):
+        return list(self)
+
+
+def part_addr_sim(dut):
+    yield dut.addrs_i[0].eq(1)   # latch address 1 into slot 0
+    yield dut.addr_en_i.eq(1)
+    yield
+    yield dut.addr_en_i.eq(0)
+    yield
+    yield dut.addrs_i[1].eq(1)   # latch the same address into slot 1
+    yield dut.addr_en_i.eq(2)
+    yield
+    yield dut.addr_en_i.eq(0)
+    yield
+    yield dut.addr_rs_i.eq(1)    # release slot 0
+    yield
+    yield dut.addr_rs_i.eq(0)
+    yield
+    yield dut.addr_rs_i.eq(2)    # release slot 1
+    yield
+    yield dut.addr_rs_i.eq(0)
+    yield
+
+def test_part_addr():
+    dut = PartialAddrMatch(3, 10)  # n_adr=3 addresses, bitwid=10 bits each
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_part_addr.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, part_addr_sim(dut), vcd_name='test_part_addr.vcd')
+
+if __name__ == '__main__':
+    test_part_addr()
diff --git a/src/soc/scoreboard/dependence_cell.py b/src/soc/scoreboard/dependence_cell.py
new file mode 100644
index 00000000..16108229
--- /dev/null
+++ b/src/soc/scoreboard/dependence_cell.py
@@ -0,0 +1,169 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
+from nmutil.latch import SRLatch
+from functools import reduce
+from operator import or_
+
+
+class DependencyRow(Elaboratable):
+    """ implements 11.4.7 mitch alsup dependence cell, p27
+        adjusted to be clock-sync'd on rising edge only.
+        mitch design (as does 6600) requires alternating rising/falling clock
+
+        * SET mode: issue_i HI, go_i LO, reg_i HI - register is captured
+                                                  - FWD is DISABLED (~issue_i)
+                                                  - RSEL DISABLED
+        * QRY mode: issue_i LO, go_i LO, haz_i HI - FWD is ASSERTED
+                                         reg_i HI - ignored
+        * GO mode : issue_i LO, go_i HI           - RSEL is ASSERTED
+                                         haz_i HI - FWD still can be ASSERTED
+
+        FWD assertion (hazard protection) therefore still occurs in both
+        Query and Go Modes, for this cycle, due to the cq register
+
+        GO mode works for one cycle, again due to the cq register capturing
+        the latch output.  Without the cq register, the SR Latch (which is
+        asynchronous) would be reset at the exact moment that GO was requested,
+        and the RSEL would be garbage.
+    """
+    def __init__(self, n_reg, n_src, cancel_mode=False):
+        self.cancel_mode = cancel_mode
+        self.n_reg = n_reg
+        self.n_src = n_src
+        # arrays
+        src = []
+        rsel = []
+        fwd = []
+        for i in range(n_src):
+            j = i + 1 # name numbering to match src1/src2
+            src.append(Signal(n_reg, name="src%d" % j, reset_less=True))
+            rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True))
+            fwd.append(Signal(n_reg, name="src%d_fwd_o" % j, reset_less=True))
+
+        # inputs
+        self.dest_i = Signal(n_reg, reset_less=True)     # Dest in (top)
+        self.src_i = Array(src)     # operands in (top)
+        self.issue_i = Signal(reset_less=True)    # Issue in (top)
+
+        self.rd_pend_i = Signal(n_reg, reset_less=True) # Read pend in (top)
+        self.wr_pend_i = Signal(n_reg, reset_less=True) # Write pend in (top)
+        self.v_rd_rsel_o = Signal(n_reg, reset_less=True) # Read pend out (bot)
+        self.v_wr_rsel_o = Signal(n_reg, reset_less=True) # Write pend out (bot)
+
+        self.go_wr_i = Signal(reset_less=True) # Go Write in (left)
+        self.go_rd_i = Signal(reset_less=True)  # Go Read in (left)
+        if self.cancel_mode:
+            self.go_die_i = Signal(n_reg, reset_less=True) # Go Die in (left)
+        else:
+            self.go_die_i = Signal(reset_less=True) # Go Die in (left)
+
+        # for Register File Select Lines (vertical)
+        self.dest_rsel_o = Signal(n_reg, reset_less=True)  # dest reg sel (bot)
+        self.src_rsel_o = Array(rsel)   # src reg sel (bot)
+        self.src2_rsel_o = Signal(n_reg, reset_less=True)  # NOTE(review): never driven in elaborate()
+
+        # for Function Unit "forward progress" (horizontal)
+        self.dest_fwd_o = Signal(n_reg, reset_less=True)   # dest FU fw (right)
+        self.src_fwd_o = Array(fwd)    # src FU fw (right)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.dest_c = dest_c = SRLatch(sync=False, llen=self.n_reg)
+        src_c = []
+        for i in range(self.n_src):
+            src_l = SRLatch(sync=False, llen=self.n_reg)
+            setattr(m.submodules, "src%d_c" % (i+1), src_l)
+            src_c.append(src_l)
+
+        # connect go_rd / go_wr (dest->wr, src->rd)
+        wr_die = Signal(self.n_reg, reset_less=True)
+        rd_die = Signal(self.n_reg, reset_less=True)
+        if self.cancel_mode:
+            go_die = self.go_die_i
+        else:
+            go_die = Repl(self.go_die_i, self.n_reg)
+        m.d.comb += wr_die.eq(Repl(self.go_wr_i, self.n_reg) | go_die)
+        m.d.comb += rd_die.eq(Repl(self.go_rd_i, self.n_reg) | go_die)
+        m.d.comb += dest_c.r.eq(wr_die)
+        for i in range(self.n_src):
+            m.d.comb += src_c[i].r.eq(rd_die)
+
+        # connect input reg bit (unary)
+        i_ext = Repl(self.issue_i, self.n_reg)
+        m.d.comb += dest_c.s.eq(i_ext & self.dest_i)
+        for i in range(self.n_src):
+            m.d.comb += src_c[i].s.eq(i_ext & self.src_i[i])
+
+        # connect up hazard checks: read-after-write and write-after-read
+        m.d.comb += self.dest_fwd_o.eq(dest_c.q & self.rd_pend_i)
+        for i in range(self.n_src):
+            m.d.comb += self.src_fwd_o[i].eq(src_c[i].q & self.wr_pend_i)
+
+        # connect reg-sel outputs
+        rd_ext = Repl(self.go_rd_i, self.n_reg)
+        wr_ext = Repl(self.go_wr_i, self.n_reg)
+        m.d.comb += self.dest_rsel_o.eq(dest_c.qlq & wr_ext)
+        for i in range(self.n_src):
+            m.d.comb += self.src_rsel_o[i].eq(src_c[i].qlq & rd_ext)
+
+        # to be accumulated to indicate if register is in use (globally)
+        # after ORing, is fed back in to rd_pend_i / wr_pend_i
+        src_q = []
+        for i in range(self.n_src):
+            src_q.append(src_c[i].qlq)
+        m.d.comb += self.v_rd_rsel_o.eq(reduce(or_, src_q))
+        m.d.comb += self.v_wr_rsel_o.eq(dest_c.qlq)
+
+        return m
+
+    def __iter__(self):  # NOTE(review): v_rd_rsel_o / v_wr_rsel_o are not yielded
+        yield self.dest_i
+        yield from self.src_i
+        yield self.rd_pend_i
+        yield self.wr_pend_i
+        yield self.issue_i
+        yield self.go_wr_i
+        yield self.go_rd_i
+        yield self.go_die_i
+        yield self.dest_rsel_o
+        yield from self.src_rsel_o
+        yield self.dest_fwd_o
+        yield from self.src_fwd_o
+
+    def ports(self):
+        return list(self)
+
+
+def dcell_sim(dut):
+    yield dut.dest_i.eq(1)       # issue with register 0 as destination
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.src_i[0].eq(1)     # first source operand (signal "src1")
+    yield dut.issue_i.eq(1)
+    yield
+    yield
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1)      # pulse go-read for one cycle
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1)      # pulse go-write for one cycle
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+def test_dcell():
+    dut = DependencyRow(4, 2, True)  # n_reg=4, n_src=2, cancel_mode=True
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_drow.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, dcell_sim(dut), vcd_name='test_dcell.vcd')
+
+if __name__ == '__main__':
+    test_dcell()
diff --git a/src/soc/scoreboard/fn_unit.py b/src/soc/scoreboard/fn_unit.py
new file mode 100644
index 00000000..63beb70b
--- /dev/null
+++ b/src/soc/scoreboard/fn_unit.py
@@ -0,0 +1,321 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Cat, Array, Const, Elaboratable
+from nmigen.lib.coding import Decoder
+
+from nmutil.latch import SRLatch, latchregister
+
+from scoreboard.shadow import Shadow
+
+
+class FnUnit(Elaboratable):
+    """ implements 11.4.8 function unit, p31
+        also implements optional shadowing 11.5.1, p55
+
+        shadowing can be used for branches as well as exceptions (interrupts),
+        load/store hold (exceptions again), and vector-element predication
+        (once the predicate is known, which it may not be at instruction issue)
+
+        Inputs
+
+        * :wid:         register file width
+        * :shadow_wid:  number of shadow/fail/good/go_die sets
+        * :n_dests:     number of destination regfile(s) (index: rfile_sel_i)
+        * :wr_pend:     if true, writable observes the g_wr_pend_i vector
+                        otherwise observes g_rd_pend_i
+
+        notes:
+
+        * dest_i / src1_i / src2_i are in *binary*, whereas...
+        * ...g_rd_pend_i / g_wr_pend_i and rd_pend_o / wr_pend_o are UNARY
+        * req_rel_i (request release) is the direct equivalent of pipeline
+                    "output valid" (valid_o)
+        * recover is a local python variable (actually go_die_o)
+        * when shadow_wid = 0, recover and shadown are Consts (i.e. do nothing)
+        * wr_pend is set False for the majority of uses: however for
+          use in a STORE Function Unit it is set to True
+    """
+    def __init__(self, wid, shadow_wid=0, n_dests=1, wr_pend=False):
+        self.reg_width = wid
+        self.n_dests = n_dests
+        self.shadow_wid = shadow_wid
+        self.wr_pend = wr_pend # stored only: elaborate() does not read it
+
+        # inputs
+        if n_dests > 1:
+            self.rfile_sel_i = Signal(max=n_dests, reset_less=True)
+        else:
+            self.rfile_sel_i = Const(0) # no selection.  gets Array[0]
+        self.dest_i = Signal(max=wid, reset_less=True) # Dest R# in (top)
+        self.src1_i = Signal(max=wid, reset_less=True) # oper1 R# in (top)
+        self.src2_i = Signal(max=wid, reset_less=True) # oper2 R# in (top)
+        self.issue_i = Signal(reset_less=True)    # Issue in (top)
+
+        self.go_wr_i = Signal(reset_less=True) # Go Write in (left)
+        self.go_rd_i = Signal(reset_less=True)  # Go Read in (left)
+        self.req_rel_i = Signal(reset_less=True)  # request release (left)
+
+        self.g_xx_pend_i = Array(Signal(wid, reset_less=True, name="g_pend_i") \
+                               for i in range(n_dests)) # global pend, per dest
+        self.g_wr_pend_i = Signal(wid, reset_less=True) # global wr (right)
+
+        if shadow_wid:
+            self.shadow_i = Signal(shadow_wid, reset_less=True)
+            self.s_fail_i  = Signal(shadow_wid, reset_less=True)
+            self.s_good_i  = Signal(shadow_wid, reset_less=True)
+            self.go_die_o  = Signal(reset_less=True)
+
+        # outputs
+        self.readable_o = Signal(reset_less=True) # Readable out (right)
+        self.writable_o = Array(Signal(reset_less=True, name="writable_o") \
+                               for i in range(n_dests)) # writable out (right)
+        self.busy_o = Signal(reset_less=True) # busy out (left)
+
+        self.src1_pend_o = Signal(wid, reset_less=True) # src1 pending
+        self.src2_pend_o = Signal(wid, reset_less=True) # src2 pending
+        self.rd_pend_o = Signal(wid, reset_less=True) # rd pending (right)
+        self.xx_pend_o = Array(Signal(wid, reset_less=True, name="pend_o") \
+                               for i in range(n_dests))# wr pending (right)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.rd_l = rd_l = SRLatch(sync=False)
+        m.submodules.wr_l = wr_l = SRLatch(sync=False)
+        m.submodules.dest_d = dest_d = Decoder(self.reg_width)
+        m.submodules.src1_d = src1_d = Decoder(self.reg_width)
+        m.submodules.src2_d = src2_d = Decoder(self.reg_width)
+
+        # shadow / recover (optional: shadow_wid > 0)
+        m.submodules.shadow = shadow = Shadow(self.shadow_wid)
+        if self.shadow_wid:
+            m.d.comb += shadow.issue_i.eq(self.issue_i)
+            m.d.comb += shadow.s_fail_i.eq(self.s_fail_i)
+            m.d.comb += shadow.s_good_i.eq(self.s_good_i)
+            m.d.comb += shadow.shadow_i.eq(self.shadow_i)
+        shadown = shadow.shadown_o
+        recover = shadow.go_die_o
+
+        # selector
+        xx_pend_o = self.xx_pend_o[self.rfile_sel_i]
+        writable_o = self.writable_o[self.rfile_sel_i]
+        g_pend_i = self.g_xx_pend_i[self.rfile_sel_i]
+
+        for i in range(self.n_dests):
+            m.d.comb += self.xx_pend_o[i].eq(0)  # initialise all array
+            m.d.comb += self.writable_o[i].eq(0) # to zero
+        m.d.comb += self.readable_o.eq(0) # to zero
+
+        # go_wr latch: reset on go_wr HI, set on issue
+        m.d.comb += wr_l.s.eq(self.issue_i)
+        m.d.comb += wr_l.r.eq(self.go_wr_i | recover)
+
+        # src1 latch: reset on go_rd HI, set on issue
+        m.d.comb += rd_l.s.eq(self.issue_i)
+        m.d.comb += rd_l.r.eq(self.go_rd_i | recover)
+
+        # latch/registers for dest / src1 / src2
+        dest_r = Signal(max=self.reg_width, reset_less=True)
+        src1_r = Signal(max=self.reg_width, reset_less=True)
+        src2_r = Signal(max=self.reg_width, reset_less=True)
+        # XXX latch based on *issue* rather than !latch (as in book)
+        latchregister(m, self.dest_i, dest_r, self.issue_i) #wr_l.qn)
+        latchregister(m, self.src1_i, src1_r, self.issue_i) #wr_l.qn)
+        latchregister(m, self.src2_i, src2_r, self.issue_i) #wr_l.qn)
+
+        # dest decoder (use dest reg as input): write-pending out
+        m.d.comb += dest_d.i.eq(dest_r)
+        m.d.comb += dest_d.n.eq(wr_l.qn) # decode is inverted
+        m.d.comb += self.busy_o.eq(wr_l.q) # busy if set
+        m.d.comb += xx_pend_o.eq(dest_d.o)
+
+        # src1/src2 decoder (use src1/2 regs as input): read-pending out
+        m.d.comb += src1_d.i.eq(src1_r)
+        m.d.comb += src1_d.n.eq(rd_l.qn) # decode is inverted
+        m.d.comb += src2_d.i.eq(src2_r)
+        m.d.comb += src2_d.n.eq(rd_l.qn) # decode is inverted
+        m.d.comb += self.src1_pend_o.eq(src1_d.o)
+        m.d.comb += self.src2_pend_o.eq(src2_d.o)
+        m.d.comb += self.rd_pend_o.eq(src1_d.o | src2_d.o)
+
+        # readable output signal
+        g_rd = Signal(self.reg_width, reset_less=True)
+        ro = Signal(reset_less=True)
+        m.d.comb += g_rd.eq(~self.g_wr_pend_i & self.rd_pend_o)
+        m.d.comb += ro.eq(~g_rd.bool())
+        m.d.comb += self.readable_o.eq(ro)
+
+        # writable output signal
+        g_wr_v = Signal(self.reg_width, reset_less=True)
+        g_wr = Signal(reset_less=True)
+        wo = Signal(reset_less=True)
+        m.d.comb += g_wr_v.eq(g_pend_i & xx_pend_o)
+        m.d.comb += g_wr.eq(~g_wr_v.bool())
+        m.d.comb += wo.eq(g_wr & rd_l.qn & self.req_rel_i & shadown)
+        m.d.comb += writable_o.eq(wo)
+
+        return m
+
+    def __iter__(self):
+        yield self.dest_i
+        yield self.src1_i
+        yield self.src2_i
+        yield self.issue_i
+        yield self.go_wr_i
+        yield self.go_rd_i
+        yield self.req_rel_i
+        yield from self.g_xx_pend_i
+        yield self.g_wr_pend_i
+        yield self.readable_o
+        yield from self.writable_o
+        yield self.rd_pend_o
+        yield from self.xx_pend_o
+
+    def ports(self):
+        return list(self)
+
+#############                                     ###############
+# ---                                                       --- #
+# --- renamed / redirected from base class                  --- #
+# ---                                                       --- #
+# --- below are convenience classes which match the names   --- #
+# --- of the various mitch alsup book chapter gate diagrams --- #
+# ---                                                       --- #
+#############                                     ###############
+
+
+class IntFnUnit(FnUnit):
+    def __init__(self, wid, shadow_wid=0):
+        FnUnit.__init__(self, wid, shadow_wid)
+        self.int_rd_pend_o = self.rd_pend_o
+        self.int_wr_pend_o = self.xx_pend_o[0]
+        self.g_int_wr_pend_i = self.g_wr_pend_i
+        self.g_int_rd_pend_i = self.g_xx_pend_i[0]
+        self.int_readable_o = self.readable_o
+        self.int_writable_o = self.writable_o[0]
+
+        self.int_rd_pend_o.name = "int_rd_pend_o"
+        self.int_wr_pend_o.name = "int_wr_pend_o"
+        self.g_int_rd_pend_i.name = "g_int_rd_pend_i"
+        self.g_int_wr_pend_i.name = "g_int_wr_pend_i"
+        self.int_readable_o.name = "int_readable_o"
+        self.int_writable_o.name = "int_writable_o"
+
+
+class FPFnUnit(FnUnit):
+    def __init__(self, wid, shadow_wid=0):
+        FnUnit.__init__(self, wid, shadow_wid)
+        self.fp_rd_pend_o = self.rd_pend_o
+        self.fp_wr_pend_o = self.xx_pend_o[0]
+        self.g_fp_wr_pend_i = self.g_wr_pend_i
+        self.g_fp_rd_pend_i = self.g_xx_pend_i[0]
+        self.fp_writable_o = self.writable_o[0]
+        self.fp_readable_o = self.readable_o
+
+        self.fp_rd_pend_o.name = "fp_rd_pend_o"
+        self.fp_wr_pend_o.name = "fp_wr_pend_o"
+        self.g_fp_rd_pend_i.name = "g_fp_rd_pend_i"
+        self.g_fp_wr_pend_i.name = "g_fp_wr_pend_i"
+        self.fp_writable_o.name = "fp_writable_o"
+        self.fp_readable_o.name = "fp_readable_o"
+
+
+class LDFnUnit(FnUnit):
+    """ number of dest selectors: 2. assumes len(int_regfile) == len(fp_regfile)
+        * when rfile_sel_i == 0, int_wr_pend_o is set
+        * when rfile_sel_i == 1, fp_wr_pend_o is set
+    """
+    def __init__(self, wid, shadow_wid=0):
+        FnUnit.__init__(self, wid, shadow_wid, n_dests=2)
+        self.int_rd_pend_o = self.rd_pend_o
+        self.int_wr_pend_o = self.xx_pend_o[0]
+        self.fp_wr_pend_o = self.xx_pend_o[1]
+        self.g_int_wr_pend_i = self.g_wr_pend_i
+        self.g_int_rd_pend_i = self.g_xx_pend_i[0]
+        self.g_fp_rd_pend_i = self.g_xx_pend_i[1]
+        self.int_readable_o = self.readable_o
+        self.int_writable_o = self.writable_o[0]
+        self.fp_writable_o = self.writable_o[1]
+
+        self.int_rd_pend_o.name = "int_rd_pend_o"
+        self.int_wr_pend_o.name = "int_wr_pend_o"
+        self.fp_wr_pend_o.name = "fp_wr_pend_o"
+        self.g_int_wr_pend_i.name = "g_int_wr_pend_i"
+        self.g_int_rd_pend_i.name = "g_int_rd_pend_i"
+        self.g_fp_rd_pend_i.name = "g_fp_rd_pend_i"
+        self.int_readable_o.name = "int_readable_o"
+        self.int_writable_o.name = "int_writable_o"
+        self.fp_writable_o.name = "fp_writable_o"
+
+
+class STFnUnit(FnUnit):
+    """ number of dest selectors: 2. assumes len(int_regfile) == len(fp_regfile)
+        * wr_pend=True is passed to the FnUnit base class
+        * when rfile_sel_i == 0, int2_rd_pend_o (xx_pend_o[0]) is set
+        * when rfile_sel_i == 1, fp_rd_pend_o (xx_pend_o[1]) is set
+        * g_wr_pend_i is rebound to g_xx_pend_i[0] (base-class Signal dropped)
+    """
+    def __init__(self, wid, shadow_wid=0):
+        FnUnit.__init__(self, wid, shadow_wid, n_dests=2, wr_pend=True)
+        self.int_rd_pend_o = self.rd_pend_o     # 1st int read-pending vector
+        self.int2_rd_pend_o = self.xx_pend_o[0] # 2nd int read-pending vector
+        self.fp_rd_pend_o = self.xx_pend_o[1]   # 1x FP read-pending vector
+        # yes overwrite FnUnit base class g_wr_pend_i vector
+        self.g_int_wr_pend_i = self.g_wr_pend_i = self.g_xx_pend_i[0]
+        self.g_fp_wr_pend_i = self.g_xx_pend_i[1]
+        self.int_readable_o = self.readable_o
+        self.int_writable_o = self.writable_o[0]
+        self.fp_writable_o = self.writable_o[1]
+
+        self.int_rd_pend_o.name = "int_rd_pend_o"
+        self.int2_rd_pend_o.name = "int2_rd_pend_o"
+        self.fp_rd_pend_o.name = "fp_rd_pend_o"
+        self.g_int_wr_pend_i.name = "g_int_wr_pend_i"
+        self.g_fp_wr_pend_i.name = "g_fp_wr_pend_i"
+        self.int_readable_o.name = "int_readable_o"
+        self.int_writable_o.name = "int_writable_o"
+        self.fp_writable_o.name = "fp_writable_o"
+
+
+
+def int_fn_unit_sim(dut):
+    yield dut.dest_i.eq(1)
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.src1_i.eq(1)
+    yield dut.issue_i.eq(1)
+    yield
+    yield
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1)
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1)
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+def test_int_fn_unit():
+    dut = FnUnit(32, 2, 2)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_fn_unit.il", "w") as f:
+        f.write(vl)
+
+    dut = LDFnUnit(32, 2)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_ld_fn_unit.il", "w") as f:
+        f.write(vl)
+
+    dut = STFnUnit(32, 0)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_st_fn_unit.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, int_fn_unit_sim(dut), vcd_name='test_fn_unit.vcd')
+
+if __name__ == '__main__':
+    test_int_fn_unit()
diff --git a/src/soc/scoreboard/fu_dep_cell.py b/src/soc/scoreboard/fu_dep_cell.py
new file mode 100644
index 00000000..9946dcb5
--- /dev/null
+++ b/src/soc/scoreboard/fu_dep_cell.py
@@ -0,0 +1,92 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Const, Elaboratable
+from nmutil.latch import SRLatch
+
+
+class FUDependenceCell(Elaboratable):
+    """ implements 11.4.7 mitch alsup dependence cell, p27
+    """
+    def __init__(self, dummy, n_fu=1):
+        self.n_fu = n_fu
+        self.dummy = Const(~(1<<dummy), n_fu)
+        # inputs
+        self.rd_pend_i = Signal(n_fu, reset_less=True) # read pend in (left)
+        self.wr_pend_i = Signal(n_fu, reset_less=True) # write pend in (left)
+        self.issue_i = Signal(n_fu, reset_less=True)    # Issue in (top)
+
+        self.go_wr_i = Signal(n_fu, reset_less=True) # Go Write in (left)
+        self.go_rd_i = Signal(n_fu, reset_less=True)  # Go Read in (left)
+        self.go_die_i = Signal(n_fu, reset_less=True) # Go Die in (left)
+
+        # outputs (latched rd/wr wait)
+        self.rd_wait_o = Signal(n_fu, reset_less=True) # read wait out (right)
+        self.wr_wait_o = Signal(n_fu, reset_less=True) # write wait out (right)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.rd_c = rd_c = SRLatch(sync=False, llen=self.n_fu)
+        m.submodules.wr_c = wr_c = SRLatch(sync=False, llen=self.n_fu)
+
+        # set on issue+pend. NOTE(review): .s is assigned again below (masked)
+        m.d.comb += rd_c.s.eq(self.issue_i & self.rd_pend_i)
+        m.d.comb += wr_c.s.eq(self.issue_i & self.wr_pend_i)
+
+        # connect go_rd / go_wr: go_die_i is ORed into both latch resets
+        m.d.comb += wr_c.r.eq(self.go_wr_i | self.go_die_i)
+        m.d.comb += rd_c.r.eq(self.go_rd_i | self.go_die_i)
+
+        # connect pend_i, masked by self.dummy; presumably supersedes above
+        m.d.comb += rd_c.s.eq(self.issue_i & self.rd_pend_i & self.dummy)
+        m.d.comb += wr_c.s.eq(self.issue_i & self.wr_pend_i & self.dummy)
+
+        # connect output: wait bits are gated off on bits where issue_i is set
+        m.d.comb += self.rd_wait_o.eq(rd_c.qlq & ~self.issue_i)
+        m.d.comb += self.wr_wait_o.eq(wr_c.qlq & ~self.issue_i)
+
+        return m
+
+    def __iter__(self):
+        yield self.rd_pend_i
+        yield self.wr_pend_i
+        yield self.issue_i
+        yield self.go_wr_i
+        yield self.go_rd_i
+        yield self.go_die_i
+        yield self.rd_wait_o
+        yield self.wr_wait_o
+                
+    def ports(self):
+        return list(self)
+
+
+def dcell_sim(dut):  # stimulus only (no checks) for FUDependenceCell
+    yield dut.wr_pend_i.eq(1)  # the cell has wr_pend_i, there is no dest_i
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.rd_pend_i.eq(1)  # the cell has rd_pend_i, there is no src1_i
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1)  # pulse go_rd: resets the read latch
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1)  # pulse go_wr: resets the write latch
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+def test_dcell():
+    dut = FUDependenceCell(dummy=0, n_fu=4)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_fu_dcell.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, dcell_sim(dut), vcd_name='test_fu_dcell.vcd')
+
+if __name__ == '__main__':
+    test_dcell()
diff --git a/src/soc/scoreboard/fu_fu_matrix.py b/src/soc/scoreboard/fu_fu_matrix.py
new file mode 100644
index 00000000..cc2c1b96
--- /dev/null
+++ b/src/soc/scoreboard/fu_fu_matrix.py
@@ -0,0 +1,155 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
+
+from .fu_dep_cell import FUDependenceCell
+from .fu_picker_vec import FU_Pick_Vec
+
+"""
+
+ 6600 Function Unit Dependency Table Matrix inputs / outputs
+ -----------------------------------------------------------
+
+"""
+
+class FUFUDepMatrix(Elaboratable):
+    """ implements 11.4.7 mitch alsup FU-to-FU Dependency Matrix, p26
+    """
+    def __init__(self, n_fu_row, n_fu_col):
+        self.n_fu_row = n_fu_row                  # Y (FU row#)   ^v
+        self.n_fu_col = n_fu_col                # X (FU col #)  <>
+        self.rd_pend_i = Signal(n_fu_row, reset_less=True) # Rd pending (left)
+        self.wr_pend_i = Signal(n_fu_row, reset_less=True) # Wr pending (left)
+        self.issue_i = Signal(n_fu_col, reset_less=True)    # Issue in (top)
+
+        self.go_wr_i = Signal(n_fu_row, reset_less=True) # Go Write in (left)
+        self.go_rd_i = Signal(n_fu_row, reset_less=True)  # Go Read in (left)
+        self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left)
+
+        # for Function Unit Readable/Writable (horizontal)
+        self.readable_o = Signal(n_fu_col, reset_less=True) # readable (bot)
+        self.writable_o = Signal(n_fu_col, reset_less=True) # writable (bot)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # ---
+        # matrix of dependency cells
+        # ---
+        dm = Array(FUDependenceCell(f, self.n_fu_col) \
+                                            for f in range(self.n_fu_row))
+        for y in range(self.n_fu_row):
+                setattr(m.submodules, "dm%d" % y, dm[y])
+
+        # ---
+        # array of Function Unit Readable/Writable: row-length, horizontal
+        # ---
+        fur = Array(FU_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
+        for x in range(self.n_fu_col):
+            setattr(m.submodules, "fur_x%d" % (x), fur[x])
+
+        # ---
+        # connect FU Readable/Writable vector
+        # ---
+        readable = []
+        writable = []
+        for y in range(self.n_fu_row):
+            fu = fur[y]
+            # accumulate Readable/Writable Vector outputs
+            readable.append(fu.readable_o)
+            writable.append(fu.writable_o)
+
+        # ... and output them from this module (horizontal, width=REGs)
+        m.d.comb += self.readable_o.eq(Cat(*readable))
+        m.d.comb += self.writable_o.eq(Cat(*writable))
+
+        # ---
+        # connect FU Pending
+        # ---
+        for y in range(self.n_fu_row):
+            dc = dm[y]
+            fu = fur[y]
+            # connect cell reg-select outputs to Reg Vector In
+            m.d.comb += [fu.rd_pend_i.eq(dc.rd_wait_o),
+                         fu.wr_pend_i.eq(dc.wr_wait_o),
+                        ]
+
+        # ---
+        # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i
+        # ---
+        for x in range(self.n_fu_col):
+            issue_i = []
+            for y in range(self.n_fu_row):
+                dc = dm[y]
+                # accumulate cell inputs issue
+                issue_i.append(dc.issue_i[x])
+            # wire up inputs from module to row cell inputs
+            m.d.comb += Cat(*issue_i).eq(self.issue_i)
+
+        # ---
+        # connect Matrix go_rd_i/go_wr_i to module readable/writable
+        # ---
+        for y in range(self.n_fu_row):
+            dc = dm[y]
+            # wire up inputs from module to row cell inputs
+            m.d.comb += [dc.go_rd_i.eq(self.go_rd_i),
+                         dc.go_wr_i.eq(self.go_wr_i),
+                         dc.go_die_i.eq(self.go_die_i),
+                        ]
+
+        # ---
+        # connect Matrix pending
+        # ---
+        for y in range(self.n_fu_row):
+            dc = dm[y]
+            # wire up inputs from module to row cell inputs
+            m.d.comb += [dc.rd_pend_i.eq(self.rd_pend_i),
+                         dc.wr_pend_i.eq(self.wr_pend_i),
+                        ]
+
+        return m
+
+    def __iter__(self):
+        yield self.rd_pend_i
+        yield self.wr_pend_i
+        yield self.issue_i
+        yield self.go_wr_i
+        yield self.go_rd_i
+        yield self.readable_o
+        yield self.writable_o
+                
+    def ports(self):
+        return list(self)
+
+def d_matrix_sim(dut):
+    """ XXX TODO: stimulus only, no checks.  sets wr_pend_i then rd_pend_i
+        (each with an issue_i pulse), then pulses go_rd_i and go_wr_i """
+    yield dut.wr_pend_i.eq(1)  # FUFUDepMatrix has wr_pend_i, no dest_i
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.rd_pend_i.eq(1)  # FUFUDepMatrix has rd_pend_i, no src1_i
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1)
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1)
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+def test_fu_fu_matrix():
+    dut = FUFUDepMatrix(n_fu_row=3, n_fu_col=4)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_fu_fu_matrix.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_fu_matrix.vcd')
+
+if __name__ == '__main__':
+    test_fu_fu_matrix()
diff --git a/src/soc/scoreboard/fu_mem_matrix.py b/src/soc/scoreboard/fu_mem_matrix.py
new file mode 100644
index 00000000..baaa02be
--- /dev/null
+++ b/src/soc/scoreboard/fu_mem_matrix.py
@@ -0,0 +1,155 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
+
+from scoreboard.fumem_dep_cell import FUMemDependenceCell
+from scoreboard.fu_mem_picker_vec import FUMem_Pick_Vec
+
+"""
+
+ 6600 Function Unit Dependency Table Matrix inputs / outputs
+ -----------------------------------------------------------
+
+"""
+
+class FUMemDepMatrix(Elaboratable):
+    """ implements FU-to-FU Memory Dependency Matrix
+    """
+    def __init__(self, n_fu_row, n_fu_col):
+        self.n_fu_row = n_fu_row               # Y (FU row#)   ^v
+        self.n_fu_col = n_fu_col                # X (FU col #)  <>
+        self.st_pend_i = Signal(n_fu_row, reset_less=True) # St pending (left)
+        self.ld_pend_i = Signal(n_fu_row, reset_less=True) # Ld pending (left)
+        self.issue_i = Signal(n_fu_col, reset_less=True)    # Issue in (top)
+
+        self.go_ld_i = Signal(n_fu_row, reset_less=True) # Go Load in (left)
+        self.go_st_i = Signal(n_fu_row, reset_less=True)  # Go Store in (left)
+        self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left)
+
+        # for Function Unit Storable/Loadable (horizontal)
+        self.storable_o = Signal(n_fu_col, reset_less=True) # storable (bot)
+        self.loadable_o = Signal(n_fu_col, reset_less=True) # loadable (bot)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # ---
+        # matrix of dependency cells
+        # ---
+        dm = Array(FUMemDependenceCell(f, self.n_fu_col) \
+                                            for f in range(self.n_fu_row))
+        for y in range(self.n_fu_row):
+                setattr(m.submodules, "dm%d" % y, dm[y])
+
+        # ---
+        # array of Function Unit Readable/Writable: row-length, horizontal
+        # ---
+        fur = Array(FUMem_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
+        for x in range(self.n_fu_col):
+            setattr(m.submodules, "fur_x%d" % (x), fur[x])
+
+        # ---
+        # connect FU Readable/Writable vector
+        # ---
+        storable = []
+        loadable = []
+        for y in range(self.n_fu_row):
+            fu = fur[y]
+            # accumulate Readable/Writable Vector outputs
+            storable.append(fu.storable_o)
+            loadable.append(fu.loadable_o)
+
+        # ... and output them from this module (horizontal, width=REGs)
+        m.d.comb += self.storable_o.eq(Cat(*storable))
+        m.d.comb += self.loadable_o.eq(Cat(*loadable))
+
+        # ---
+        # connect FU Pending
+        # ---
+        for y in range(self.n_fu_row):
+            dc = dm[y]
+            fu = fur[y]
+            # connect cell reg-select outputs to Reg Vector In
+            m.d.comb += [fu.st_pend_i.eq(dc.st_wait_o),
+                         fu.ld_pend_i.eq(dc.ld_wait_o),
+                        ]
+
+        # ---
+        # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i
+        # ---
+        for x in range(self.n_fu_col):
+            issue_i = []
+            for y in range(self.n_fu_row):
+                dc = dm[y]
+                # accumulate cell inputs issue
+                issue_i.append(dc.issue_i[x])
+            # wire up inputs from module to row cell inputs
+            m.d.comb += Cat(*issue_i).eq(self.issue_i)
+
+        # ---
+        # connect Matrix go_st_i/go_ld_i to module storable/loadable
+        # ---
+        for y in range(self.n_fu_row):
+            dc = dm[y]
+            # wire up inputs from module to row cell inputs
+            m.d.comb += [dc.go_st_i.eq(self.go_st_i),
+                         dc.go_ld_i.eq(self.go_ld_i),
+                         dc.go_die_i.eq(self.go_die_i),
+                        ]
+
+        # ---
+        # connect Matrix pending
+        # ---
+        for y in range(self.n_fu_row):
+            dc = dm[y]
+            # wire up inputs from module to row cell inputs
+            m.d.comb += [dc.st_pend_i.eq(self.st_pend_i),
+                         dc.ld_pend_i.eq(self.ld_pend_i),
+                        ]
+
+        return m
+
+    def __iter__(self):
+        yield self.st_pend_i
+        yield self.ld_pend_i
+        yield self.issue_i
+        yield self.go_ld_i
+        yield self.go_st_i
+        yield self.storable_o
+        yield self.loadable_o
+                
+    def ports(self):
+        return list(self)
+
+def d_matrix_sim(dut):
+    """ XXX TODO: stimulus only, no checks.  sets ld_pend_i then st_pend_i
+        (each with an issue_i pulse), then pulses go_st_i and go_ld_i """
+    yield dut.ld_pend_i.eq(1)  # FUMemDepMatrix has ld_pend_i, no ld_i
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.st_pend_i.eq(1)  # FUMemDepMatrix has st_pend_i, no st_i
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_st_i.eq(1)
+    yield
+    yield dut.go_st_i.eq(0)
+    yield
+    yield dut.go_ld_i.eq(1)
+    yield
+    yield dut.go_ld_i.eq(0)
+    yield
+
+def test_fu_fu_matrix():
+    dut = FUMemDepMatrix(n_fu_row=3, n_fu_col=3)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_fu_mem_matrix.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_mem_matrix.vcd')
+
+if __name__ == '__main__':
+    test_fu_fu_matrix()
diff --git a/src/soc/scoreboard/fu_mem_picker_vec.py b/src/soc/scoreboard/fu_mem_picker_vec.py
new file mode 100644
index 00000000..dc40bd09
--- /dev/null
+++ b/src/soc/scoreboard/fu_mem_picker_vec.py
@@ -0,0 +1,26 @@
+from nmigen import Elaboratable, Module, Signal, Cat
+
+
+class FUMem_Pick_Vec(Elaboratable):
+    """ these are allocated per-FU (horizontally),
+        and are of length fu_row_n
+    """
+    def __init__(self, fu_row_n):
+        self.fu_row_n = fu_row_n
+        self.st_pend_i = Signal(fu_row_n, reset_less=True)
+        self.ld_pend_i = Signal(fu_row_n, reset_less=True)
+
+        self.storable_o = Signal(reset_less=True)
+        self.loadable_o = Signal(reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # Storable if there are no loads pending (no bit of ld_pend_i set)
+        m.d.comb += self.storable_o.eq(~self.ld_pend_i.bool())
+
+        # Loadable if there are no stores pending (no bit of st_pend_i set)
+        m.d.comb += self.loadable_o.eq(~self.st_pend_i.bool())
+
+        return m
+
diff --git a/src/soc/scoreboard/fu_picker_vec.py b/src/soc/scoreboard/fu_picker_vec.py
new file mode 100644
index 00000000..d38bbfae
--- /dev/null
+++ b/src/soc/scoreboard/fu_picker_vec.py
@@ -0,0 +1,26 @@
+from nmigen import Elaboratable, Module, Signal, Cat
+
+
+class FU_Pick_Vec(Elaboratable):
+    """ these are allocated per-FU (horizontally),
+        and are of length fu_row_n
+    """
+    def __init__(self, fu_row_n):
+        self.fu_row_n = fu_row_n
+        self.rd_pend_i = Signal(fu_row_n, reset_less=True)
+        self.wr_pend_i = Signal(fu_row_n, reset_less=True)
+
+        self.readable_o = Signal(reset_less=True)
+        self.writable_o = Signal(reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # Readable if there are no writes pending
+        m.d.comb += self.readable_o.eq(~self.wr_pend_i.bool())
+
+        # Writable if there are no reads pending
+        m.d.comb += self.writable_o.eq(~self.rd_pend_i.bool())
+
+        return m
+
diff --git a/src/soc/scoreboard/fu_reg_matrix.py b/src/soc/scoreboard/fu_reg_matrix.py
new file mode 100644
index 00000000..8ca1494e
--- /dev/null
+++ b/src/soc/scoreboard/fu_reg_matrix.py
@@ -0,0 +1,304 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
+
+from scoreboard.dependence_cell import DependencyRow
+from scoreboard.fu_wr_pending import FU_RW_Pend
+from scoreboard.reg_select import Reg_Rsv
+from scoreboard.global_pending import GlobalPending
+
+"""
+
+ 6600 Dependency Table Matrix inputs / outputs
+ ---------------------------------------------
+
+                d s1 s2 i  d s1 s2 i  d s1 s2 i  d s1 s2 i
+                | |   | |  | |   | |  | |   | |  | |   | |
+                v v   v v  v v   v v  v v   v v  v v   v v
+ go_rd/go_wr -> dm-r0-fu0  dm-r1-fu0  dm-r2-fu0  dm-r3-fu0 -> wr/rd-pend
+ go_rd/go_wr -> dm-r0-fu1  dm-r1-fu1  dm-r2-fu1  dm-r3-fu1 -> wr/rd-pend
+ go_rd/go_wr -> dm-r0-fu2  dm-r1-fu2  dm-r2-fu2  dm-r3-fu2 -> wr/rd-pend
+                 |  |  |    |  |  |    |  |  |    |  |  |
+                 v  v  v    v  v  v    v  v  v    v  v  v
+                 d  s1 s2   d  s1 s2   d  s1 s2   d  s1 s2
+                 reg sel    reg sel    reg sel    reg sel
+
+"""
+
+class FURegDepMatrix(Elaboratable):
+    """ implements 11.4.7 mitch alsup FU-to-Reg Dependency Matrix, p26:
+        n_fu_row FUs (rows) x n_reg_col regs (cols), n_src src operands """
+    def __init__(self, n_fu_row, n_reg_col, n_src, cancel=None):
+        self.n_src = n_src
+        self.n_fu_row = nf = n_fu_row      # Y (FUs)   ^v
+        self.n_reg_col = n_reg = n_reg_col   # X (Regs)  <>
+
+        # arrays: per-source operand inputs and register-select outputs
+        src = []
+        rsel = []
+        for i in range(n_src):
+            j = i + 1 # name numbering to match src1/src2
+            src.append(Signal(n_reg, name="src%d" % j, reset_less=True))
+            rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True))
+        pend = []
+        for i in range(n_src): # one FU-wide pending vector per source
+            j = i + 1 # name numbering to match src1/src2
+            pend.append(Signal(nf, name="rd_src%d_pend_o" % j, reset_less=True))
+
+        self.dest_i = Signal(n_reg_col, reset_less=True)     # Dest in (top)
+        self.src_i = Array(src)                              # oper in (top)
+
+        # cancellation array (from Address Matching), ties in with go_die_i
+        self.cancel = cancel
+
+        # Register "Global" vectors for determining RaW and WaR hazards
+        self.wr_pend_i = Signal(n_reg_col, reset_less=True) # wr pending (top)
+        self.rd_pend_i = Signal(n_reg_col, reset_less=True) # rd pending (top)
+        self.v_wr_rsel_o = Signal(n_reg_col, reset_less=True) # wr pending (bot)
+        self.v_rd_rsel_o = Signal(n_reg_col, reset_less=True) # rd pending (bot)
+
+        self.issue_i = Signal(n_fu_row, reset_less=True)  # Issue in (top)
+        self.go_wr_i = Signal(n_fu_row, reset_less=True)  # Go Write in (left)
+        self.go_rd_i = Signal(n_fu_row, reset_less=True)  # Go Read in (left)
+        self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left)
+
+        # for Register File Select Lines (horizontal), per-reg
+        self.dest_rsel_o = Signal(n_reg_col, reset_less=True) # dest reg (bot)
+        self.src_rsel_o = Array(rsel)                         # src reg (bot)
+
+        # for Function Unit "forward progress" (vertical), per-FU
+        self.wr_pend_o = Signal(n_fu_row, reset_less=True) # wr pending (right)
+        self.rd_pend_o = Signal(n_fu_row, reset_less=True) # rd pending (right)
+        self.rd_src_pend_o = Array(pend) # per-src rd pending (right)
+
+    def elaborate(self, platform):
+        m = Module()
+        return self._elaborate(m, platform)
+
+    def _elaborate(self, m, platform):
+
+        # ---
+        # matrix of dependency cells: one DependencyRow per Function Unit
+        # ---
+        cancel_mode = self.cancel is not None
+        dm = Array(DependencyRow(self.n_reg_col, self.n_src, cancel_mode) \
+                    for r in range(self.n_fu_row))
+        for fu in range(self.n_fu_row):
+            setattr(m.submodules, "dr_fu%d" % fu, dm[fu])
+
+        # ---
+        # array of Function Unit Pending vectors
+        # ---
+        fupend = Array(FU_RW_Pend(self.n_reg_col, self.n_src) \
+                        for f in range(self.n_fu_row))
+        for fu in range(self.n_fu_row):
+            setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu])
+
+        # ---
+        # array of Register Reservation vectors
+        # ---
+        regrsv = Array(Reg_Rsv(self.n_fu_row, self.n_src) \
+                        for r in range(self.n_reg_col))
+        for rn in range(self.n_reg_col):
+            setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn])
+
+        # ---
+        # connect Function Unit vector
+        # ---
+        wr_pend = []
+        rd_pend = []
+        for fu in range(self.n_fu_row):
+            dc = dm[fu]
+            fup = fupend[fu]
+            dest_fwd_o = []
+            for rn in range(self.n_reg_col):
+                # accumulate cell dest fwd outputs
+                dest_fwd_o.append(dc.dest_fwd_o[rn])
+            # connect cell fwd outputs to FU Vector in [Cat is gooood]
+            m.d.comb += [fup.dest_fwd_i.eq(Cat(*dest_fwd_o)),
+                        ]
+            # accumulate FU Vector outputs
+            wr_pend.append(fup.reg_wr_pend_o)
+            rd_pend.append(fup.reg_rd_pend_o)
+
+        # ... and output them from this module (vertical, width=FUs)
+        m.d.comb += self.wr_pend_o.eq(Cat(*wr_pend))
+        m.d.comb += self.rd_pend_o.eq(Cat(*rd_pend))
+
+        # same for src
+        for i in range(self.n_src):
+            rd_src_pend = []
+            for fu in range(self.n_fu_row):
+                dc = dm[fu]
+                fup = fupend[fu]
+                src_fwd_o = []
+                for rn in range(self.n_reg_col):
+                    # accumulate cell src fwd outputs for this source
+                    src_fwd_o.append(dc.src_fwd_o[i][rn])
+                # connect cell fwd outputs to FU Vector in [Cat is gooood]
+                m.d.comb += [fup.src_fwd_i[i].eq(Cat(*src_fwd_o)),
+                            ]
+                # accumulate FU Vector outputs
+                rd_src_pend.append(fup.reg_rd_src_pend_o[i])
+            # ... and output them from this module (vertical, width=FUs)
+            m.d.comb += self.rd_src_pend_o[i].eq(Cat(*rd_src_pend))
+
+        # ---
+        # connect Reg Selection vector
+        # ---
+        dest_rsel = []
+        for rn in range(self.n_reg_col):
+            rsv = regrsv[rn]
+            dest_rsel_o = []
+            for fu in range(self.n_fu_row):
+                dc = dm[fu]
+                # accumulate cell dest reg-select outputs
+                dest_rsel_o.append(dc.dest_rsel_o[rn])
+            # connect cell reg-select outputs to Reg Vector In
+            m.d.comb += rsv.dest_rsel_i.eq(Cat(*dest_rsel_o))
+
+            # accumulate Reg-Sel Vector outputs
+            dest_rsel.append(rsv.dest_rsel_o)
+
+        # ... and output them from this module (horizontal, width=REGs)
+        m.d.comb += self.dest_rsel_o.eq(Cat(*dest_rsel))
+
+        # same for src
+        for i in range(self.n_src):
+            src_rsel = []
+            for rn in range(self.n_reg_col):
+                rsv = regrsv[rn]
+                src_rsel_o = []
+                for fu in range(self.n_fu_row):
+                    dc = dm[fu]
+                    # accumulate cell src reg-select outputs for this source
+                    src_rsel_o.append(dc.src_rsel_o[i][rn])
+                # connect cell reg-select outputs to Reg Vector In
+                m.d.comb += rsv.src_rsel_i[i].eq(Cat(*src_rsel_o))
+                # accumulate Reg-Sel Vector outputs
+                src_rsel.append(rsv.src_rsel_o[i])
+
+            # ... and output them from this module (horizontal, width=REGs)
+            m.d.comb += self.src_rsel_o[i].eq(Cat(*src_rsel))
+
+        # ---
+        # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i
+        # ---
+        for fu in range(self.n_fu_row):
+            dc = dm[fu]
+            # every row receives the same dest and global pending vectors
+            m.d.comb += [dc.dest_i.eq(self.dest_i),
+                         dc.rd_pend_i.eq(self.rd_pend_i),
+                         dc.wr_pend_i.eq(self.wr_pend_i),
+                        ]
+        # same for src
+        for i in range(self.n_src):
+            for fu in range(self.n_fu_row):
+                dc = dm[fu]
+                # every row receives the same src operand vector
+                m.d.comb += dc.src_i[i].eq(self.src_i[i])
+
+        # accumulate rsel bits into read/write pending vectors.
+        rd_pend_v = []
+        wr_pend_v = []
+        for fu in range(self.n_fu_row):
+            dc = dm[fu]
+            rd_pend_v.append(dc.v_rd_rsel_o)
+            wr_pend_v.append(dc.v_wr_rsel_o)
+        rd_v = GlobalPending(self.n_reg_col, rd_pend_v)
+        wr_v = GlobalPending(self.n_reg_col, wr_pend_v)
+        m.submodules.rd_v = rd_v
+        m.submodules.wr_v = wr_v
+
+        m.d.comb += self.v_rd_rsel_o.eq(rd_v.g_pend_o)
+        m.d.comb += self.v_wr_rsel_o.eq(wr_v.g_pend_o)
+
+        # ---
+        # connect Dep issue_i/go_rd_i/go_wr_i to module issue_i/go_rd/go_wr
+        # ---
+        go_rd_i = []
+        go_wr_i = []
+        issue_i = []
+        for fu in range(self.n_fu_row):
+            dc = dm[fu]
+            # accumulate row go_rd/go_wr/issue inputs (one per FU)
+            go_rd_i.append(dc.go_rd_i)
+            go_wr_i.append(dc.go_wr_i)
+            issue_i.append(dc.issue_i)
+        # wire up inputs from module to row cell inputs (Cat is gooood)
+        m.d.comb += [Cat(*go_rd_i).eq(self.go_rd_i),
+                     Cat(*go_wr_i).eq(self.go_wr_i),
+                     Cat(*issue_i).eq(self.issue_i),
+                    ]
+
+        # ---
+        # connect Dep go_die_i
+        # ---
+        if cancel_mode: # per-FU go_die bit, OR'd with that FU's cancel entry
+            for fu in range(self.n_fu_row):
+                dc = dm[fu]
+                go_die = Repl(self.go_die_i[fu], self.n_fu_row)
+                go_die = go_die | self.cancel[fu]
+                m.d.comb += dc.go_die_i.eq(go_die)
+        else:
+            go_die_i = []
+            for fu in range(self.n_fu_row):
+                dc = dm[fu]
+                # accumulate row go_die inputs (one per FU)
+                go_die_i.append(dc.go_die_i)
+            # wire up inputs from module to row cell inputs (Cat is gooood)
+            m.d.comb += Cat(*go_die_i).eq(self.go_die_i)
+        return m
+
+    def __iter__(self):
+        yield self.dest_i
+        yield from self.src_i
+        yield self.issue_i
+        yield self.go_wr_i
+        yield self.go_rd_i
+        yield self.go_die_i
+        yield self.dest_rsel_o
+        yield from self.src_rsel_o
+        yield self.wr_pend_o
+        yield self.rd_pend_o
+        yield self.wr_pend_i
+        yield self.rd_pend_i
+        yield self.v_wr_rsel_o
+        yield self.v_rd_rsel_o
+        yield from self.rd_src_pend_o
+
+    def ports(self):
+        return list(self)
+
+def d_matrix_sim(dut):
+    """ XXX TODO: placeholder stimulus.  issues with dest set, then with
+        the first src operand set, then pulses go_rd and go_wr """
+    yield dut.dest_i.eq(1)
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.src_i[0].eq(1) # "src1" is the first entry of the src_i Array
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1)
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1)
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+def test_d_matrix():
+    """ writes the matrix out as RTLIL, then runs the d_matrix_sim stimulus """
+    top = FURegDepMatrix(n_fu_row=3, n_reg_col=4, n_src=2)
+    il_text = rtlil.convert(top, ports=top.ports())
+    with open("test_fu_reg_matrix.il", "w") as il_file:
+        il_file.write(il_text)
+    run_simulation(top, d_matrix_sim(top), vcd_name='test_fu_reg_matrix.vcd')
+
+if __name__ == '__main__':
+    test_d_matrix()
diff --git a/src/soc/scoreboard/fu_wr_pending.py b/src/soc/scoreboard/fu_wr_pending.py
new file mode 100644
index 00000000..d0bcb954
--- /dev/null
+++ b/src/soc/scoreboard/fu_wr_pending.py
@@ -0,0 +1,29 @@
+from nmigen import Elaboratable, Module, Signal, Array
+
+
+class FU_RW_Pend(Elaboratable):
+    """ Function Unit read/write pending flags: allocated per-FU
+        (horizontally), with forwarding inputs of length reg_count
+    """
+    def __init__(self, reg_count, n_src):
+        self.n_src = n_src
+        self.reg_count = reg_count
+        self.dest_fwd_i = Signal(reg_count, reset_less=True)
+        self.src_fwd_i = Array(
+            Signal(reg_count, name="src%d" % (idx + 1), reset_less=True)
+            for idx in range(n_src)) # named src1, src2, ...
+        # outputs: write-pending, read-pending (any src), per-src pending
+        self.reg_wr_pend_o = Signal(reset_less=True)
+        self.reg_rd_pend_o = Signal(reset_less=True)
+        self.reg_rd_src_pend_o = Signal(n_src, reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        # wr pending: any dest fwd bit.  rd pending: per-src, then OR'd
+        comb += self.reg_wr_pend_o.eq(self.dest_fwd_i.bool())
+        for idx, src_fwd in enumerate(self.src_fwd_i):
+            comb += self.reg_rd_src_pend_o[idx].eq(src_fwd.bool())
+        comb += self.reg_rd_pend_o.eq(self.reg_rd_src_pend_o.bool())
+        return m
+
diff --git a/src/soc/scoreboard/fumem_dep_cell.py b/src/soc/scoreboard/fumem_dep_cell.py
new file mode 100644
index 00000000..982b55a3
--- /dev/null
+++ b/src/soc/scoreboard/fumem_dep_cell.py
@@ -0,0 +1,92 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Const, Elaboratable
+from nmutil.latch import SRLatch
+
+
+class FUMemDependenceCell(Elaboratable):
+    """ implements 11.4.7 mitch alsup dependence cell, p27
+    """
+    def __init__(self, dummy, n_fu=1):
+        self.n_fu = n_fu
+        self.dummy = Const(~(1<<dummy), n_fu) # mask: bit <dummy> cleared
+        # inputs
+        self.st_pend_i = Signal(n_fu, reset_less=True) # store pend in (left)
+        self.ld_pend_i = Signal(n_fu, reset_less=True) # load pend in (left)
+        self.issue_i = Signal(n_fu, reset_less=True)    # Issue in (top)
+
+        self.go_ld_i = Signal(n_fu, reset_less=True) # Go Load in (left)
+        self.go_st_i = Signal(n_fu, reset_less=True)  # Go Store in (left)
+        self.go_die_i = Signal(n_fu, reset_less=True) # Go Die in (left)
+
+        # outputs (latched st/ld wait)
+        self.st_wait_o = Signal(n_fu, reset_less=True) # store wait out (right)
+        self.ld_wait_o = Signal(n_fu, reset_less=True) # load wait out (right)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.st_c = st_c = SRLatch(sync=False, llen=self.n_fu)
+        m.submodules.ld_c = ld_c = SRLatch(sync=False, llen=self.n_fu)
+
+        # set on issue when the corresponding pend_i bit is HI, masked
+        # with "dummy" (every bit set except bit <dummy>).  each latch's
+        # set input is assigned exactly once: a second comb assignment
+        # to the same signal would just override the first.
+        m.d.comb += st_c.s.eq(self.issue_i & self.st_pend_i & self.dummy)
+        m.d.comb += ld_c.s.eq(self.issue_i & self.ld_pend_i & self.dummy)
+
+        # reset on go_ld / go_st HI, or on go_die
+        m.d.comb += ld_c.r.eq(self.go_ld_i | self.go_die_i)
+        m.d.comb += st_c.r.eq(self.go_st_i | self.go_die_i)
+
+        # connect output: the latches' qlq, forced LO in any bit
+        # position where issue_i is HI
+        m.d.comb += self.st_wait_o.eq(st_c.qlq & ~self.issue_i)
+        m.d.comb += self.ld_wait_o.eq(ld_c.qlq & ~self.issue_i)
+
+        return m
+
+    def __iter__(self):
+        yield self.st_pend_i
+        yield self.ld_pend_i
+        yield self.issue_i
+        yield self.go_ld_i
+        yield self.go_st_i
+        yield self.go_die_i
+        yield self.st_wait_o
+        yield self.ld_wait_o
+
+    def ports(self):
+        return list(self)
+
+
+def dcell_sim(dut):
+    yield dut.ld_pend_i.eq(1) # load pending, then issue
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.st_pend_i.eq(1) # store pending, then issue
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_st_i.eq(1)
+    yield
+    yield dut.go_st_i.eq(0)
+    yield
+    yield dut.go_ld_i.eq(1)
+    yield
+    yield dut.go_ld_i.eq(0)
+    yield
+
+def test_dcell():
+    """ writes the cell out as RTLIL, then runs the dcell_sim stimulus """
+    cell = FUMemDependenceCell(dummy=0, n_fu=4)
+    il_text = rtlil.convert(cell, ports=cell.ports())
+    with open("test_fumem_dcell.il", "w") as il_file:
+        il_file.write(il_text)
+    run_simulation(cell, dcell_sim(cell), vcd_name='test_fumem_dcell.vcd')
+
+if __name__ == '__main__':
+    test_dcell()
diff --git a/src/soc/scoreboard/global_pending.py b/src/soc/scoreboard/global_pending.py
new file mode 100644
index 00000000..540f4430
--- /dev/null
+++ b/src/soc/scoreboard/global_pending.py
@@ -0,0 +1,95 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Cat, Elaboratable
+
+
+class GlobalPending(Elaboratable):
+    """ Global Pending Vector: ORs all incoming Function Unit vectors
+        together, bit by bit.  Usable for Read or Write Global Pending,
+        and for INT or FP Global Pending.
+
+        Inputs:
+        * :dep:       register file depth
+        * :fu_vecs:   a python list of function unit "pending" vectors, each
+                      vector being a Signal of width equal to the reg file.
+        * :sync:      if True, g_pend_o is assigned in the sync domain
+        Notes:
+
+        * the regfile may be Int or FP, this code doesn't care which.
+          obviously do not try to put in a mixture of regfiles into fu_vecs.
+        * this code also doesn't care if it's used for Read Pending or Write
+          pending, it can be used for both: again, obviously, do not try to
+          put in a mixture of read *and* write pending vectors in.
+        * if some Function Units happen not to be uniform (don't operate
+          on a particular register: extremely unusual), they must set a
+          Const zero bit in the vector.
+    """
+    def __init__(self, dep, fu_vecs, sync=False):
+        self.reg_dep = dep
+        self.sync = sync
+        # inputs: every FU vector must be exactly as wide as the regfile
+        self.fu_vecs = fu_vecs
+        for vec in fu_vecs:
+            assert len(vec) == dep, "FU Vector must be same width as regfile"
+        # output: global pending vector
+        self.g_pend_o = Signal(dep, reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # statements go to the sync domain if requested, comb otherwise
+        domain = m.d.sync if self.sync else m.d.comb
+        # per-register: gather that register's bit from every FU vector
+        # and OR them down to a single "pending" bit
+        reg_bits = []
+        for regnum in range(self.reg_dep):
+            fu_bits = [vec[regnum] for vec in self.fu_vecs]
+            reg_bits.append(Cat(*fu_bits).bool())
+        # merge all the OR'd bits into the global vector
+        domain += self.g_pend_o.eq(Cat(*reg_bits))
+
+        return m
+
+    def __iter__(self):
+        # FU vectors first, then the merged output
+        yield from (*self.fu_vecs, self.g_pend_o)
+
+    def ports(self):
+        return list(self)
+
+
+def g_vec_sim(dut):
+    """ sets and clears bit 0 of the first and last FU vectors in turn """
+    first, last = dut.fu_vecs[0], dut.fu_vecs[-1]
+    yield first.eq(1)
+    yield
+    yield first.eq(0)
+    yield
+    yield last.eq(1)
+    yield
+    yield
+    yield
+    yield last.eq(0)
+    yield
+    yield first.eq(1)
+    yield
+    yield last.eq(1)
+    yield
+    yield first.eq(0)
+    yield
+    yield last.eq(0)
+    yield
+
+def test_g_vec():
+    """ builds a GlobalPending over three 32-bit FU vectors (fu0..fu2),
+        writes it out as RTLIL, then runs the g_vec_sim stimulus
+    """
+    fu_vecs = [Signal(32, name="fu%d" % idx) for idx in range(3)]
+    gpend = GlobalPending(32, fu_vecs)
+    il_text = rtlil.convert(gpend, ports=gpend.ports())
+    with open("test_global_pending.il", "w") as il_file:
+        il_file.write(il_text)
+    run_simulation(gpend, g_vec_sim(gpend), vcd_name='test_global_pending.vcd')
+
+if __name__ == '__main__':
+    test_g_vec()
diff --git a/src/soc/scoreboard/group_picker.py b/src/soc/scoreboard/group_picker.py
new file mode 100644
index 00000000..a59fdd28
--- /dev/null
+++ b/src/soc/scoreboard/group_picker.py
@@ -0,0 +1,124 @@
+""" Group Picker: to select an instruction that is permitted to read (or write)
+    based on the Function Unit expressing a *desire* to read (or write).
+
+    The job of the Group Picker is extremely simple yet extremely important.
+    It sits in front of a register file port (read or write) and stops it from
+    being corrupted.  It's a "port contention selector", basically.
+
+    The way it works is:
+
+    * Function Units need to read from (or write to) the register file,
+      in order to get (or store) their operands, so they each have a signal,
+      readable (or writable), which "expresses" this need.  This is a
+      *unary* encoding.
+
+    * The Function Units also have a signal which indicates that they
+      are requesting "release" of the register file port (this because
+      in the scoreboard, readable/writable can be permanently HI even
+      if the FU is idle, whereas the "release" signal is very specifically
+      only HI if the read (or write) latch is still active)
+
+    * The Group Picker takes this unary encoding of the desire to read
+      (or write) and, on a priority basis, activates one *and only* one
+      of those signals, again as a unary output.
+
+    * Due to the way that the Computation Unit works, that signal (Go_Read
+      or Go_Write) will fire for one (and only one) cycle, and can be used
+      to enable the register file port read (or write) lines.  The Go_Read/Wr
+      signal basically loops back to the Computation Unit and resets the
+      "desire-to-read/write-expressing" latch.
+
+    In theory (and in practice!) the following is possible:
+
+    * Separate src1 and src2 Group Pickers.  This would allow instructions
+      with only one operand to read to not block up other instructions,
+      and it would also allow 3-operand instructions to be interleaved
+      with 1 and 2 operand instructions.
+
+    * *Multiple* Group Pickers (multi-issue).  This would require
+      a corresponding increase in the number of register file ports,
+      either 4R2W (or more) or by "striping" the register file into
+      split banks (a strategy best deployed on Vector Processors)
+
+"""
+
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Elaboratable
+
+from nmutil.picker import PriorityPicker
+
+
+class GroupPicker(Elaboratable):
+    """ implements 10.5 mitch alsup group picker, p27
+    """
+    def __init__(self, wid):
+        self.gp_wid = wid
+        # inputs
+        self.readable_i = Signal(wid, reset_less=True) # readable in (top)
+        self.writable_i = Signal(wid, reset_less=True) # writable in (top)
+        self.rd_rel_i = Signal(wid, reset_less=True)   # read release in (top)
+        self.req_rel_i = Signal(wid, reset_less=True) # release request in (top)
+
+        # outputs
+        self.go_rd_o = Signal(wid, reset_less=True)  # go read (bottom)
+        self.go_wr_o = Signal(wid, reset_less=True)  # go write (bottom)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        m.submodules.rpick = rpick = PriorityPicker(self.gp_wid)
+        m.submodules.wpick = wpick = PriorityPicker(self.gp_wid)
+
+        # combine release (output ready signal) with writeable
+        m.d.comb += wpick.i.eq(self.writable_i & self.req_rel_i)
+        m.d.comb += self.go_wr_o.eq(wpick.o)
+        # likewise combine read-release with readable
+        m.d.comb += rpick.i.eq(self.readable_i & self.rd_rel_i)
+        m.d.comb += self.go_rd_o.eq(rpick.o)
+        return m
+
+    def __iter__(self):
+        yield self.readable_i
+        yield self.writable_i
+        yield self.rd_rel_i
+        yield self.req_rel_i
+        yield self.go_rd_o
+        yield self.go_wr_o
+
+    def ports(self):
+        return list(self)
+
+
+def grp_pick_sim(dut):
+    """ exercises the read pick (rd_rel_i) then the write pick (req_rel_i) """
+    yield dut.readable_i.eq(1)
+    yield dut.rd_rel_i.eq(1)
+    yield
+    yield dut.rd_rel_i.eq(0)
+    yield
+    yield dut.writable_i.eq(1)
+    yield dut.req_rel_i.eq(1)
+    yield
+    yield
+    yield dut.req_rel_i.eq(0)
+    yield
+    yield dut.rd_rel_i.eq(1)
+    yield
+    yield dut.rd_rel_i.eq(0)
+    yield
+    yield dut.req_rel_i.eq(1)
+    yield
+    yield dut.req_rel_i.eq(0)
+    yield
+
+def test_grp_pick():
+    """ writes the picker out as RTLIL, then runs the grp_pick_sim stimulus """
+    picker = GroupPicker(4)
+    il_text = rtlil.convert(picker, ports=picker.ports())
+    with open("test_grp_pick.il", "w") as il_file:
+        il_file.write(il_text)
+    run_simulation(picker, grp_pick_sim(picker), vcd_name='test_grp_pick.vcd')
+
+if __name__ == '__main__':
+    test_grp_pick()
diff --git a/src/soc/scoreboard/instruction_q.py b/src/soc/scoreboard/instruction_q.py
new file mode 100644
index 00000000..65496a6a
--- /dev/null
+++ b/src/soc/scoreboard/instruction_q.py
@@ -0,0 +1,179 @@
+from math import log
+
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Cat, Array, Const, Repl, Elaboratable
+from nmutil.iocontrol import RecordObject
+from nmutil.nmoperator import eq, shape, cat
+
+
+class Instruction(RecordObject):
+    """ record with fields oper_i, opim_i, imm_i, dest_i, src1_i, src2_i """
+    def __init__(self, name, wid, opwid):
+        RecordObject.__init__(self, name=name)
+        self.oper_i = Signal(opwid, reset_less=True)
+        self.opim_i = Signal(1, reset_less=True) # src2 is an immediate
+        self.imm_i = Signal(wid, reset_less=True)
+        self.dest_i = Signal(wid, reset_less=True)
+        self.src1_i = Signal(wid, reset_less=True)
+        self.src2_i = Signal(wid, reset_less=True)
+
+    @staticmethod
+    def nq(n_insns, name, wid, opwid):
+        """ returns an Array of n_insns Instructions named name0, name1... """
+        return Array([Instruction("%s%d" % (name, idx), wid, opwid)
+                      for idx in range(n_insns)])
+
+
+class InstructionQ(Elaboratable):
+    """ contains a queue of (part-decoded) instructions.
+
+        output is copied combinatorially from the front of the queue,
+        for easy access on the clock cycle.  only "n_out" instructions
+        are made available this way
+
+        input and shifting occurs on sync.
+    """
+    def __init__(self, wid, opwid, iqlen, n_in, n_out):
+        """ constructor
+
+            Inputs
+            * :wid:         register file width
+            * :opwid:       operand width
+            * :iqlen:       instruction queue length
+            * :n_in:        max number of instructions allowed "in"
+            * :n_out:       number of instructions presented on data_o
+        """
+        self.iqlen = iqlen
+        self.reg_width = wid
+        self.opwid = opwid
+        self.n_in = n_in
+        self.n_out = n_out
+        mqbits = (int(log(iqlen) / log(2))+2, False)
+
+        self.p_add_i = Signal(mqbits) # number of instructions to add (from data_i)
+        self.p_ready_o = Signal() # instructions were added
+        self.data_i = Instruction.nq(n_in, "data_i", wid, opwid)
+        
+        self.data_o = Instruction.nq(n_out, "data_o", wid, opwid)
+        self.n_sub_i = Signal(mqbits) # number of instructions to remove
+        self.n_sub_o = Signal(mqbits) # number of instructions removed
+
+        self.qsz = shape(self.data_o[0])[0] # width of one flattened entry
+        q = []
+        for i in range(iqlen):
+            q.append(Signal(self.qsz, name="q%d" % i))
+        self.q = Array(q)
+        self.qlen_o = Signal(mqbits) # current number of queued entries
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
+
+        iqlen = self.iqlen
+        mqbits = int(log(iqlen) / log(2))
+
+        left = Signal((mqbits+2, False))
+        spare = Signal((mqbits+2, False))
+        qmaxed = Signal()
+        # start_q: position new entries are written; end_q: position read
+        start_q = Signal(mqbits)
+        end_q = Signal(mqbits)
+        mqlen = Const(iqlen, (len(left), False))
+        print ("mqlen", mqlen) # NOTE(review): debug print on every elaborate
+
+        # work out how many can be subtracted from the queue
+        with m.If(self.n_sub_i):
+            qinmax = Signal()
+            comb += qinmax.eq(self.n_sub_i > self.qlen_o)
+            with m.If(qinmax):
+                comb += self.n_sub_o.eq(self.qlen_o)
+            with m.Else():
+                comb += self.n_sub_o.eq(self.n_sub_i)
+
+        # work out how many new items are going to be in the queue
+        comb += left.eq(self.qlen_o )#- self.n_sub_o)
+        comb += spare.eq(mqlen - self.p_add_i)
+        comb += qmaxed.eq(left <= spare)
+        comb += self.p_ready_o.eq(qmaxed & (self.p_add_i != 0))
+
+        # put q (flattened) into output
+        for i in range(self.n_out):
+            opos = Signal(mqbits)
+            comb += opos.eq(end_q + i)
+            comb += cat(self.data_o[i]).eq(self.q[opos])
+
+        with m.If(self.n_sub_o):
+            # ok now the end's moved
+            sync += end_q.eq(end_q + self.n_sub_o)
+
+        with m.If(self.p_ready_o):
+            # copy in the input... insanely gate-costly... *sigh*...
+            for i in range(self.n_in):
+                with m.If(self.p_add_i > Const(i, len(self.p_add_i))):
+                    ipos = Signal(mqbits)
+                    comb += ipos.eq(start_q + i) # should roll round
+                    sync += self.q[ipos].eq(cat(self.data_i[i]))
+            sync += start_q.eq(start_q + self.p_add_i)
+
+        with m.If(self.p_ready_o):
+            # update the queue length: add the new entries, minus removals
+            add2 = Signal(mqbits+1)
+            comb += add2.eq(self.qlen_o + self.p_add_i)
+            sync += self.qlen_o.eq(add2 - self.n_sub_o)
+        with m.Else():
+            sync += self.qlen_o.eq(self.qlen_o - self.n_sub_o)
+
+        return m
+
+    def __iter__(self):
+        yield from self.q
+
+        yield self.p_ready_o
+        for o in self.data_i:
+            yield from list(o)
+        yield self.p_add_i
+        
+        for o in self.data_o:
+            yield from list(o)
+        yield self.n_sub_i
+        yield self.n_sub_o
+
+    def ports(self):
+        return list(self)
+
+
+def instruction_q_sim(dut):
+    """ queues entries from data_i[0] (p_add_i), then removes (n_sub_i) """
+    yield dut.data_i[0].dest_i.eq(1)
+    yield dut.p_add_i.eq(1)
+    yield
+    yield dut.p_add_i.eq(0)
+    yield
+    yield dut.data_i[0].src1_i.eq(1)
+    yield dut.p_add_i.eq(1)
+    yield
+    yield
+    yield dut.p_add_i.eq(0)
+    yield
+    yield dut.n_sub_i.eq(1)
+    yield
+    yield dut.n_sub_i.eq(0)
+    yield
+    yield dut.n_sub_i.eq(1)
+    yield
+    yield dut.n_sub_i.eq(0)
+    yield
+
+def test_instruction_q():
+    """ writes the queue out as RTLIL, then runs instruction_q_sim """
+    iq = InstructionQ(16, 4, 4, n_in=2, n_out=2)
+    il_text = rtlil.convert(iq, ports=iq.ports())
+    with open("test_instruction_q.il", "w") as il_file:
+        il_file.write(il_text)
+    run_simulation(iq, instruction_q_sim(iq),
+                   vcd_name='test_instruction_q.vcd')
+
+if __name__ == '__main__':
+    test_instruction_q()
diff --git a/src/soc/scoreboard/issue_unit.py b/src/soc/scoreboard/issue_unit.py
new file mode 100644
index 00000000..3ec2a31c
--- /dev/null
+++ b/src/soc/scoreboard/issue_unit.py
@@ -0,0 +1,278 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Cat, Array, Const, Repl, Elaboratable
+from nmigen.lib.coding import Decoder
+
+from scoreboard.group_picker import PriorityPicker
+
+
+class RegDecode(Elaboratable):
+    """ decodes three binary register numbers (dest, src1, src2) into unary
+
+        Inputs
+
+        * :wid:         width of each unary output (inputs are range(wid))
+    """
+    def __init__(self, wid):
+        self.reg_width = wid
+
+        # inputs
+        self.enable_i = Signal(reset_less=True) # enable decoders
+        self.dest_i = Signal(range(wid), reset_less=True) # Dest R# in
+        self.src1_i = Signal(range(wid), reset_less=True) # oper1 R# in
+        self.src2_i = Signal(range(wid), reset_less=True) # oper2 R# in
+
+        # outputs
+        self.dest_o = Signal(wid, reset_less=True) # Dest unary out
+        self.src1_o = Signal(wid, reset_less=True) # oper1 unary out
+        self.src2_o = Signal(wid, reset_less=True) # oper2 unary out
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.dest_d = dest_d = Decoder(self.reg_width)
+        m.submodules.src1_d = src1_d = Decoder(self.reg_width)
+        m.submodules.src2_d = src2_d = Decoder(self.reg_width)
+
+        # all three decoders are wired identically: number in, unary out
+        for d, i, o in [(dest_d, self.dest_i, self.dest_o),
+                     (src1_d, self.src1_i, self.src1_o),
+                     (src2_d, self.src2_i, self.src2_o)]:
+            m.d.comb += d.i.eq(i)
+            m.d.comb += d.n.eq(~self.enable_i)  # Decoder "n" is driven by !enable
+            m.d.comb += o.eq(d.o)
+
+        return m
+
+    def __iter__(self):
+        yield self.enable_i  # inputs first, then outputs
+        yield self.dest_i
+        yield self.src1_i
+        yield self.src2_i
+        yield self.dest_o
+        yield self.src1_o
+        yield self.src2_o
+
+    def ports(self):
+        return list(self)  # port list, e.g. for rtlil.convert(ports=...)
+
+
+class IssueUnitGroup(Elaboratable):
+    """ Manages a batch of Computation Units all of which can do the same task
+
+        A priority picker will allocate one instruction in this cycle based
+        on whether the others are busy.
+
+        insn_i indicates to this module that there is an instruction to be
+        issued which this group can handle
+
+        busy_i is a vector of signals that indicate, in this cycle, which
+        of the units are currently busy.
+
+        busy_o is raised when *every* unit is busy, i.e. it is only "safe
+        to proceed" (a unit here can *be* issued an instruction) when low
+
+        fn_issue_o indicates, out of the available (non-busy) units,
+        which one may be selected
+    """
+    def __init__(self, n_insns):
+        """ Set up inputs and outputs for the Group
+
+            Input Parameters
+
+            * :n_insns:     number of instructions in this issue unit.
+        """
+        self.n_insns = n_insns
+
+        # inputs
+        self.insn_i = Signal(reset_less=True, name="insn_i")
+        self.busy_i = Signal(n_insns, reset_less=True, name="busy_i")
+
+        # outputs
+        self.fn_issue_o = Signal(n_insns, reset_less=True, name="fn_issue_o")
+        self.busy_o = Signal(reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        if self.n_insns == 0:  # empty group: nothing to pick, no logic
+            return m
+
+        m.submodules.pick = pick = PriorityPicker(self.n_insns)
+
+        # temporaries
+        allissue = Signal(self.n_insns, reset_less=True)
+
+        m.d.comb += allissue.eq(Repl(self.insn_i, self.n_insns))
+        # Pick one (and only one) of the units to proceed in this cycle
+        m.d.comb += pick.i.eq(~self.busy_i & allissue)
+
+        # group is busy only when no unit at all is free (all busy_i bits set)
+        m.d.comb += self.busy_o.eq(~((~self.busy_i).bool()))
+
+        # Picker only raises one signal, therefore it's also the fn_issue
+        m.d.comb += self.fn_issue_o.eq(pick.o & Repl(~self.busy_o, self.n_insns))
+
+        return m
+
+    def __iter__(self):
+        yield self.insn_i
+        yield self.busy_i
+        yield self.fn_issue_o
+        yield self.busy_o  # (was g_issue_o, which this class does not have)
+
+    def ports(self):
+        return list(self)
+
+
+class IssueUnitArray(Elaboratable):
+    """ Convenience module that amalgamates the issue and busy signals
+
+        unit issue_i is to be set externally, at the same time as the
+        ALU group oper_i
+    """
+    def __init__(self, units):
+        self.units = units  # each must provide busy_i, busy_o and fn_issue_o
+        self.issue_o = Signal(reset_less=True)
+        n_insns = 0
+        for u in self.units:  # total width = sum of all units' fn_issue_o
+            n_insns += len(u.fn_issue_o)
+        self.busy_i = Signal(n_insns, reset_less=True)
+        self.fn_issue_o = Signal(n_insns, reset_less=True)
+        self.n_insns = n_insns
+
+    def elaborate(self, platform):
+        m = Module()
+        for i, u in enumerate(self.units):  # submodules named issue0, issue1...
+            setattr(m.submodules, "issue%d" % i, u)
+
+        g_issue_o = []  # collects each unit's busy_o (despite the name)
+        busy_i = []
+        fn_issue_o = []
+        for u in self.units:
+            busy_i.append(u.busy_i)
+            g_issue_o.append(u.busy_o)
+            fn_issue_o.append(u.fn_issue_o)
+        m.d.comb += self.issue_o.eq(~(Cat(*g_issue_o).bool()))  # no unit busy_o
+        m.d.comb += self.fn_issue_o.eq(Cat(*fn_issue_o))  # unit 0 in the LSBs
+        m.d.comb += Cat(*busy_i).eq(self.busy_i)  # split busy_i across units
+
+        return m
+
+    def ports(self):
+        yield self.busy_i
+        yield self.issue_o
+        yield self.fn_issue_o
+        yield from self.units  # NOTE(review): yields unit objects, not Signals
+
+
+
+class IssueUnit(Elaboratable):
+    """ implements 11.4.14 issue unit, p50
+
+        Inputs
+
+        * :n_insns:     width of insn_i, busy_i and fn_issue_o (one bit each)
+    """
+    def __init__(self, n_insns):
+        self.n_insns = n_insns
+
+        # inputs
+        self.insn_i = Signal(n_insns, reset_less=True, name="insn_i")
+        self.busy_i = Signal(n_insns, reset_less=True, name="busy_i")
+
+        # outputs
+        self.fn_issue_o = Signal(n_insns, reset_less=True, name="fn_issue_o")
+        self.g_issue_o = Signal(reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        if self.n_insns == 0:  # nothing to issue: emit an empty module
+            return m
+
+        # temporaries
+        fu_stall = Signal(reset_less=True)
+
+        ib_l = []  # per-bit "requested AND busy" terms
+        for i in range(self.n_insns):
+            ib_l.append(self.insn_i[i] & self.busy_i[i])
+        m.d.comb += fu_stall.eq(Cat(*ib_l).bool())  # stall if any such term
+        m.d.comb += self.g_issue_o.eq(~(fu_stall))  # global issue = no stall
+        for i in range(self.n_insns):  # gate each request with global issue
+            m.d.comb += self.fn_issue_o[i].eq(self.g_issue_o & self.insn_i[i])
+
+        return m
+
+    def __iter__(self):
+        yield self.insn_i  # inputs first, then outputs
+        yield self.busy_i
+        yield self.fn_issue_o
+        yield self.g_issue_o
+
+    def ports(self):
+        return list(self)
+
+
+class IntFPIssueUnit(Elaboratable):  # pairs two IssueUnits behind one issue_o
+    def __init__(self, n_int_insns, n_fp_insns):
+        self.i = IssueUnit(n_int_insns)  # first unit, sized by n_int_insns
+        self.f = IssueUnit(n_fp_insns)   # second unit, sized by n_fp_insns
+        self.issue_o = Signal(reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.intissue = self.i
+        m.submodules.fpissue = self.f
+
+        m.d.comb += self.issue_o.eq(self.i.g_issue_o | self.f.g_issue_o)  # OR
+
+        return m
+
+    def ports(self):  # generator: issue_o, then each sub-unit's own ports
+        yield self.issue_o
+        yield from self.i
+        yield from self.f
+
+
+def issue_unit_sim(dut):
+    yield dut.dest_i.eq(1)  # NOTE(review): test_issue_unit passes an
+    yield dut.issue_i.eq(1) # IntFPIssueUnit here, which only has .i, .f and
+    yield                   # .issue_o: dest_i / issue_i / src1_i / go_rd_i /
+    yield dut.issue_i.eq(0) # go_wr_i do not exist on it, so this stimulus
+    yield                   # cannot run as written (placeholder testbench)
+    yield dut.src1_i.eq(1)
+    yield dut.issue_i.eq(1)
+    yield
+    yield
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1)
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1)
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+def test_issue_unit():
+    dut = IssueUnitGroup(3)  # each design below is dumped to its own .il file
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_issue_unit_group.il", "w") as f:
+        f.write(vl)
+
+    dut = IssueUnit(3)  # constructor takes n_insns only
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_issue_unit.il", "w") as f:
+        f.write(vl)
+
+    dut = IntFPIssueUnit(3, 3)  # (n_int_insns, n_fp_insns)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_intfp_issue_unit.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, issue_unit_sim(dut), vcd_name='test_issue_unit.vcd')
+
+if __name__ == '__main__':
+    test_issue_unit()
diff --git a/src/soc/scoreboard/ldst_dep_cell.py b/src/soc/scoreboard/ldst_dep_cell.py
new file mode 100644
index 00000000..70f4b9ba
--- /dev/null
+++ b/src/soc/scoreboard/ldst_dep_cell.py
@@ -0,0 +1,116 @@
+""" Mitch Alsup 6600-style LD/ST scoreboard Dependency Cell
+
+Relevant bugreports:
+
+* http://bugs.libre-riscv.org/show_bug.cgi?id=81
+
+"""
+
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Repl, Elaboratable
+from nmutil.latch import SRLatch
+
+
+class LDSTDepCell(Elaboratable):
+    """ implements 11.4.12 mitch alsup load/store dependence cell, p45
+    """
+    def __init__(self, n_ls=1):
+        self.n_ls = n_ls  # width of every vertical (top / right) vector
+        # inputs
+        self.load_h_i = Signal(reset_less=True)     # load in (left)
+        self.stor_h_i = Signal(reset_less=True)     # store in (left)
+        self.load_v_i = Signal(n_ls, reset_less=True)     # load in (top)
+        self.stor_v_i = Signal(n_ls, reset_less=True)     # store in (top)
+        self.issue_i = Signal(reset_less=True)    # Issue in (left)
+        self.go_die_i = Signal(reset_less=True)    # Go Die in (left)
+
+        # load / store hit - basically connect these to go_wr from LD/STCompUnit
+        # LD.go_wr -> load_hit_i, ST.go_wr -> stwd_hit_i.
+        self.load_hit_i = Signal(n_ls, reset_less=True) # ld hit in (right)
+        self.stwd_hit_i = Signal(n_ls, reset_less=True) # st w/ hit in (right)
+
+        # outputs (latched rd/wr pend)
+        self.ld_hold_st_o = Signal(reset_less=True) # ld holds st out (l)
+        self.st_hold_ld_o = Signal(reset_less=True) # st holds ld out (l)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.war_l = war_l = SRLatch(sync=False, llen=self.n_ls) # WaR
+        m.submodules.raw_l = raw_l = SRLatch(sync=False, llen=self.n_ls) # RaW
+
+        # temporaries (repeat-extend)
+        issue = Repl(self.issue_i, self.n_ls)
+        die = Repl(self.go_die_i, self.n_ls)
+
+        # issue & store & load - used for WAR Setting.  LD is left, ST is top
+        i_s = Signal(reset_less=True)
+        i_s_l = Signal(self.n_ls, reset_less=True)
+        m.d.comb += i_s.eq(issue & self.stor_h_i) # horizontal single-signal
+        m.d.comb += i_s_l.eq(Repl(i_s, self.n_ls) & self.load_v_i) # multi, vert
+
+        # issue & load & store - used for RAW Setting.  ST is left, LD is top
+        i_l = Signal(reset_less=True)
+        i_l_s = Signal(self.n_ls, reset_less=True)
+        m.d.comb += i_l.eq(issue & self.load_h_i) # horizontal single-signal
+        m.d.comb += i_l_s.eq(Repl(i_l, self.n_ls) & self.stor_v_i) # multi, vert
+
+        # write after read latch: loads block stores
+        m.d.comb += war_l.s.eq(i_s_l)
+        m.d.comb += war_l.r.eq(die | ~self.load_v_i) # reset on LD
+
+        # read after write latch: stores block loads
+        m.d.comb += raw_l.s.eq(i_l_s) # RaW term (was i_s_l, same as WaR)
+        m.d.comb += raw_l.r.eq(die | ~self.stor_v_i) # reset on ST
+
+        # Hold results (read out horizontally, accumulate in OR fashion)
+        m.d.comb += self.ld_hold_st_o.eq((war_l.qn & self.load_hit_i).bool())
+        m.d.comb += self.st_hold_ld_o.eq((raw_l.qn & self.stwd_hit_i).bool())
+
+        return m
+
+    def __iter__(self):
+        yield self.load_h_i
+        yield self.load_v_i
+        yield self.stor_h_i
+        yield self.stor_v_i  # (stor_h_i used to be yielded twice here)
+        yield self.issue_i
+        yield self.load_hit_i
+        yield self.stwd_hit_i
+        yield self.ld_hold_st_o
+        yield self.st_hold_ld_o
+
+    def ports(self):
+        return list(self)
+
+
+def dcell_sim(dut):
+    yield dut.dest_i.eq(1)  # NOTE(review): LDSTDepCell has issue_i but no
+    yield dut.issue_i.eq(1) # dest_i / src1_i / go_rd_i / go_wr_i, so this
+    yield                   # stimulus cannot run against it as written
+    yield dut.issue_i.eq(0) # (placeholder copied from another testbench)
+    yield
+    yield dut.src1_i.eq(1)
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1)
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1)
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+def test_dcell():
+    dut = LDSTDepCell()  # default n_ls=1
+    vl = rtlil.convert(dut, ports=dut.ports())  # RTLIL text of the cell
+    with open("test_ldst_dcell.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, dcell_sim(dut), vcd_name='test_ldst_dcell.vcd')
+
+if __name__ == '__main__':
+    test_dcell()
diff --git a/src/soc/scoreboard/ldst_matrix.py b/src/soc/scoreboard/ldst_matrix.py
new file mode 100644
index 00000000..1bb75b03
--- /dev/null
+++ b/src/soc/scoreboard/ldst_matrix.py
@@ -0,0 +1,163 @@
+""" Mitch Alsup 6600-style LD/ST Memory Scoreboard Matrix (sparse vector)
+
+6600 LD/ST Dependency Table Matrix inputs / outputs
+---------------------------------------------------
+
+Relevant comments (p45-46):
+
+* If there are no WAR dependencies on a Load instruction with a computed
+  address it can assert Bank_Addressable and Translate_Addressable.
+
+* If there are no RAW dependencies on a Store instruction with both a
+  write permission and store data present it can assert Bank_Addressable
+
+Relevant bugreports:
+
+* http://bugs.libre-riscv.org/show_bug.cgi?id=81
+
+Notes:
+
+* Load Hit (or Store Hit with Data) are asserted by the LD/ST Computation
+  Unit when it has data and address ready
+
+* Asserting the ld_hit_i (or stwd_hit_i) *requires* that the output be
+  captured or at least taken into consideration for the next LD/STs
+  *right then*.  Failure to observe the xx_hold_xx_o *will* result in
+  data corruption, as they are *only* asserted if xx_hit_i is asserted
+
+* The hold signals still have to go through "maybe address clashes"
+  detection, they cannot just be used as-is to stop a LD/ST.
+
+"""
+
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
+
+from ldst_dep_cell import LDSTDepCell
+
+
+class LDSTDepMatrix(Elaboratable):
+    """ implements 11.4.12 mitch alsup LD/ST Dependency Matrix, p46
+        actually a sparse matrix along the diagonal.
+
+        load-hold-store and store-hold-load accumulate in a priority-picking
+        fashion, ORing together.  the OR gate from the dependency cell is
+        here.
+    """
+    def __init__(self, n_ldst):
+        self.n_ldst = n_ldst                  # X and Y (FUs)
+        self.ld_pend_i = Signal(n_ldst, reset_less=True)  # load pending in
+        self.st_pend_i = Signal(n_ldst, reset_less=True)  # store pending in
+        self.issue_i = Signal(n_ldst, reset_less=True) # Issue in
+        self.go_die_i = Signal(n_ldst, reset_less=True) # Die/Reset in
+
+        self.load_hit_i = Signal(n_ldst, reset_less=True) # load hit in
+        self.stwd_hit_i = Signal(n_ldst, reset_less=True) # store w/data hit in
+
+        # outputs
+        self.ld_hold_st_o = Signal(n_ldst, reset_less=True) # load holds st out
+        self.st_hold_ld_o = Signal(n_ldst, reset_less=True) # st holds load out
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # ---
+        # matrix of dependency cells.  actually, LDSTDepCell is a row, now
+        # ---
+        dm = Array(LDSTDepCell(self.n_ldst) for f in range(self.n_ldst))
+        for fu in range(self.n_ldst):
+            setattr(m.submodules, "dm_fu%d" % (fu), dm[fu])
+
+        # ---
+        # connect Function Unit vector, all horizontal
+        # ---
+        lhs_l = []
+        shl_l = []
+        issue_l = []
+        go_die_l = []
+        load_h_l = []
+        stor_h_l = []
+        for fu in range(self.n_ldst):
+            dc = dm[fu]
+            # accumulate load-hold-store / store-hold-load bits (horizontal)
+            lhs_l.append(dc.ld_hold_st_o)
+            shl_l.append(dc.st_hold_ld_o)
+            # accumulate inputs (for Cat'ing later) - TODO: must be a better way
+            issue_l.append(dc.issue_i)
+            go_die_l.append(dc.go_die_i)
+            load_h_l.append(dc.load_h_i)
+            stor_h_l.append(dc.stor_h_i)
+
+            # load-hit and store-with-data-hit go in vertically (top)
+            m.d.comb += [dc.load_hit_i.eq(self.load_hit_i),
+                         dc.stwd_hit_i.eq(self.stwd_hit_i),
+                         dc.load_v_i.eq(self.ld_pend_i),
+                         dc.stor_v_i.eq(self.st_pend_i),
+                        ]
+
+        # connect cell inputs using Cat(*list_of_stuff)
+        m.d.comb += [Cat(*issue_l).eq(self.issue_i),
+                     Cat(*go_die_l).eq(self.go_die_i),
+                    ]
+        # connect the load-hold-store / store-hold-load OR-accumulated outputs
+        m.d.comb += self.ld_hold_st_o.eq(Cat(*lhs_l))
+        m.d.comb += self.st_hold_ld_o.eq(Cat(*shl_l))
+
+        # the load/store pending vectors also go in horizontally (left):
+        # row N takes bit N as its single-bit load_h_i / stor_h_i.  the
+        # per-row inputs were gathered into load_h_l / stor_h_l in the loop
+        # above, so a single Cat per vector connects every row at once
+        # (all rows also see the full vectors vertically, via load_v_i
+        # and stor_v_i)
+        m.d.comb += [Cat(*load_h_l).eq(self.ld_pend_i),
+                     Cat(*stor_h_l).eq(self.st_pend_i),
+                    ]
+
+        return m
+
+    def __iter__(self):
+        yield self.ld_pend_i
+        yield self.st_pend_i
+        yield self.issue_i
+        yield self.go_die_i
+        yield self.load_hit_i
+        yield self.stwd_hit_i
+        yield self.ld_hold_st_o
+        yield self.st_hold_ld_o
+
+    def ports(self):
+        return list(self)
+
+def d_matrix_sim(dut):
+    """ XXX TODO: placeholder stimulus, not yet adapted to LDSTDepMatrix
+    """
+    yield dut.dest_i.eq(1)  # NOTE(review): LDSTDepMatrix has issue_i but no
+    yield dut.issue_i.eq(1) # dest_i / src1_i / go_rd_i / go_wr_i, so this
+    yield                   # cannot run against it as written
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.src1_i.eq(1)
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1)
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1)
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+def test_d_matrix():
+    dut = LDSTDepMatrix(n_ldst=4)  # 4 rows, each row n_ldst wide
+    vl = rtlil.convert(dut, ports=dut.ports())  # RTLIL text of the matrix
+    with open("test_ld_st_matrix.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, d_matrix_sim(dut), vcd_name='test_ld_st_matrix.vcd')
+
+if __name__ == '__main__':
+    test_d_matrix()
diff --git a/src/soc/scoreboard/mdm.py b/src/soc/scoreboard/mdm.py
new file mode 100644
index 00000000..184931ef
--- /dev/null
+++ b/src/soc/scoreboard/mdm.py
@@ -0,0 +1,22 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module
+
+from scoreboard.fu_reg_matrix import FURegDepMatrix
+from scoreboard.addr_match import PartialAddrMatch
+
+class FUMemMatchMatrix(FURegDepMatrix, PartialAddrMatch):
+    """ implement a FU-Regs overload with memory-address matching
+    """
+    def __init__(self, n_fu, addrbitwid):
+        PartialAddrMatch.__init__(self, n_fu, addrbitwid)  # must come first:
+        FURegDepMatrix.__init__(self, n_fu, n_fu, 1, self.addr_nomatch_o)  # uses it
+
+    def elaborate(self, platform):
+        m = Module()  # both bases add their logic into this one module
+        PartialAddrMatch._elaborate(self, m, platform)
+        FURegDepMatrix._elaborate(self, m, platform)
+
+        return m
+
+
diff --git a/src/soc/scoreboard/mem_dependence_cell.py b/src/soc/scoreboard/mem_dependence_cell.py
new file mode 100644
index 00000000..2958d864
--- /dev/null
+++ b/src/soc/scoreboard/mem_dependence_cell.py
@@ -0,0 +1,120 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
+from nmutil.latch import SRLatch
+
+
+class MemDepRow(Elaboratable):
+    """ implements 1st phase Memory Dependency cell (one row, n_reg wide)
+    """
+    def __init__(self, n_reg):
+        self.n_reg = n_reg
+        # inputs
+        self.ld_i = Signal(n_reg, reset_less=True)     # LD in (top)
+        self.st_i = Signal(n_reg, reset_less=True)     # ST in (top)
+        self.issue_i = Signal(reset_less=True)    # Issue in (top)
+
+        self.st_pend_i = Signal(n_reg, reset_less=True) # ST pend in (top)
+        self.ld_pend_i = Signal(n_reg, reset_less=True) # LD pend in (top)
+        self.v_st_rsel_o = Signal(n_reg, reset_less=True) # ST pend out (bot)
+        self.v_ld_rsel_o = Signal(n_reg, reset_less=True) # LD pend out (bot)
+
+        self.go_ld_i = Signal(reset_less=True) # Go LD in (left)
+        self.go_st_i = Signal(reset_less=True)  # Go ST in (left)
+        self.go_die_i = Signal(reset_less=True) # Go Die in (left)
+
+        # for Register File Select Lines (vertical)
+        self.ld_rsel_o = Signal(n_reg, reset_less=True)  # LD sel (bot)
+        self.st_rsel_o = Signal(n_reg, reset_less=True)  # ST sel (bot)
+
+        # for Function Unit "forward progress" (horizontal)
+        self.ld_fwd_o = Signal(n_reg, reset_less=True)   # LD FU fw (right)
+        self.st_fwd_o = Signal(n_reg, reset_less=True)   # ST FU fw (right)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.ld_c = ld_c = SRLatch(sync=False, llen=self.n_reg)
+        m.submodules.st_c = st_c = SRLatch(sync=False, llen=self.n_reg)
+
+        # latch resets: go_ld / go_st, or go_die, clear the whole row
+        ld_die = Signal(reset_less=True)
+        st_die = Signal(reset_less=True)
+        m.d.comb += ld_die.eq(self.go_ld_i | self.go_die_i)
+        m.d.comb += st_die.eq(self.go_st_i | self.go_die_i)
+        m.d.comb += ld_c.r.eq(Repl(ld_die, self.n_reg))
+        m.d.comb += st_c.r.eq(Repl(st_die, self.n_reg))
+
+        # latch sets: the ld_i / st_i bits (unary), qualified by issue
+        i_ext = Repl(self.issue_i, self.n_reg)
+        m.d.comb += ld_c.s.eq(i_ext & self.ld_i)
+        m.d.comb += st_c.s.eq(i_ext & self.st_i)
+
+        # hazard checks: latched LD against pending ST, and vice-versa
+        m.d.comb += self.ld_fwd_o.eq(ld_c.q & self.st_pend_i)
+        m.d.comb += self.st_fwd_o.eq(st_c.q & self.ld_pend_i)
+
+        # connect reg-sel outputs
+        st_ext = Repl(self.go_st_i, self.n_reg)
+        ld_ext = Repl(self.go_ld_i, self.n_reg)
+        m.d.comb += self.ld_rsel_o.eq(ld_c.qlq & ld_ext)  # only during go_ld
+        m.d.comb += self.st_rsel_o.eq(st_c.qlq & st_ext)  # only during go_st
+
+        # to be accumulated to indicate if register is in use (globally)
+        # after ORing, is fed back in to st_pend_i / ld_pend_i
+        m.d.comb += self.v_st_rsel_o.eq(st_c.qlq)
+        m.d.comb += self.v_ld_rsel_o.eq(ld_c.qlq)
+
+        return m
+
+    def __iter__(self):
+        yield self.ld_i  # inputs
+        yield self.st_i
+        yield self.st_pend_i
+        yield self.ld_pend_i
+        yield self.issue_i
+        yield self.go_ld_i
+        yield self.go_st_i
+        yield self.go_die_i
+        yield self.v_ld_rsel_o  # outputs
+        yield self.v_st_rsel_o
+        yield self.ld_rsel_o
+        yield self.st_rsel_o
+        yield self.ld_fwd_o
+        yield self.st_fwd_o
+
+    def ports(self):
+        return list(self)
+
+
+def dcell_sim(dut):
+    yield dut.ld_i.eq(1)     # set bit 0 of ld_i, then pulse issue_i
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.st_i.eq(1)     # set bit 0 of st_i as well (ld_i is still 1)
+    yield dut.issue_i.eq(1)
+    yield                    # issue_i is held high across three bare yields
+    yield
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_st_i.eq(1)  # pulse go_st_i (drives the st_c latch reset)
+    yield
+    yield dut.go_st_i.eq(0)
+    yield
+    yield dut.go_ld_i.eq(1)  # pulse go_ld_i (drives the ld_c latch reset)
+    yield
+    yield dut.go_ld_i.eq(0)
+    yield
+
+def test_dcell():
+    dut = MemDepRow(4)  # one row, n_reg=4
+    vl = rtlil.convert(dut, ports=dut.ports())  # RTLIL text of the row
+    with open("test_mem_drow.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, dcell_sim(dut), vcd_name='test_mem_dcell.vcd')
+
+if __name__ == '__main__':
+    test_dcell()
diff --git a/src/soc/scoreboard/mem_fu_matrix.py b/src/soc/scoreboard/mem_fu_matrix.py
new file mode 100644
index 00000000..98595996
--- /dev/null
+++ b/src/soc/scoreboard/mem_fu_matrix.py
@@ -0,0 +1,218 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Elaboratable, Array, Cat
+
+from scoreboard.mem_dependence_cell import MemDepRow
+from scoreboard.mem_fu_pending import MemFU_Pend
+from scoreboard.mem_select import Mem_Rsv
+from scoreboard.global_pending import GlobalPending
+
+"""
+
+"""
+
+class MemFUDepMatrix(Elaboratable):
+    """ implements 1st phase Memory-to-FU Dependency Matrix
+    """
+    def __init__(self, n_fu_row, n_reg_col):
+        self.n_fu_row = n_fu_row                  # Y (FUs)   ^v
+        self.n_reg_col = n_reg_col                # X (Regs)  <>
+        self.ld_i = Signal(n_reg_col, reset_less=True)     # LD in (top)
+        self.st_i = Signal(n_reg_col, reset_less=True)     # ST in (top)
+
+        # Register "Global" vectors for determining RaW and WaR hazards
+        self.ld_pend_i = Signal(n_reg_col, reset_less=True) # ld pending (top)
+        self.st_pend_i = Signal(n_reg_col, reset_less=True) # st pending (top)
+        self.v_ld_rsel_o = Signal(n_reg_col, reset_less=True) # ld pending (bot)
+        self.v_st_rsel_o = Signal(n_reg_col, reset_less=True) # st pending (bot)
+
+        self.issue_i = Signal(n_fu_row, reset_less=True)  # Issue in (top)
+        self.go_ld_i = Signal(n_fu_row, reset_less=True)  # Go LOAD in (left)
+        self.go_st_i = Signal(n_fu_row, reset_less=True)  # Go STOR in (left)
+        self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left)
+
+        # for Register File Select Lines (horizontal), per-reg
+        self.ld_rsel_o = Signal(n_reg_col, reset_less=True) # ld reg (bot)
+        self.st_rsel_o = Signal(n_reg_col, reset_less=True) # st reg (bot)
+
+        # for Function Unit "forward progress" (vertical), per-FU
+        self.ld_pend_o = Signal(n_fu_row, reset_less=True) # ld pending (right)
+        self.st_pend_o = Signal(n_fu_row, reset_less=True) # st pending (right)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # ---
+        # matrix of dependency cells
+        # ---
+        dm = Array(MemDepRow(self.n_reg_col) for r in range(self.n_fu_row))
+        for fu in range(self.n_fu_row):
+            setattr(m.submodules, "dr_fu%d" % fu, dm[fu])
+
+        # ---
+        # array of Function Unit Pending vectors
+        # ---
+        fupend = Array(MemFU_Pend(self.n_reg_col) for f in range(self.n_fu_row))
+        for fu in range(self.n_fu_row):
+            setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu])
+
+        # ---
+        # array of Register Reservation vectors
+        # ---
+        regrsv = Array(Mem_Rsv(self.n_fu_row) for r in range(self.n_reg_col))
+        for rn in range(self.n_reg_col):
+            setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn])
+
+        # ---
+        # connect Function Unit vector
+        # ---
+        ld_pend = []
+        st_pend = []
+        for fu in range(self.n_fu_row):
+            dc = dm[fu]
+            fup = fupend[fu]
+            ld_fwd_o = []
+            st_fwd_o = []
+            for rn in range(self.n_reg_col):
+                # accumulate cell fwd outputs for ld/st
+                ld_fwd_o.append(dc.ld_fwd_o[rn])
+                st_fwd_o.append(dc.st_fwd_o[rn])
+            # connect cell fwd outputs to FU Vector in [Cat is gooood]
+            m.d.comb += [fup.ld_fwd_i.eq(Cat(*ld_fwd_o)),
+                         fup.st_fwd_i.eq(Cat(*st_fwd_o)),
+                        ]
+            # accumulate FU Vector outputs
+            ld_pend.append(fup.reg_ld_pend_o)
+            st_pend.append(fup.reg_st_pend_o)
+
+        # ... and output them from this module (vertical, width=FUs)
+        m.d.comb += self.ld_pend_o.eq(Cat(*ld_pend))
+        m.d.comb += self.st_pend_o.eq(Cat(*st_pend))
+
+        # ---
+        # connect Reg Selection vector
+        # ---
+        ld_rsel = []
+        st_rsel = []
+        for rn in range(self.n_reg_col):
+            rsv = regrsv[rn]
+            ld_rsel_o = []
+            st_rsel_o = []
+            for fu in range(self.n_fu_row):
+                dc = dm[fu]
+                # accumulate cell reg-select outputs ld/st
+                ld_rsel_o.append(dc.ld_rsel_o[rn])
+                st_rsel_o.append(dc.st_rsel_o[rn])
+            # connect cell reg-select outputs to Reg Vector In
+            m.d.comb += [rsv.ld_rsel_i.eq(Cat(*ld_rsel_o)),
+                         rsv.st_rsel_i.eq(Cat(*st_rsel_o)),
+                        ]
+            # accumulate Reg-Sel Vector outputs
+            ld_rsel.append(rsv.ld_rsel_o)
+            st_rsel.append(rsv.st_rsel_o)
+
+        # ... and output them from this module (horizontal, width=REGs)
+        m.d.comb += self.ld_rsel_o.eq(Cat(*ld_rsel))
+        m.d.comb += self.st_rsel_o.eq(Cat(*st_rsel))
+
+        # ---
+        # connect Dependency Matrix ld/st/pending inputs to module inputs
+        # ---
+        for fu in range(self.n_fu_row):
+            dc = dm[fu]
+            # every row receives the same full-width module inputs
+            m.d.comb += [dc.ld_i.eq(self.ld_i),
+                         dc.st_i.eq(self.st_i),
+                         dc.st_pend_i.eq(self.st_pend_i),
+                         dc.ld_pend_i.eq(self.ld_pend_i),
+                        ]
+
+        # accumulate rsel bits into ld/st pending vectors.
+        st_pend_v = []
+        ld_pend_v = []
+        for fu in range(self.n_fu_row):
+            dc = dm[fu]
+            st_pend_v.append(dc.v_st_rsel_o)
+            ld_pend_v.append(dc.v_ld_rsel_o)
+        st_v = GlobalPending(self.n_reg_col, st_pend_v)
+        ld_v = GlobalPending(self.n_reg_col, ld_pend_v)
+        m.submodules.st_v = st_v
+        m.submodules.ld_v = ld_v
+
+        m.d.comb += self.v_st_rsel_o.eq(st_v.g_pend_o)
+        m.d.comb += self.v_ld_rsel_o.eq(ld_v.g_pend_o)
+
+        # ---
+        # connect Dep issue_i/go_st_i/go_ld_i/go_die_i to the module inputs
+        # ---
+        go_st_i = []
+        go_ld_i = []
+        go_die_i = []
+        issue_i = []
+        for fu in range(self.n_fu_row):
+            dc = dm[fu]
+            # accumulate the single-bit per-row control inputs
+            go_st_i.append(dc.go_st_i)
+            go_ld_i.append(dc.go_ld_i)
+            go_die_i.append(dc.go_die_i)
+            issue_i.append(dc.issue_i)
+        # wire up inputs from module to row cell inputs (Cat is gooood)
+        m.d.comb += [Cat(*go_st_i).eq(self.go_st_i),
+                     Cat(*go_ld_i).eq(self.go_ld_i),
+                     Cat(*go_die_i).eq(self.go_die_i),
+                     Cat(*issue_i).eq(self.issue_i),
+                    ]
+
+        return m
+
+    def __iter__(self):
+        yield self.ld_i
+        yield self.st_i
+        yield self.issue_i
+        yield self.go_ld_i
+        yield self.go_st_i
+        yield self.go_die_i
+        yield self.ld_rsel_o
+        yield self.st_rsel_o
+        yield self.ld_pend_o
+        yield self.st_pend_o
+        yield self.ld_pend_i
+        yield self.st_pend_i
+        yield self.v_ld_rsel_o  # (ld_rsel_o / st_rsel_o were repeated here,
+        yield self.v_st_rsel_o  # leaving the v_*_rsel_o outputs unexported)
+
+    def ports(self):
+        return list(self)
+
+def d_matrix_sim(dut):
+    """ XXX TODO: stimulus only, nothing is checked
+    """
+    yield dut.ld_i.eq(1)     # set bit 0 of ld_i, then pulse issue_i (bit 0)
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.st_i.eq(1)     # set bit 0 of st_i as well (ld_i is still 1)
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_st_i.eq(1)  # pulse go_st_i bit 0
+    yield
+    yield dut.go_st_i.eq(0)
+    yield
+    yield dut.go_ld_i.eq(1)  # pulse go_ld_i bit 0
+    yield
+    yield dut.go_ld_i.eq(0)
+    yield
+
+def test_d_matrix():
+    dut = MemFUDepMatrix(n_fu_row=3, n_reg_col=3)  # 3 FU rows x 3 columns
+    vl = rtlil.convert(dut, ports=dut.ports())  # RTLIL text of the matrix
+    with open("test_fu_mem_matrix.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, d_matrix_sim(dut), vcd_name='test_fu_mem_matrix.vcd')
+
+if __name__ == '__main__':
+    test_d_matrix()
diff --git a/src/soc/scoreboard/mem_fu_pending.py b/src/soc/scoreboard/mem_fu_pending.py
new file mode 100644
index 00000000..951f7ac1
--- /dev/null
+++ b/src/soc/scoreboard/mem_fu_pending.py
@@ -0,0 +1,22 @@
+from nmigen import Elaboratable, Module, Signal, Cat
+
+
+class MemFU_Pend(Elaboratable):
+    """ these are allocated per-FU (horizontally), and are of length
+        reg_count.  each output is set if any bit of its input vector is set
+    """
+    def __init__(self, reg_count):
+        self.reg_count = reg_count
+        self.ld_fwd_i = Signal(reg_count, reset_less=True)
+        self.st_fwd_i = Signal(reg_count, reset_less=True)
+
+        self.reg_ld_pend_o = Signal(reset_less=True) # single bit
+        self.reg_st_pend_o = Signal(reset_less=True) # single bit
+
+    def elaborate(self, platform):
+        m = Module()
+        m.d.comb += self.reg_ld_pend_o.eq(self.ld_fwd_i.bool()) # any LD bit
+        m.d.comb += self.reg_st_pend_o.eq(self.st_fwd_i.bool()) # any ST bit
+
+        return m
+
diff --git a/src/soc/scoreboard/mem_select.py b/src/soc/scoreboard/mem_select.py
new file mode 100644
index 00000000..627d7d10
--- /dev/null
+++ b/src/soc/scoreboard/mem_select.py
@@ -0,0 +1,20 @@
+from nmigen import Elaboratable, Module, Signal
+
+
+class Mem_Rsv(Elaboratable):
+    """ these are allocated per-Register (vertically), and are each of length
+        fu_count.  each output is set if any bit of its input vector is set
+    """
+    def __init__(self, fu_count):
+        self.fu_count = fu_count
+        self.ld_rsel_i = Signal(fu_count, reset_less=True)
+        self.st_rsel_i = Signal(fu_count, reset_less=True)
+        self.ld_rsel_o = Signal(reset_less=True) # single bit
+        self.st_rsel_o = Signal(reset_less=True) # single bit
+
+    def elaborate(self, platform):
+        m = Module()
+        m.d.comb += self.ld_rsel_o.eq(self.ld_rsel_i.bool()) # any LD bit
+        m.d.comb += self.st_rsel_o.eq(self.st_rsel_i.bool()) # any ST bit
+        return m
+
diff --git a/src/soc/scoreboard/memfu.py b/src/soc/scoreboard/memfu.py
new file mode 100644
index 00000000..857d96c9
--- /dev/null
+++ b/src/soc/scoreboard/memfu.py
@@ -0,0 +1,120 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Array, Elaboratable
+
+from scoreboard.fu_fu_matrix import FUFUDepMatrix
+from scoreboard.mdm import FUMemMatchMatrix
+
+
+class MemFunctionUnits(Elaboratable):
+    """ wires a FUFUDepMatrix and a FUMemMatchMatrix for n_ldsts LD/ST units """
+    def __init__(self, n_ldsts, addrbitwid):
+        self.n_ldsts = n_ldsts
+        self.bitwid = addrbitwid
+
+        self.st_i = Signal(n_ldsts, reset_less=True) # wired to dest_i
+        self.ld_i = Signal(n_ldsts, reset_less=True) # wired to src_i[0]
+
+        self.g_int_ld_pend_o = Signal(n_ldsts, reset_less=True)
+        self.g_int_st_pend_o = Signal(n_ldsts, reset_less=True)
+
+        self.st_rsel_o = Signal(n_ldsts, reset_less=True) # from dest_rsel_o
+        self.ld_rsel_o = Signal(n_ldsts, reset_less=True) # from src_rsel_o[0]
+
+        self.loadable_o = Signal(n_ldsts, reset_less=True)
+        self.storable_o = Signal(n_ldsts, reset_less=True)
+        self.addr_nomatch_o = Signal(n_ldsts, reset_less=True)
+
+        self.go_ld_i = Signal(n_ldsts, reset_less=True)
+        self.go_st_i = Signal(n_ldsts, reset_less=True)
+        self.go_die_i = Signal(n_ldsts, reset_less=True)
+        self.fn_issue_i = Signal(n_ldsts, reset_less=True)
+
+        # address matching
+        self.addrs_i = Array(Signal(self.bitwid, name="addrs_i%d" % i) \
+                             for i in range(n_ldsts))
+        self.addr_we_i = Signal(n_ldsts) # write-enable for incoming address
+        self.addr_en_i = Signal(n_ldsts) # address latched in
+        self.addr_rs_i = Signal(n_ldsts) # address deactivated
+
+        # Note: st_pend_o is only created in elaborate(), for use in WaWGrid
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        n_fus = self.n_ldsts
+
+        # FU-FU Dep Matrix
+        intfudeps = FUFUDepMatrix(n_fus, n_fus)
+        m.submodules.intfudeps = intfudeps
+        # FU-Memory Match Matrix (the variable is still named intregdeps)
+        intregdeps = FUMemMatchMatrix(n_fus, self.bitwid)
+        m.submodules.intregdeps = intregdeps
+
+        # ok, because we do not know in advance what the AGEN (address gen)
+        # is, we have to make a transitive dependency set.  i.e. the LD
+        # (or ST) being requested now must depend on ALL prior LDs *AND* STs.
+        # these get dropped very rapidly once AGEN is carried out.
+        # XXX TODO
+
+        # connect fureg matrix as a mem system
+        comb += self.g_int_ld_pend_o.eq(intregdeps.v_rd_rsel_o)
+        comb += self.g_int_st_pend_o.eq(intregdeps.v_wr_rsel_o)
+
+        comb += intregdeps.rd_pend_i.eq(intregdeps.v_rd_rsel_o)
+        comb += intregdeps.wr_pend_i.eq(intregdeps.v_wr_rsel_o)
+
+        comb += intfudeps.rd_pend_i.eq(intregdeps.rd_pend_o)
+        comb += intfudeps.wr_pend_i.eq(intregdeps.wr_pend_o)
+        self.st_pend_o = intregdeps.wr_pend_o # also output for use in WaWGrid
+
+        comb += intfudeps.issue_i.eq(self.fn_issue_i)
+        comb += intfudeps.go_rd_i.eq(self.go_ld_i)
+        comb += intfudeps.go_wr_i.eq(self.go_st_i)
+        comb += intfudeps.go_die_i.eq(self.go_die_i)
+        comb += self.loadable_o.eq(intfudeps.readable_o)
+        comb += self.storable_o.eq(intfudeps.writable_o)
+        comb += self.addr_nomatch_o.eq(intregdeps.addr_nomatch_o)
+
+        # Connect function issue / arrays, and dest/src1/src2
+        comb += intregdeps.dest_i.eq(self.st_i)
+        comb += intregdeps.src_i[0].eq(self.ld_i)
+
+        comb += intregdeps.go_rd_i.eq(self.go_ld_i)
+        comb += intregdeps.go_wr_i.eq(self.go_st_i)
+        comb += intregdeps.go_die_i.eq(self.go_die_i)
+        comb += intregdeps.issue_i.eq(self.fn_issue_i)
+
+        comb += self.st_rsel_o.eq(intregdeps.dest_rsel_o)
+        comb += self.ld_rsel_o.eq(intregdeps.src_rsel_o[0])
+
+        # connect address matching: these get connected to the Addr CUs
+        for i in range(self.n_ldsts):
+            comb += intregdeps.addrs_i[i].eq(self.addrs_i[i])
+        comb += intregdeps.addr_we_i.eq(self.addr_we_i)
+        comb += intregdeps.addr_en_i.eq(self.addr_en_i)
+        comb += intregdeps.addr_rs_i.eq(self.addr_rs_i)
+
+        return m
+
+    def __iter__(self):
+        yield self.ld_i
+        yield self.st_i
+        yield self.g_int_st_pend_o
+        yield self.g_int_ld_pend_o
+        yield self.ld_rsel_o
+        yield self.st_rsel_o
+        yield self.loadable_o
+        yield self.storable_o
+        yield self.addr_nomatch_o # was driven in elaborate but never exported
+        yield self.go_st_i
+        yield self.go_ld_i
+        yield self.go_die_i
+        yield self.fn_issue_i
+        yield from self.addrs_i
+        yield self.addr_we_i
+        yield self.addr_en_i
+        yield self.addr_rs_i # was read in elaborate but never exported
+
+    def ports(self):
+        return list(self)
diff --git a/src/soc/scoreboard/reg_select.py b/src/soc/scoreboard/reg_select.py
new file mode 100644
index 00000000..3919cce3
--- /dev/null
+++ b/src/soc/scoreboard/reg_select.py
@@ -0,0 +1,24 @@
+from nmigen import Elaboratable, Module, Signal, Array
+
+
+class Reg_Rsv(Elaboratable):
+    """ these are allocated per-Register (vertically), and are each of length
+        fu_count.  there is one dest vector and n_src source vectors
+    """
+    def __init__(self, fu_count, n_src):
+        self.n_src = n_src
+        self.fu_count = fu_count
+        self.dest_rsel_i = Signal(fu_count, reset_less=True)
+        self.src_rsel_i = Array(Signal(fu_count, name="src_rsel_i",
+                                       reset_less=True) \
+                                for i in range(n_src))
+        self.dest_rsel_o = Signal(reset_less=True) # single bit
+        self.src_rsel_o = Signal(n_src, reset_less=True) # one bit per source
+
+    def elaborate(self, platform):
+        m = Module()
+        m.d.comb += self.dest_rsel_o.eq(self.dest_rsel_i.bool())
+        for i in range(self.n_src): # bit i: any bit of src_rsel_i[i] set
+            m.d.comb += self.src_rsel_o[i].eq(self.src_rsel_i[i].bool())
+        return m
+
diff --git a/src/soc/scoreboard/shadow.py b/src/soc/scoreboard/shadow.py
new file mode 100644
index 00000000..12f20893
--- /dev/null
+++ b/src/soc/scoreboard/shadow.py
@@ -0,0 +1,226 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Cat, Array, Const, Elaboratable, Repl
+from nmigen.lib.coding import Decoder
+
+from scoreboard.shadow_fn import ShadowFn
+
+
+class ShadowMatrix(Elaboratable):
+    """ Matrix of Shadow Functions.  One per FU.
+
+        Inputs
+        * :n_fus:       number of Function Units (one ShadowFn is made per FU)
+        * :shadow_wid:  number of shadow/fail/good/go_die sets
+
+        Notes:
+
+        * Shadow enable/fail/good are all connected to all Shadow Functions
+          (incoming at the top)
+
+        * Output is an array of "shadow active" (schroedinger wires: neither
+          alive nor dead) and an array of "go die" signals, one per FU.
+
+        * the shadown must be connected to the Computation Unit's
+          write release request, preventing it (ANDing) from firing
+          (and thus preventing Writable.  this by the way being the
+           whole point of having the Shadow Matrix...)
+
+        * go_die_o must be connected to *both* the Computation Unit's
+          src-operand and result-operand latch resets, causing both
+          of them to reset.
+
+        * go_die_o also needs to be wired into the Dependency and Function
+          Unit Matrices by way of over-enabling (ORing) into Go_Read and
+          Go_Write, resetting every cell that is required to "die"
+    """
+    def __init__(self, n_fus, shadow_wid=0, syncreset=False):
+        self.syncreset = syncreset
+        self.n_fus = n_fus
+        self.shadow_wid = shadow_wid
+
+        # inputs
+        self.issue_i = Signal(n_fus, reset_less=True)
+        self.reset_i = Signal(n_fus, reset_less=True)
+        self.shadow_i = Array(Signal(shadow_wid, name="sh_i", reset_less=True) \
+                            for f in range(n_fus))
+        self.s_fail_i = Array(Signal(shadow_wid, name="fl_i", reset_less=True) \
+                            for f in range(n_fus))
+        self.s_good_i = Array(Signal(shadow_wid, name="gd_i", reset_less=True) \
+                            for f in range(n_fus))
+        # outputs
+        self.go_die_o = Signal(n_fus, reset_less=True)
+        self.shadown_o = Signal(n_fus, reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+        shadows = []
+        for i in range(self.n_fus):
+            sh = ShadowFn(self.shadow_wid, self.syncreset)
+            setattr(m.submodules, "sh%d" % i, sh)
+            shadows.append(sh)
+            # connect FU i's fail/good vectors to its own ShadowFn
+            m.d.comb += sh.s_fail_i.eq(self.s_fail_i[i])
+            m.d.comb += sh.s_good_i.eq(self.s_good_i[i])
+            # this one is the matrix (shadow enables)
+            m.d.comb += sh.shadow_i.eq(self.shadow_i[i])
+
+        # connect all shadow outputs and issue input
+        issue_l = []
+        reset_l = []
+        sho_l = []
+        rec_l = []
+        for l in shadows:
+            issue_l.append(l.issue_i)
+            reset_l.append(l.reset_i)
+            sho_l.append(l.shadown_o)
+            rec_l.append(l.go_die_o)
+        m.d.comb += Cat(*issue_l).eq(self.issue_i) # bit i goes to ShadowFn i
+        m.d.comb += Cat(*reset_l).eq(self.reset_i) # bit i goes to ShadowFn i
+        m.d.comb += self.shadown_o.eq(Cat(*sho_l)) # bit i is from ShadowFn i
+        m.d.comb += self.go_die_o.eq(Cat(*rec_l)) # bit i is from ShadowFn i
+
+        return m
+
+    def __iter__(self):
+        yield self.issue_i
+        yield self.reset_i
+        yield from self.shadow_i
+        yield from self.s_fail_i
+        yield from self.s_good_i
+        yield self.go_die_o
+        yield self.shadown_o
+
+    def ports(self):
+        return list(self) # every signal yielded by __iter__, in that order
+
+
+class BranchSpeculationRecord(Elaboratable):
+    """ A record of which function units will be cancelled and which
+        allowed to proceed, on a branch.
+
+        Whilst the input is a pair that says whether the instruction is
+        under the "success" branch shadow (good_i) or the "fail" shadow
+        (fail_i path), when the branch result is known, the "good" path
+        must be cancelled if "fail" occurred, and the "fail" path cancelled
+        if "good" occurred.
+
+        therefore, use "good|~fail" and "fail|~good" respectively as
+        output.
+    """
+
+    def __init__(self, n_fus):
+        self.n_fus = n_fus
+
+        # inputs: record *expected* status
+        self.active_i = Signal(reset_less=True)
+        self.good_i = Signal(n_fus, reset_less=True)
+        self.fail_i = Signal(n_fus, reset_less=True)
+
+        # inputs: status of branch (when result was known)
+        self.br_i = Signal(reset_less=True)
+        self.br_ok_i = Signal(reset_less=True)
+
+        # outputs: true if the *expected* outcome matched the *actual* outcome
+        self.match_f_o = Signal(n_fus, reset_less=True)
+        self.match_g_o = Signal(n_fus, reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # registers to record *expected* status
+        good_r = Signal(self.n_fus)
+        fail_r = Signal(self.n_fus)
+
+        for i in range(self.n_fus):
+            with m.If(self.active_i):
+                m.d.sync += good_r[i].eq(good_r[i] | self.good_i[i])
+                m.d.sync += fail_r[i].eq(fail_r[i] | self.fail_i[i])
+            with m.If(self.br_i):
+                with m.If(good_r[i]):
+                    # we expected good, return OK that good was EXPECTED
+                    m.d.comb += self.match_g_o[i].eq(self.br_ok_i)
+                    m.d.comb += self.match_f_o[i].eq(~self.br_ok_i)
+                with m.If(fail_r[i]):
+                    # we expected fail, return OK that fail was EXPECTED
+                    m.d.comb += self.match_g_o[i].eq(~self.br_ok_i)
+                    m.d.comb += self.match_f_o[i].eq(self.br_ok_i)
+                m.d.sync += good_r[i].eq(0) # might be set if issue set as well
+                m.d.sync += fail_r[i].eq(0) # might be set if issue set as well
+
+        return m
+
+    def __iter__(self):
+        # only attributes created in __init__ may be yielded: the previous
+        # br_good_i / br_fail_i / good_o / fail_o names do not exist
+        yield self.active_i
+        yield self.good_i
+        yield self.fail_i
+        yield self.br_i
+        yield self.br_ok_i
+        yield self.match_f_o
+        yield self.match_g_o
+    def ports(self):
+        return list(self)
+
+
+
+class WaWGrid(Elaboratable):
+    """ An NxM grid-selector which raises a 2D bit selected by N and M:
+        waw_o[i] is shadow_i when fu_i[i] is set, otherwise all zeros """
+
+    def __init__(self, n_fus, shadow_wid):
+        self.n_fus = n_fus
+        self.shadow_wid = shadow_wid
+
+        self.shadow_i = Signal(shadow_wid, reset_less=True)
+        self.fu_i = Signal(n_fus, reset_less=True)
+
+        self.waw_o = Array(Signal(shadow_wid, name="waw_o", reset_less=True) \
+                            for f in range(n_fus))
+
+    def elaborate(self, platform):
+        m = Module()
+        for i in range(self.n_fus):
+            v = Repl(self.fu_i[i], self.shadow_wid) # fu_i[i] on every bit
+            m.d.comb += self.waw_o[i].eq(v & self.shadow_i) # gate shadow_i
+        return m
+
+
+def shadow_sim(dut):
+    yield dut.dest_i.eq(1) # NOTE(review): dest_i, src1_i, go_rd_i and
+    yield dut.issue_i.eq(1) # go_wr_i are not created by any class in
+    yield # this file, so this stimulus looks copy-pasted (TODO: confirm)
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.src1_i.eq(1)
+    yield dut.issue_i.eq(1)
+    yield
+    yield
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1)
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1)
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+def test_shadow():
+    dut = ShadowMatrix(4, 2) # n_fus=4, shadow_wid=2
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_shadow.il", "w") as f:
+        f.write(vl)
+
+    dut = BranchSpeculationRecord(4) # n_fus=4
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_branchspecrecord.il", "w") as f:
+        f.write(vl)
+    # NOTE: dut was rebound above, so this simulates BranchSpeculationRecord
+    run_simulation(dut, shadow_sim(dut), vcd_name='test_shadow.vcd')
+
+if __name__ == '__main__':
+    test_shadow()
diff --git a/src/soc/scoreboard/shadow_fn.py b/src/soc/scoreboard/shadow_fn.py
new file mode 100644
index 00000000..69a56a5c
--- /dev/null
+++ b/src/soc/scoreboard/shadow_fn.py
@@ -0,0 +1,111 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Cat, Repl, Const, Elaboratable
+from nmutil.latch import SRLatch
+
+
+class ShadowFn(Elaboratable):
+    """ implements shadowing 11.5.1, p55, just the individual shadow function
+
+        shadowing can be used for branches as well as exceptions (interrupts),
+        load/store hold (exceptions again), and vector-element predication
+        (once the predicate is known, which it may not be at instruction issue)
+
+        Inputs
+        * :slen:  number of shadow/fail/good/go_die sets
+        notes:
+        * when slen = 0, go_die_o and shadown_o are Consts (i.e. do nothing)
+          and no input Signals are created at all
+    """
+    def __init__(self, slen, syncreset=False):
+        self.slen = slen
+        self.syncreset = syncreset
+
+        if self.slen:
+            # inputs
+            self.issue_i = Signal(reset_less=True)
+            self.shadow_i  = Signal(slen, reset_less=True)
+            self.reset_i  = Signal(reset_less=True)
+            self.s_fail_i  = Signal(slen, reset_less=True)
+            self.s_good_i  = Signal(slen, reset_less=True)
+
+            # outputs
+            self.shadown_o = Signal(reset_less=True)
+            self.go_die_o = Signal(reset_less=True)
+        else:
+            # outputs when no shadowing needed
+            self.shadown_o = Const(1)
+            self.go_die_o = Const(0)
+
+    def elaborate(self, platform):
+        m = Module()
+        if self.slen == 0:
+            return m # empty Module (returning None cannot be elaborated)
+
+        m.submodules.sl = sl = SRLatch(sync=False, llen=self.slen)
+
+        r_ext = Repl(self.reset_i, self.slen)
+        reset_r = Signal(self.slen)
+        if self.syncreset: # NOTE(review): both branches are identical (comb)
+            m.d.comb += reset_r.eq(self.s_good_i | self.s_fail_i | r_ext)
+        else:
+            m.d.comb += reset_r.eq(self.s_good_i | self.s_fail_i | r_ext)
+
+        i_ext = Repl(self.issue_i, self.slen)
+        m.d.comb += sl.s.eq(self.shadow_i & i_ext & \
+                            ~self.s_good_i & ~reset_r)
+        m.d.comb += sl.r.eq(r_ext | reset_r | self.s_good_i | \
+                            (i_ext & ~self.shadow_i))
+        m.d.comb += self.go_die_o.eq((sl.qlq & self.s_fail_i).bool())
+        m.d.comb += self.shadown_o.eq(~sl.qlq.bool())
+
+        return m
+
+    def __iter__(self):
+        if not self.slen: # no Signals exist: the two outputs are Consts
+            return
+        yield self.issue_i
+        yield self.reset_i
+        yield self.shadow_i
+        yield self.s_fail_i
+        yield self.s_good_i
+        yield self.shadown_o
+        yield self.go_die_o
+    def ports(self):
+        return list(self)
+
+
+def shadow_fn_unit_sim(dut):
+    yield dut.dest_i.eq(1) # NOTE(review): ShadowFn creates issue_i but not
+    yield dut.issue_i.eq(1) # dest_i, src1_i, go_rd_i or go_wr_i, so this
+    yield # stimulus looks copy-pasted from elsewhere (TODO: confirm)
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.src1_i.eq(1)
+    yield dut.issue_i.eq(1)
+    yield
+    yield
+    yield
+    yield dut.issue_i.eq(0)
+    yield
+    yield dut.go_rd_i.eq(1)
+    yield
+    yield dut.go_rd_i.eq(0)
+    yield
+    yield dut.go_wr_i.eq(1)
+    yield
+    yield dut.go_wr_i.eq(0)
+    yield
+
+
+def test_shadow_fn_unit():
+    dut = ShadowFn(4) # slen=4, syncreset left at its default (False)
+    vl = rtlil.convert(dut, ports=dut.ports()) # RTLIL text of the design
+    with open("test_shadow_fn_unit.il", "w") as f:
+        f.write(vl)
+    # then run the stimulus on the same dut, dumping a VCD waveform file
+    run_simulation(dut, shadow_fn_unit_sim(dut),
+                   vcd_name='test_shadow_fn_unit.vcd')
+
+if __name__ == '__main__':
+    test_shadow_fn_unit()
diff --git a/src/soc/scoreboard/test_iq.py b/src/soc/scoreboard/test_iq.py
new file mode 100644
index 00000000..94ceac7e
--- /dev/null
+++ b/src/soc/scoreboard/test_iq.py
@@ -0,0 +1,126 @@
+""" testing of InstructionQ
+"""
+
+from copy import deepcopy
+from random import randint
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+
+from scoreboard.instruction_q import InstructionQ
+from nmutil.nmoperator import eq
+
+
+class IQSim: # simulation processes that feed (send) and drain (rcv) the dut
+    def __init__(self, dut, iq, n_in, n_out):
+        self.dut = dut
+        self.iq = iq # list of instructions to send
+        self.oq = [] # values read back from data_i while sending
+        self.n_in = n_in
+        self.n_out = n_out
+
+    def send(self):
+        i = 0 # index of the next entry of self.iq to send
+        while i < len(self.iq):
+            sendlen = randint(1, self.n_in)
+            sendlen = 1 # overrides the random length chosen just above
+            sendlen = min(len(self.iq) - i, sendlen)
+            print ("sendlen", len(self.iq)-i, sendlen)
+            for idx in range(sendlen):
+                instr = self.iq[i+idx]
+                yield from eq(self.dut.data_i[idx], instr)
+                di = yield self.dut.data_i[idx]#.src1_i
+                print ("senddata %d %x" % ((i+idx), di))
+                self.oq.append(di)
+            yield self.dut.p_add_i.eq(sendlen)
+            yield
+            o_p_ready = yield self.dut.p_ready_o
+            while not o_p_ready: # keep clocking until p_ready_o reads nonzero
+                yield
+                o_p_ready = yield self.dut.p_ready_o
+
+            yield self.dut.p_add_i.eq(0)
+
+            print ("send", len(self.iq), i, sendlen)
+
+            # wait random period of time before queueing another value
+            for j in range(randint(0, 3)):
+                yield
+
+            i += sendlen
+
+        yield self.dut.p_add_i.eq(0)
+        yield
+
+        print ("send ended")
+
+        ## wait random period of time before queueing another value
+        #for i in range(randint(0, 3)):
+        #    yield
+
+        #send_range = randint(0, 3)
+        #if send_range == 0:
+        #    send = True
+        #else:
+        #    send = randint(0, send_range) != 0
+
+    def rcv(self):
+        i = 0 # number of entries received (and checked) so far
+        yield
+        yield
+        yield
+        while i < len(self.iq):
+            rcvlen = randint(1, self.n_out)
+            #print ("outreq", rcvlen)
+            yield self.dut.n_sub_i.eq(rcvlen)
+            n_sub_o = yield self.dut.n_sub_o
+            print ("recv", n_sub_o)
+            for j in range(n_sub_o):
+                r = yield self.dut.data_o[j]#.src1_i
+                print ("recvdata %x %s" % (r, repr(self.iq[i+j])))
+                assert r == self.oq[i+j] # must equal what send() recorded
+            yield
+            if n_sub_o == 0: # nothing came out: retry with n_sub_i still set
+                continue
+            yield self.dut.n_sub_i.eq(0)
+
+            i += n_sub_o
+
+        print ("recv ended")
+
+
+def mk_insns(n_insns, wid, opwid):
+    """ returns a list of n_insns random instructions (dicts of fields) """
+    reg_max, op_max = (1<<wid)-1, (1<<opwid)-1
+    insns = []
+    for _ in range(n_insns):
+        # draw order (src1, opim, src2, dest, oper, imm) is deliberately
+        # kept, so that a seeded run gives the same instructions as before
+        src1, opim = randint(0, reg_max), randint(0, 1)
+        src2, dest = randint(0, reg_max), randint(0, reg_max)
+        oper, imm = randint(0, op_max), randint(0, reg_max)
+        insns.append({'oper_i': oper, 'opim_i': opim, 'imm_i': imm,
+                      'dest_i': dest, 'src1_i': src1, 'src2_i': src2})
+    return insns
+
+
+def test_iq():
+    wid = 8 # also the bit-range of the random operand fields in mk_insns
+    opwid = 4 # also the bit-range of the random 'oper_i' field
+    qlen = 2
+    n_in = 1
+    n_out = 1
+    dut = InstructionQ(wid, opwid, qlen, n_in, n_out)
+    insns = mk_insns(1000, wid, opwid) # 1000 random instructions
+
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_iq.il", "w") as f:
+        f.write(vl)
+
+    test = IQSim(dut, insns, n_in, n_out)
+    print (insns)
+    run_simulation(dut, [test.rcv(), test.send()
+                        ],
+                   vcd_name="test_iq.vcd")
+
+if __name__ == '__main__':
+    test_iq()
diff --git a/src/soc/scoreboard/test_mem2_fu_matrix.py b/src/soc/scoreboard/test_mem2_fu_matrix.py
new file mode 100644
index 00000000..0b0150ea
--- /dev/null
+++ b/src/soc/scoreboard/test_mem2_fu_matrix.py
@@ -0,0 +1,586 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
+
+from regfile.regfile import RegFileArray, treereduce
+from scoreboard.global_pending import GlobalPending
+from scoreboard.group_picker import GroupPicker
+from scoreboard.issue_unit import IssueUnitGroup, IssueUnitArray, RegDecode
+from scoreboard.shadow import ShadowMatrix, BranchSpeculationRecord
+from scoreboard.memfu import MemFunctionUnits
+from nmutil.latch import SRLatch
+from nmutil.nmoperator import eq
+
+from random import randint, seed
+from copy import deepcopy
+from math import log
+
+
+class Memory(Elaboratable):
+    """ test memory: an nmigen Memory with one read and one write port """
+    def __init__(self, regwid, addrw):
+        from nmigen import Memory as NMemory # this class shadows that name
+        self.ddepth = regwid//8 # must be an int: it is a slice index below
+        depth = (1<<addrw) // self.ddepth # must be an int for Memory/range
+        self.adr   = Signal(addrw)
+        self.dat_r = Signal(regwid)
+        self.dat_w = Signal(regwid)
+        self.we    = Signal()
+        self.mem   = NMemory(width=regwid, depth=depth, init=range(0, depth))
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.rdport = rdport = self.mem.read_port()
+        m.submodules.wrport = wrport = self.mem.write_port()
+        # NOTE(review): the read address drops low bits, the write one doesn't
+        m.d.comb += [rdport.addr.eq(self.adr[self.ddepth:]), # ignore low bits
+                     self.dat_r.eq(rdport.data),
+                     wrport.addr.eq(self.adr), wrport.data.eq(self.dat_w),
+                     wrport.en.eq(self.we)]
+        return m
+
+
+class MemSim:
+    """ python model of the memory: a list of regwid-bit words """
+    def __init__(self, regwid, addrw):
+        self.regwid, self.ddepth = regwid, regwid//8
+        self.mem = list(range((1<<addrw) // self.ddepth)) # word n holds n
+
+    def ld(self, addr):
+        return self.mem[addr >> self.ddepth] # shift is by the byte count
+
+    def st(self, addr, data):
+        word_mask = (1 << self.regwid) - 1 # truncate data to one word
+        self.mem[addr >> self.ddepth] = data & word_mask
+
+
+class Scoreboard(Elaboratable):
+    def __init__(self, rwid, n_regs):
+        """ Inputs:
+
+            * :rwid:   bit width of register file(s) - both FP and INT
+            * :n_regs: depth of register file(s) - number of FP and INT regs
+        """
+        self.rwid = rwid
+        self.n_regs = n_regs
+
+        # Register Files
+        self.intregs = RegFileArray(rwid, n_regs)
+        self.fpregs = RegFileArray(rwid, n_regs)
+
+        # issue q needs to get at these
+        self.aluissue = IssueUnitGroup(4)
+        self.brissue = IssueUnitGroup(1)
+        # and these
+        self.alu_oper_i = Signal(4, reset_less=True)
+        self.alu_imm_i = Signal(rwid, reset_less=True)
+        self.br_oper_i = Signal(4, reset_less=True)
+        self.br_imm_i = Signal(rwid, reset_less=True)
+
+        # inputs
+        self.int_dest_i = Signal(max=n_regs, reset_less=True) # Dest R# in
+        self.int_src1_i = Signal(max=n_regs, reset_less=True) # oper1 R# in
+        self.int_src2_i = Signal(max=n_regs, reset_less=True) # oper2 R# in
+        self.reg_enable_i = Signal(reset_less=True) # enable reg decode
+
+        # outputs
+        self.issue_o = Signal(reset_less=True) # instruction was accepted
+        self.busy_o = Signal(reset_less=True) # at least one CU is busy
+
+        # for branch speculation experiment.  branch_direction = 0 if
+        # the branch hasn't been met yet.  1 indicates "success", 2 is "fail"
+        # branch_succ and branch_fail are requests to have the current
+        # instruction be dependent on the branch unit "shadow" capability.
+        self.branch_succ_i = Signal(reset_less=True)
+        self.branch_fail_i = Signal(reset_less=True)
+        self.branch_direction_o = Signal(2, reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
+
+        m.submodules.intregs = self.intregs
+        m.submodules.fpregs = self.fpregs
+
+        # register ports
+        int_dest = self.intregs.write_port("dest")
+        int_src1 = self.intregs.read_port("src1")
+        int_src2 = self.intregs.read_port("src2")
+
+        fp_dest = self.fpregs.write_port("dest")
+        fp_src1 = self.fpregs.read_port("src1")
+        fp_src2 = self.fpregs.read_port("src2")
+
+        # Int ALUs and Comp Units
+        n_int_alus = 5
+        cua = CompUnitALUs(self.rwid, 3)
+        cub = CompUnitBR(self.rwid, 3)
+        m.submodules.cu = cu = CompUnitsBase(self.rwid, [cua, cub])
+        bgt = cub.bgt # get at the branch computation unit
+        br1 = cub.br1
+
+        # Int FUs
+        m.submodules.intfus = intfus = FunctionUnits(self.n_regs, n_int_alus)
+
+        # Count of number of FUs
+        n_intfus = n_int_alus
+        n_fp_fus = 0 # for now
+
+        # Integer Priority Picker 1: Adder + Subtractor
+        intpick1 = GroupPicker(n_intfus) # picks between add, sub, mul and shf
+        m.submodules.intpick1 = intpick1
+
+        # INT/FP Issue Unit
+        regdecode = RegDecode(self.n_regs)
+        m.submodules.regdecode = regdecode
+        issueunit = IssueUnitArray([self.aluissue, self.brissue])
+        m.submodules.issueunit = issueunit
+
+        # Shadow Matrix.  currently n_intfus shadows, to be used for
+        # write-after-write hazards.  NOTE: there is one extra for branches,
+        # so the shadow width is increased by 1
+        m.submodules.shadows = shadows = ShadowMatrix(n_intfus, n_intfus, True)
+        m.submodules.bshadow = bshadow = ShadowMatrix(n_intfus, 1, False)
+
+        # record previous instruction to cast shadow on current instruction
+        prev_shadow = Signal(n_intfus)
+
+        # Branch Speculation recorder.  tracks the success/fail state as
+        # each instruction is issued, so that when the branch occurs the
+        # allow/cancel can be issued as appropriate.
+        m.submodules.specrec = bspec = BranchSpeculationRecord(n_intfus)
+
+        #---------
+        # ok start wiring things together...
+        # "now hear de word of de looord... dem bones dem bones dem dryy bones"
+        # https://www.youtube.com/watch?v=pYb8Wm6-QfA
+        #---------
+
+        #---------
+        # Issue Unit is where it starts.  set up some in/outs for this module
+        #---------
+        comb += [    regdecode.dest_i.eq(self.int_dest_i),
+                     regdecode.src1_i.eq(self.int_src1_i),
+                     regdecode.src2_i.eq(self.int_src2_i),
+                     regdecode.enable_i.eq(self.reg_enable_i),
+                     self.issue_o.eq(issueunit.issue_o)
+                    ]
+
+        # take these to outside (issue needs them)
+        comb += cua.oper_i.eq(self.alu_oper_i)
+        comb += cua.imm_i.eq(self.alu_imm_i)
+        comb += cub.oper_i.eq(self.br_oper_i)
+        comb += cub.imm_i.eq(self.br_imm_i)
+
+        # TODO: issueunit.f (FP)
+
+        # and int function issue / busy arrays, and dest/src1/src2
+        comb += intfus.dest_i.eq(regdecode.dest_o)
+        comb += intfus.src1_i.eq(regdecode.src1_o)
+        comb += intfus.src2_i.eq(regdecode.src2_o)
+
+        fn_issue_o = issueunit.fn_issue_o
+
+        comb += intfus.fn_issue_i.eq(fn_issue_o)
+        comb += issueunit.busy_i.eq(cu.busy_o)
+        comb += self.busy_o.eq(cu.busy_o.bool())
+
+        #---------
+        # merge shadow matrices outputs
+        #---------
+
+        # these are explained in ShadowMatrix docstring, and are to be
+        # connected to the FUReg and FUFU Matrices, to get them to reset
+        anydie = Signal(n_intfus, reset_less=True)
+        allshadown = Signal(n_intfus, reset_less=True)
+        shreset = Signal(n_intfus, reset_less=True)
+        comb += allshadown.eq(shadows.shadown_o & bshadow.shadown_o)
+        comb += anydie.eq(shadows.go_die_o | bshadow.go_die_o)
+        comb += shreset.eq(bspec.match_g_o | bspec.match_f_o)
+
+        #---------
+        # connect fu-fu matrix
+        #---------
+
+        # Group Picker... done manually for now.
+        go_rd_o = intpick1.go_rd_o
+        go_wr_o = intpick1.go_wr_o
+        go_rd_i = intfus.go_rd_i
+        go_wr_i = intfus.go_wr_i
+        go_die_i = intfus.go_die_i
+        # NOTE: connect to the shadowed versions so that they can "die" (reset)
+        comb += go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus]) # rd
+        comb += go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus]) # wr
+        comb += go_die_i[0:n_intfus].eq(anydie[0:n_intfus]) # die
+
+        # Connect Picker
+        #---------
+        comb += intpick1.rd_rel_i[0:n_intfus].eq(cu.rd_rel_o[0:n_intfus])
+        comb += intpick1.req_rel_i[0:n_intfus].eq(cu.req_rel_o[0:n_intfus])
+        int_rd_o = intfus.readable_o
+        int_wr_o = intfus.writable_o
+        comb += intpick1.readable_i[0:n_intfus].eq(int_rd_o[0:n_intfus])
+        comb += intpick1.writable_i[0:n_intfus].eq(int_wr_o[0:n_intfus])
+
+        #---------
+        # Shadow Matrix
+        #---------
+
+        comb += shadows.issue_i.eq(fn_issue_o)
+        #comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
+        comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
+        #---------
+        # NOTE: this setup is for the instruction order preservation...
+
+        # connect shadows / go_dies to Computation Units
+        comb += cu.shadown_i[0:n_intfus].eq(allshadown)
+        comb += cu.go_die_i[0:n_intfus].eq(anydie)
+
+        # ok connect first n_int_fu shadows to busy lines, to create an
+        # instruction-order linked-list-like arrangement, using a bit-matrix
+        # (instead of e.g. a ring buffer).
+        # XXX TODO
+
+        # when written, the shadow can be cancelled (and was good)
+        for i in range(n_intfus):
+            comb += shadows.s_good_i[i][0:n_intfus].eq(go_wr_o[0:n_intfus])
+
+        # *previous* instruction shadows *current* instruction, and, obviously,
+        # if the previous is completed (!busy) don't cast the shadow!
+        comb += prev_shadow.eq(~fn_issue_o & cu.busy_o)
+        for i in range(n_intfus):
+            comb += shadows.shadow_i[i][0:n_intfus].eq(prev_shadow)
+
+        #---------
+        # ... and this is for branch speculation.  it uses the extra bit
+        # tacked onto the ShadowMatrix (hence shadow_wid=n_intfus+1)
+        # only needs to set shadow_i, s_fail_i and s_good_i
+
+        # issue captures shadow_i (if enabled)
+        comb += bshadow.reset_i[0:n_intfus].eq(shreset[0:n_intfus])
+
+        bactive = Signal(reset_less=True)
+        comb += bactive.eq((bspec.active_i | br1.issue_i) & ~br1.go_wr_i)
+
+        # instruction being issued (fn_issue_o) has a shadow cast by the branch
+        with m.If(bactive & (self.branch_succ_i | self.branch_fail_i)):
+            comb += bshadow.issue_i.eq(fn_issue_o)
+            for i in range(n_intfus):
+                with m.If(fn_issue_o & (Const(1<<i))):
+                    comb += bshadow.shadow_i[i][0].eq(1)
+
+        # finally, we need an indicator to the test infrastructure as to
+        # whether the branch succeeded or failed, plus, link up to the
+        # "recorder" of whether the instruction was under shadow or not
+
+        with m.If(br1.issue_i):
+            sync += bspec.active_i.eq(1)
+        with m.If(self.branch_succ_i):
+            comb += bspec.good_i.eq(fn_issue_o & 0x1f)
+        with m.If(self.branch_fail_i):
+            comb += bspec.fail_i.eq(fn_issue_o & 0x1f)
+
+        # branch is active (TODO: a better signal: this is over-using the
+        # go_write signal - actually the branch should not be "writing")
+        with m.If(br1.go_wr_i):
+            sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+            sync += bspec.active_i.eq(0)
+            comb += bspec.br_i.eq(1)
+            # branch occurs if data == 1, failed if data == 0
+            comb += bspec.br_ok_i.eq(br1.data_o == 1)
+            for i in range(n_intfus):
+                # *expected* direction of the branch matched against *actual*
+                comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
+                # ... or it didn't
+                comb += bshadow.s_fail_i[i][0].eq(bspec.match_f_o[i])
+
+        #---------
+        # Connect Register File(s)
+        #---------
+        comb += int_dest.wen.eq(intfus.dest_rsel_o)
+        comb += int_src1.ren.eq(intfus.src1_rsel_o)
+        comb += int_src2.ren.eq(intfus.src2_rsel_o)
+
+        # connect ALUs to regfile
+        comb += int_dest.data_i.eq(cu.data_o)
+        comb += cu.src1_i.eq(int_src1.data_o)
+        comb += cu.src2_i.eq(int_src2.data_o)
+
+        # connect ALU Computation Units
+        comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
+        comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus])
+        comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
+
+        return m
+
+    def __iter__(self):
+        """ yields everything the two register files iterate over,
+            then the top-level signals, in a fixed order
+        """
+        for regfile in ("intregs", "fpregs"):
+            yield from getattr(self, regfile)
+        for name in ("int_dest_i", "int_src1_i", "int_src2_i", "issue_o",
+                     "branch_succ_i", "branch_fail_i",
+                     "branch_direction_o"):
+            yield getattr(self, name)
+
+    def ports(self):
+        return [port for port in self]  # everything __iter__ yields, as a list
+
+
+
+
+def int_instr(dut, op, imm, src1, src2, dest, branch_success, branch_fail):
+    """ simulation process: presents one instruction to the DUT, then
+        waits (via wait_for_issue) on the issue group that was selected
+    """
+    yield from disable_issue(dut)
+    for port, regnum in ((dut.int_dest_i, dest),
+                         (dut.int_src1_i, src1),
+                         (dut.int_src2_i, src2)):
+        yield port.eq(regnum)
+    # op bits 2-3 nonzero: branch issue group.  bits 0-1: the operation
+    unit = "br" if op & 0xc else "alu"
+    dut_issue = getattr(dut, unit + "issue")
+    yield dut_issue.insn_i.eq(1)
+    yield getattr(dut, unit + "_oper_i").eq(Const(op & 0x3, 2))
+    yield getattr(dut, unit + "_imm_i").eq(imm)
+    yield dut.reg_enable_i.eq(1)
+
+    # request that the instruction be made shadow-dependent on (either)
+    # branch fail or branch success
+    yield dut.branch_fail_i.eq(branch_fail)
+    yield dut.branch_succ_i.eq(branch_success)
+
+    yield
+    yield from wait_for_issue(dut, dut_issue)
+
+
+def print_reg(dut, rnums):
+    """ simulation process: reads the listed integer registers, prints hex """
+    values = []
+    for idx in rnums:
+        values.append("%x" % (yield dut.intregs.regs[idx].reg))
+    names = ','.join(str(idx) for idx in rnums)
+    print ("reg %s: %s" % (names, ','.join(values)))
+
+
+def create_random_ops(dut, n_ops, shadowing=False, max_opnums=3):
+    """ returns n_ops random (src1, src2, dest, op, opi, imm) tuples; with
+        shadowing set, a (0, 0) pair is appended to each tuple
+    """
+    tail = ((0, 0),) if shadowing else ()
+    insts = []
+    for _ in range(n_ops):
+        # draw order is kept fixed so a given seed gives the same program
+        max_reg = dut.n_regs-1
+        src1, src2 = randint(1, max_reg), randint(1, max_reg)
+        imm, dest = randint(1, (1<<dut.rwid)-1), randint(1, max_reg)
+        op = randint(0, max_opnums)
+        opi = int(randint(0, 2) == 0) # NB: 1 only when the draw is zero
+        insts.append((src1, src2, dest, op, opi, imm) + tail)
+    return insts
+
+
+
+def scoreboard_sim(dut, alusim):
+    """ simulation process: 50 rounds of random registers and instructions """
+    seed(0)
+
+    for i in range(50): # NOTE: the name i is reused by the inner loops
+
+        # set random values in the registers (DUT and alusim get the same)
+        for i in range(1, dut.n_regs): # register 0 is skipped
+            val = randint(0, (1<<alusim.rwidth)-1)
+            #val = 31+i*3
+            #val = i
+            yield dut.intregs.regs[i].reg.eq(val)
+            alusim.setval(i, val)
+
+        # create some instructions (random; the "if False" cases are disabled)
+        instrs = []
+        if True:
+            instrs = create_random_ops(dut, 15, True, 4)
+
+        if False:
+            instrs.append( (1, 2, 2, 1, 1, 20, (0, 0)) )
+
+        if False: # NOTE: 5-field tuples would not fit the 7-field unpack below
+            instrs.append( (7, 3, 2, 4, (0, 0)) )
+            instrs.append( (7, 6, 6, 2, (0, 0)) )
+            instrs.append( (1, 7, 2, 2, (0, 0)) )
+
+        if False:
+            instrs.append((2, 3, 3, 0, 0, 0, (0, 0)))
+            instrs.append((5, 3, 3, 1, 0, 0, (0, 0)))
+            instrs.append((3, 5, 5, 2, 0, 0, (0, 0)))
+            instrs.append((5, 3, 3, 3, 0, 0, (0, 0)))
+            instrs.append((3, 5, 5, 0, 0, 0, (0, 0)))
+
+        if False:
+            instrs.append( (3, 3, 4, 0, 0, 13979, (0, 0)))
+            instrs.append( (6, 4, 1, 2, 0, 40976, (0, 0)))
+            instrs.append( (1, 4, 7, 4, 1, 23652, (0, 0)))
+
+        if False: # NOTE: 4-field tuples (here and below) would not unpack either
+            instrs.append((5, 6, 2, 1))
+            instrs.append((2, 2, 4, 0))
+            #instrs.append((2, 2, 3, 1))
+
+        if False:
+            instrs.append((2, 1, 2, 3))
+
+        if False:
+            instrs.append((2, 6, 2, 1))
+            instrs.append((2, 1, 2, 0))
+
+        if False:
+            instrs.append((1, 2, 7, 2))
+            instrs.append((7, 1, 5, 0))
+            instrs.append((4, 4, 1, 1))
+
+        if False:
+            instrs.append((5, 6, 2, 2))
+            instrs.append((1, 1, 4, 1))
+            instrs.append((6, 5, 3, 0))
+
+        if False:
+            # Write-after-Write Hazard
+            instrs.append( (3, 6, 7, 2) )
+            instrs.append( (4, 4, 7, 1) )
+
+        if False:
+            # self-read/write-after-write followed by Read-after-Write
+            instrs.append((1, 1, 1, 1))
+            instrs.append((1, 5, 3, 0))
+
+        if False:
+            # Read-after-Write followed by self-read-after-write
+            instrs.append((5, 6, 1, 2))
+            instrs.append((1, 1, 1, 1))
+
+        if False:
+            # self-read-write sandwich
+            instrs.append((5, 6, 1, 2))
+            instrs.append((1, 1, 1, 1))
+            instrs.append((1, 5, 3, 0))
+
+        if False:
+            # very weird failure
+            instrs.append( (5, 2, 5, 2) )
+            instrs.append( (2, 6, 3, 0) )
+            instrs.append( (4, 2, 2, 1) )
+
+        if False:
+            v1 = 4
+            yield dut.intregs.regs[5].reg.eq(v1)
+            alusim.setval(5, v1)
+            yield dut.intregs.regs[3].reg.eq(5)
+            alusim.setval(3, 5)
+            instrs.append((5, 3, 3, 4, (0, 0)))
+            instrs.append((4, 2, 1, 2, (0, 1)))
+
+        if False:
+            v1 = 6
+            yield dut.intregs.regs[5].reg.eq(v1)
+            alusim.setval(5, v1)
+            yield dut.intregs.regs[3].reg.eq(5)
+            alusim.setval(3, 5)
+            instrs.append((5, 3, 3, 4, (0, 0)))
+            instrs.append((4, 2, 1, 2, (1, 0)))
+
+        if False:
+            instrs.append( (4, 3, 5, 1, 0, (0, 0)) )
+            instrs.append( (5, 2, 3, 1, 0, (0, 0)) )
+            instrs.append( (7, 1, 5, 2, 0, (0, 0)) )
+            instrs.append( (5, 6, 6, 4, 0, (0, 0)) )
+            instrs.append( (7, 5, 2, 2, 0, (1, 0)) )
+            instrs.append( (1, 7, 5, 0, 0, (0, 1)) )
+            instrs.append( (1, 6, 1, 2, 0, (1, 0)) )
+            instrs.append( (1, 6, 7, 3, 0, (0, 0)) )
+            instrs.append( (6, 7, 7, 0, 0, (0, 0)) )
+
+        # apply each instruction to alusim, then queue it on the DUT (instr_q)
+        for i, instr in enumerate(instrs):
+            src1, src2, dest, op, opi, imm, (br_ok, br_fail) = instr
+
+            print ("instr %d: (%d, %d, %d, %d, %d, %d)" % \
+                    (i, src1, src2, dest, op, opi, imm))
+            alusim.op(op, opi, imm, src1, src2, dest)
+            yield from instr_q(dut, op, opi, imm, src1, src2, dest,
+                               br_ok, br_fail)
+
+        # wait for the instruction queue length (qlen_o) to reach zero
+        while True:
+            iqlen = yield dut.qlen_o
+            if iqlen == 0:
+                break
+            yield
+        yield # four further bare yields before waiting on busy
+        yield
+        yield
+        yield
+        yield from wait_for_busy_clear(dut)
+
+        # check status: compare alusim against the DUT, then dump
+        yield from alusim.check(dut)
+        yield from alusim.dump(dut)
+
+
+def test_scoreboard():
+    """ dumps IssueToScoreboard as RTLIL, then runs scoreboard_sim on it """
+    dut = IssueToScoreboard(2, 1, 1, 16, 8, 8)
+    alusim = RegSim(16, 8)
+    memsim = MemSim(16, 16)
+    il_text = rtlil.convert(dut, ports=dut.ports())
+    with open("test_scoreboard6600.il", "w") as il_file:
+        il_file.write(il_text)
+
+    # NOTE: memsim is constructed but not passed to the simulation, and
+    # the scoreboard_branch_sim variant is not run
+    stimulus = scoreboard_sim(dut, alusim)
+    run_simulation(dut, stimulus, vcd_name='test_scoreboard6600.vcd')
+
+
+def mem_sim(dut):
+    """ simulation process: drives ld/st issue, then addresses, then
+        pulses go_ld_i and go_st_i in turn
+    """
+    def clock(**settings):
+        # apply the given signal values, then do one bare yield
+        for name, value in settings.items():
+            yield getattr(dut, name).eq(value)
+        yield
+
+    # ld_i with fn_issue_i=1, then st_i with fn_issue_i=2, then clear
+    yield from clock(ld_i=0x1, fn_issue_i=0x1)
+    yield from clock(ld_i=0x0, st_i=0x3, fn_issue_i=0x2)
+    yield from clock(st_i=0x0, fn_issue_i=0x0)
+
+    # first two addresses are equal, the third differs
+    for idx, addr in enumerate((0x012, 0x012, 0x010)):
+        yield dut.addrs_i[idx].eq(addr)
+    yield from clock(addr_en_i=0x3)
+    yield from clock(addr_we_i=0x3)
+
+    # raise each "go" for one step, then drop it again
+    yield from clock(go_ld_i=0x1)
+    yield from clock(go_ld_i=0x0)
+    yield from clock(go_st_i=0x2)
+    yield from clock(go_st_i=0x0)
+
+
+
+def test_mem_fus():
+    """ dumps MemFunctionUnits as RTLIL, then simulates it with mem_sim """
+    dut = MemFunctionUnits(3, 11)
+    il_text = rtlil.convert(dut, ports=dut.ports())
+    with open("test_mem_fus.il", "w") as il_file:
+        il_file.write(il_text)
+    stimulus = mem_sim(dut)
+    run_simulation(dut, stimulus, vcd_name='test_mem_fus.vcd')
+
+
+if __name__ == '__main__':
+    test_mem_fus() # NOTE: test_scoreboard() is defined above but not run here
diff --git a/src/soc/scoreboard/test_mem_fu_matrix.py b/src/soc/scoreboard/test_mem_fu_matrix.py
new file mode 100644
index 00000000..9d2a7c6b
--- /dev/null
+++ b/src/soc/scoreboard/test_mem_fu_matrix.py
@@ -0,0 +1,679 @@
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
+
+from regfile.regfile import RegFileArray, treereduce
+from scoreboard.ldst_matrix import LDSTDepMatrix
+from scoreboard.fu_mem_matrix import FUMemDepMatrix
+from scoreboard.global_pending import GlobalPending
+from scoreboard.group_picker import GroupPicker
+from scoreboard.issue_unit import IssueUnitGroup, IssueUnitArray, RegDecode
+from scoreboard.shadow import ShadowMatrix, BranchSpeculationRecord
+
+from nmutil.latch import SRLatch
+from nmutil.nmoperator import eq
+
+from random import randint, seed
+from copy import deepcopy
+from math import log
+
+
+class Memory(Elaboratable):
+    """ test memory around an nmigen Memory; low ddepth adr bits ignored """
+    def __init__(self, regwid, addrw):
+        # this class shadows the name "Memory", so import nmigen's locally
+        from nmigen import Memory as NMemory
+        self.ddepth = regwid//8 # integer: used as a slice index below
+        depth = (1<<addrw) // self.ddepth
+        self.adr   = Signal(addrw)
+        self.dat_r = Signal(regwid)
+        self.dat_w = Signal(regwid)
+        self.we    = Signal()
+        self.mem   = NMemory(width=regwid, depth=depth, init=range(0, depth))
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.rdport = rdport = self.mem.read_port()
+        m.submodules.wrport = wrport = self.mem.write_port()
+        word_adr = self.adr[self.ddepth:] # ignore low bits (as MemSim does)
+        m.d.comb += [rdport.addr.eq(word_adr), self.dat_r.eq(rdport.data),
+                     wrport.addr.eq(word_adr), wrport.data.eq(self.dat_w),
+                     wrport.en.eq(self.we)]
+        return m
+
+
+class MemSim:
+    """ software model of a memory: a list indexed by addr >> ddepth """
+    def __init__(self, regwid, addrw):
+        self.regwid, self.ddepth = regwid, regwid//8
+        self.mem = list(range((1<<addrw) // self.ddepth))
+
+    def ld(self, addr):
+        return self.mem[addr >> self.ddepth]
+
+    def st(self, addr, data):
+        mask = (1 << self.regwid) - 1 # stored value is truncated to regwid bits
+        self.mem[addr >> self.ddepth] = data & mask
+
+
+class MemFunctionUnits(Elaboratable):
+    """ connects an LDSTDepMatrix to a FUMemDepMatrix (see elaborate) """
+    def __init__(self, n_int_alus):
+        self.n_int_alus = n_int_alus
+
+        self.ld_i = Signal(n_int_alus, reset_less=True) # to ldstdeps.ld_pend_i
+        self.st_i = Signal(n_int_alus, reset_less=True) # to ldstdeps.st_pend_i
+
+        self.load_hit_i = Signal(n_int_alus, reset_less=True) # Load Hit
+        self.stwd_hit_i = Signal(n_int_alus, reset_less=True) # Store Hit
+
+        #self.g_int_st_pend_o = Signal(n_int_alus, reset_less=True)
+        #self.g_int_ld_pend_o = Signal(n_int_alus, reset_less=True)
+
+        #self.ld_rsel_o = Signal(n_int_alus, reset_less=True) # dest reg (bot)
+        #self.st_rsel_o = Signal(n_int_alus, reset_less=True) # src1 reg (bot)
+
+        self.req_rel_i = Signal(n_int_alus, reset_less = True)
+        self.loadable_o = Signal(n_int_alus, reset_less=True)
+        self.storable_o = Signal(n_int_alus, reset_less=True)
+
+        self.go_st_i = Signal(n_int_alus, reset_less=True)
+        self.go_ld_i = Signal(n_int_alus, reset_less=True)
+        self.go_die_i = Signal(n_int_alus, reset_less=True)
+        self.req_rel_o = Signal(n_int_alus, reset_less=True)
+        self.fn_issue_i = Signal(n_int_alus, reset_less=True)
+
+        # NOTE: go_st_i, go_ld_i, req_rel_i/o are not connected in elaborate
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync # (unused in this method)
+
+        n_intfus = self.n_int_alus
+
+        # Integer LD/ST Dep Matrix
+        ldstdeps = LDSTDepMatrix(n_intfus)
+        m.submodules.ldstdeps = ldstdeps
+        # Integer FU-Mem Dep Matrix
+        fumemdeps = FUMemDepMatrix(n_intfus, n_intfus)
+        m.submodules.fumemdeps = fumemdeps
+
+        #comb += self.g_int_st_pend_o.eq(fumemdeps.v_st_rsel_o)
+        #comb += self.g_int_ld_pend_o.eq(fumemdeps.v_ld_rsel_o)
+
+        #comb += fumemdeps.st_pend_i.eq(fumemdeps.v_st_rsel_o)
+        #comb += fumemdeps.ld_pend_i.eq(fumemdeps.v_ld_rsel_o)
+
+        #comb += ldstdeps.st_pend_i.eq(fumemdeps.st_pend_o)
+        #comb += ldstdeps.ld_pend_i.eq(fumemdeps.ld_pend_o)
+        #self.ld_pend_o = fumemdeps.ld_pend_o # also output for use in WaWGrid
+
+        # inputs go to the LD/ST matrix; its hold outputs feed the FU-Mem one
+        comb += ldstdeps.ld_pend_i.eq(self.ld_i)
+        comb += ldstdeps.st_pend_i.eq(self.st_i)
+        comb += ldstdeps.issue_i.eq(self.fn_issue_i)
+        comb += ldstdeps.load_hit_i.eq(self.load_hit_i)
+        comb += ldstdeps.stwd_hit_i.eq(self.stwd_hit_i)
+        comb += ldstdeps.go_die_i.eq(self.go_die_i)
+        comb += self.storable_o.eq(fumemdeps.storable_o)
+        comb += self.loadable_o.eq(fumemdeps.loadable_o)
+        comb += fumemdeps.ld_pend_i.eq(ldstdeps.ld_hold_st_o)
+        comb += fumemdeps.st_pend_i.eq(ldstdeps.st_hold_ld_o)
+
+        # Connect function issue / arrays, and dest/src1/src2
+        # NOTE: the go_st/go_ld inputs are driven from the *hit* signals
+        comb += fumemdeps.go_st_i.eq(self.stwd_hit_i)
+        comb += fumemdeps.go_ld_i.eq(self.load_hit_i)
+        comb += fumemdeps.go_die_i.eq(self.go_die_i)
+        comb += fumemdeps.issue_i.eq(self.fn_issue_i)
+
+        #comb += self.ld_rsel_o.eq(fumemdeps.ld_rsel_o)
+        #comb += self.st_rsel_o.eq(fumemdeps.st_rsel_o)
+
+        return m
+
+    def __iter__(self): # yields every Signal created in __init__
+        yield self.ld_i
+        yield self.st_i
+        #yield self.g_int_st_pend_o
+        #yield self.g_int_ld_pend_o
+        #yield self.ld_rsel_o
+        #yield self.st_rsel_o
+        yield self.req_rel_i
+        yield self.loadable_o
+        yield self.storable_o
+        yield self.load_hit_i
+        yield self.stwd_hit_i
+        yield self.go_st_i
+        yield self.go_ld_i
+        yield self.go_die_i
+        yield self.req_rel_o
+        yield self.fn_issue_i
+
+    def ports(self): # the __iter__ sequence as a list
+        return list(self)
+
+
+class Scoreboard(Elaboratable):
+    def __init__(self, rwid, n_regs):
+        """ Inputs:
+
+            * :rwid:   bit width of register file(s) - both FP and INT
+            * :n_regs: depth of register file(s) - number of FP and INT regs
+        """
+        self.rwid = rwid
+        self.n_regs = n_regs
+
+        # Register Files
+        self.intregs = RegFileArray(rwid, n_regs)
+        self.fpregs = RegFileArray(rwid, n_regs)
+
+        # issue q needs to get at these
+        self.aluissue = IssueUnitGroup(4)
+        self.brissue = IssueUnitGroup(1)
+        # and these
+        self.alu_oper_i = Signal(4, reset_less=True)
+        self.alu_imm_i = Signal(rwid, reset_less=True)
+        self.br_oper_i = Signal(4, reset_less=True)
+        self.br_imm_i = Signal(rwid, reset_less=True)
+
+        # inputs
+        self.int_dest_i = Signal(max=n_regs, reset_less=True) # Dest R# in
+        self.int_src1_i = Signal(max=n_regs, reset_less=True) # oper1 R# in
+        self.int_src2_i = Signal(max=n_regs, reset_less=True) # oper2 R# in
+        self.reg_enable_i = Signal(reset_less=True) # enable reg decode
+
+        # outputs
+        self.issue_o = Signal(reset_less=True) # instruction was accepted
+        self.busy_o = Signal(reset_less=True) # at least one CU is busy
+
+        # for branch speculation experiment.  branch_direction = 0 if
+        # the branch hasn't been met yet.  1 indicates "success", 2 is "fail"
+        # branch_succ and branch_fail are requests to have the current
+        # instruction be dependent on the branch unit "shadow" capability.
+        self.branch_succ_i = Signal(reset_less=True)
+        self.branch_fail_i = Signal(reset_less=True)
+        self.branch_direction_o = Signal(2, reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
+
+        m.submodules.intregs = self.intregs
+        m.submodules.fpregs = self.fpregs
+
+        # register ports
+        int_dest = self.intregs.write_port("dest")
+        int_src1 = self.intregs.read_port("src1")
+        int_src2 = self.intregs.read_port("src2")
+
+        fp_dest = self.fpregs.write_port("dest") # fp ports: created, not used
+        fp_src1 = self.fpregs.read_port("src1")
+        fp_src2 = self.fpregs.read_port("src2")
+
+        # Int ALUs and Comp Units (NOTE: CompUnit* are not imported above)
+        n_int_alus = 5 # NOTE(review): the 0x1f masks below look tied to this
+        cua = CompUnitALUs(self.rwid, 3)
+        cub = CompUnitBR(self.rwid, 3)
+        m.submodules.cu = cu = CompUnitsBase(self.rwid, [cua, cub])
+        bgt = cub.bgt # get at the branch computation unit
+        br1 = cub.br1
+
+        # Int FUs (NOTE: FunctionUnits is not imported above either)
+        m.submodules.intfus = intfus = FunctionUnits(self.n_regs, n_int_alus)
+
+        # Count of number of FUs
+        n_intfus = n_int_alus
+        n_fp_fus = 0 # for now
+
+        # Integer Priority Picker 1: Adder + Subtractor
+        intpick1 = GroupPicker(n_intfus) # picks between add, sub, mul and shf
+        m.submodules.intpick1 = intpick1
+
+        # INT/FP Issue Unit
+        regdecode = RegDecode(self.n_regs)
+        m.submodules.regdecode = regdecode
+        issueunit = IssueUnitArray([self.aluissue, self.brissue])
+        m.submodules.issueunit = issueunit
+
+        # Shadow Matrix.  currently n_intfus shadows, to be used for
+        # write-after-write hazards.  NOTE: branches use a second, separate
+        # ShadowMatrix (bshadow) that is only 1 shadow wide
+        m.submodules.shadows = shadows = ShadowMatrix(n_intfus, n_intfus, True)
+        m.submodules.bshadow = bshadow = ShadowMatrix(n_intfus, 1, False)
+
+        # record previous instruction to cast shadow on current instruction
+        prev_shadow = Signal(n_intfus)
+
+        # Branch Speculation recorder.  tracks the success/fail state as
+        # each instruction is issued, so that when the branch occurs the
+        # allow/cancel can be issued as appropriate.
+        m.submodules.specrec = bspec = BranchSpeculationRecord(n_intfus)
+
+        #---------
+        # ok start wiring things together...
+        # "now hear de word of de looord... dem bones dem bones dem dryy bones"
+        # https://www.youtube.com/watch?v=pYb8Wm6-QfA
+        #---------
+
+        #---------
+        # Issue Unit is where it starts.  set up some in/outs for this module
+        #---------
+        comb += [    regdecode.dest_i.eq(self.int_dest_i),
+                     regdecode.src1_i.eq(self.int_src1_i),
+                     regdecode.src2_i.eq(self.int_src2_i),
+                     regdecode.enable_i.eq(self.reg_enable_i),
+                     self.issue_o.eq(issueunit.issue_o)
+                    ]
+
+        # take these to outside (issue needs them)
+        comb += cua.oper_i.eq(self.alu_oper_i)
+        comb += cua.imm_i.eq(self.alu_imm_i)
+        comb += cub.oper_i.eq(self.br_oper_i)
+        comb += cub.imm_i.eq(self.br_imm_i)
+
+        # TODO: issueunit.f (FP)
+
+        # and int function issue / busy arrays, and dest/src1/src2
+        comb += intfus.dest_i.eq(regdecode.dest_o)
+        comb += intfus.src1_i.eq(regdecode.src1_o)
+        comb += intfus.src2_i.eq(regdecode.src2_o)
+
+        fn_issue_o = issueunit.fn_issue_o
+
+        comb += intfus.fn_issue_i.eq(fn_issue_o)
+        comb += issueunit.busy_i.eq(cu.busy_o)
+        comb += self.busy_o.eq(cu.busy_o.bool())
+
+        #---------
+        # merge shadow matrices outputs
+        #---------
+
+        # these are explained in ShadowMatrix docstring, and are to be
+        # connected to the FUReg and FUFU Matrices, to get them to reset
+        anydie = Signal(n_intfus, reset_less=True)
+        allshadown = Signal(n_intfus, reset_less=True)
+        shreset = Signal(n_intfus, reset_less=True)
+        comb += allshadown.eq(shadows.shadown_o & bshadow.shadown_o)
+        comb += anydie.eq(shadows.go_die_o | bshadow.go_die_o)
+        comb += shreset.eq(bspec.match_g_o | bspec.match_f_o)
+
+        #---------
+        # connect fu-fu matrix
+        #---------
+
+        # Group Picker... done manually for now.
+        go_rd_o = intpick1.go_rd_o
+        go_wr_o = intpick1.go_wr_o
+        go_rd_i = intfus.go_rd_i
+        go_wr_i = intfus.go_wr_i
+        go_die_i = intfus.go_die_i
+        # NOTE: connect to the shadowed versions so that they can "die" (reset)
+        comb += go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus]) # rd
+        comb += go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus]) # wr
+        comb += go_die_i[0:n_intfus].eq(anydie[0:n_intfus]) # die
+
+        # Connect Picker
+        #---------
+        comb += intpick1.rd_rel_i[0:n_intfus].eq(cu.rd_rel_o[0:n_intfus])
+        comb += intpick1.req_rel_i[0:n_intfus].eq(cu.req_rel_o[0:n_intfus])
+        int_rd_o = intfus.readable_o
+        int_wr_o = intfus.writable_o
+        comb += intpick1.readable_i[0:n_intfus].eq(int_rd_o[0:n_intfus])
+        comb += intpick1.writable_i[0:n_intfus].eq(int_wr_o[0:n_intfus])
+
+        #---------
+        # Shadow Matrix
+        #---------
+
+        comb += shadows.issue_i.eq(fn_issue_o)
+        # the ordering shadows are reset when the branch shadow says "die"
+        comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
+        #---------
+        # NOTE: this setup is for the instruction order preservation...
+
+        # connect shadows / go_dies to Computation Units
+        comb += cu.shadown_i[0:n_intfus].eq(allshadown)
+        comb += cu.go_die_i[0:n_intfus].eq(anydie)
+
+        # ok connect first n_int_fu shadows to busy lines, to create an
+        # instruction-order linked-list-like arrangement, using a bit-matrix
+        # (instead of e.g. a ring buffer).
+        # XXX TODO
+
+        # when written, the shadow can be cancelled (and was good)
+        for i in range(n_intfus):
+            comb += shadows.s_good_i[i][0:n_intfus].eq(go_wr_o[0:n_intfus])
+
+        # *previous* instruction shadows *current* instruction, and, obviously,
+        # if the previous is completed (!busy) don't cast the shadow!
+        comb += prev_shadow.eq(~fn_issue_o & cu.busy_o)
+        for i in range(n_intfus):
+            comb += shadows.shadow_i[i][0:n_intfus].eq(prev_shadow)
+
+        #---------
+        # ... and this is for branch speculation.  it uses the separate
+        # 1-wide ShadowMatrix (bshadow), hence the [0] indexing below.
+        # only needs to set shadow_i, s_fail_i and s_good_i
+
+        # issue captures shadow_i (if enabled)
+        comb += bshadow.reset_i[0:n_intfus].eq(shreset[0:n_intfus])
+
+        bactive = Signal(reset_less=True)
+        comb += bactive.eq((bspec.active_i | br1.issue_i) & ~br1.go_wr_i)
+
+        # instruction being issued (fn_issue_o) has a shadow cast by the branch
+        with m.If(bactive & (self.branch_succ_i | self.branch_fail_i)):
+            comb += bshadow.issue_i.eq(fn_issue_o)
+            for i in range(n_intfus):
+                with m.If(fn_issue_o & (Const(1<<i))):
+                    comb += bshadow.shadow_i[i][0].eq(1)
+
+        # finally, we need an indicator to the test infrastructure as to
+        # whether the branch succeeded or failed, plus, link up to the
+        # "recorder" of whether the instruction was under shadow or not
+
+        with m.If(br1.issue_i):
+            sync += bspec.active_i.eq(1)
+        with m.If(self.branch_succ_i):
+            comb += bspec.good_i.eq(fn_issue_o & 0x1f)
+        with m.If(self.branch_fail_i):
+            comb += bspec.fail_i.eq(fn_issue_o & 0x1f)
+
+        # branch is active (TODO: a better signal: this is over-using the
+        # go_write signal - actually the branch should not be "writing")
+        with m.If(br1.go_wr_i):
+            sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+            sync += bspec.active_i.eq(0)
+            comb += bspec.br_i.eq(1)
+            # branch occurs if data == 1, failed if data == 0
+            comb += bspec.br_ok_i.eq(br1.data_o == 1)
+            for i in range(n_intfus):
+                # *expected* direction of the branch matched against *actual*
+                comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
+                # ... or it didn't
+                comb += bshadow.s_fail_i[i][0].eq(bspec.match_f_o[i])
+
+        #---------
+        # Connect Register File(s)
+        #---------
+        comb += int_dest.wen.eq(intfus.dest_rsel_o)
+        comb += int_src1.ren.eq(intfus.src1_rsel_o)
+        comb += int_src2.ren.eq(intfus.src2_rsel_o)
+
+        # connect ALUs to regfile
+        comb += int_dest.data_i.eq(cu.data_o)
+        comb += cu.src1_i.eq(int_src1.data_o)
+        comb += cu.src2_i.eq(int_src2.data_o)
+
+        # connect ALU Computation Units
+        comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
+        comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus])
+        comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
+
+        return m
+
+    def __iter__(self): # NOTE: busy_o, reg_enable_i, oper/imm are not listed
+        yield from self.intregs
+        yield from self.fpregs
+        yield self.int_dest_i
+        yield self.int_src1_i
+        yield self.int_src2_i
+        yield self.issue_o
+        yield self.branch_succ_i
+        yield self.branch_fail_i
+        yield self.branch_direction_o
+
+    def ports(self): # the __iter__ sequence as a list
+        return list(self)
+
+
+
+
+def int_instr(dut, op, imm, src1, src2, dest, branch_success, branch_fail):
+    """ simulation process: presents one instruction to the DUT, then
+        waits (via wait_for_issue) on the issue group that was selected
+    """
+    yield from disable_issue(dut)
+    for port, regnum in ((dut.int_dest_i, dest),
+                         (dut.int_src1_i, src1),
+                         (dut.int_src2_i, src2)):
+        yield port.eq(regnum)
+    # op bits 2-3 nonzero: branch issue group.  bits 0-1: the operation
+    unit = "br" if op & 0xc else "alu"
+    dut_issue = getattr(dut, unit + "issue")
+    yield dut_issue.insn_i.eq(1)
+    yield getattr(dut, unit + "_oper_i").eq(Const(op & 0x3, 2))
+    yield getattr(dut, unit + "_imm_i").eq(imm)
+    yield dut.reg_enable_i.eq(1)
+
+    # request that the instruction be made shadow-dependent on (either)
+    # branch fail or branch success
+    yield dut.branch_fail_i.eq(branch_fail)
+    yield dut.branch_succ_i.eq(branch_success)
+
+    yield
+    yield from wait_for_issue(dut, dut_issue)
+
+
+def print_reg(dut, rnums):
+    """ simulation process: reads the listed integer registers, prints hex """
+    values = []
+    for idx in rnums:
+        values.append("%x" % (yield dut.intregs.regs[idx].reg))
+    names = ','.join(str(idx) for idx in rnums)
+    print ("reg %s: %s" % (names, ','.join(values)))
+
+
+def create_random_ops(dut, n_ops, shadowing=False, max_opnums=3):
+    """ returns n_ops random (src1, src2, dest, op, opi, imm) tuples; with
+        shadowing set, a (0, 0) pair is appended to each tuple
+    """
+    tail = ((0, 0),) if shadowing else ()
+    insts = []
+    for _ in range(n_ops):
+        # draw order is kept fixed so a given seed gives the same program
+        max_reg = dut.n_regs-1
+        src1, src2 = randint(1, max_reg), randint(1, max_reg)
+        imm, dest = randint(1, (1<<dut.rwid)-1), randint(1, max_reg)
+        op = randint(0, max_opnums)
+        opi = int(randint(0, 2) == 0) # NB: 1 only when the draw is zero
+        insts.append((src1, src2, dest, op, opi, imm) + tail)
+    return insts
+
+
+
+def scoreboard_sim(dut, alusim):
+    # generator process: 50 rounds of set regs, issue ops, wait, check vs alusim
+    seed(0)  # fixed seed (presumably random.seed): same sequence every run
+
+    for i in range(50):
+
+        # set random values in the registers
+        for i in range(1, dut.n_regs):  # NB: reuses the outer loop's name "i"
+            val = randint(0, (1<<alusim.rwidth)-1)
+            #val = 31+i*3
+            #val = i
+            yield dut.intregs.regs[i].reg.eq(val)  # same value goes to the dut...
+            alusim.setval(i, val)  # ...and to the software model
+
+        # create some instructions (some random, some regression tests)
+        instrs = []
+        if True:  # the only enabled case; every "if False" below is switched off
+            instrs = create_random_ops(dut, 15, True, 4)
+        # NB: disabled cases with fewer than 7 fields won't fit the unpack below
+        if False:
+            instrs.append( (1, 2, 2, 1, 1, 20, (0, 0)) )
+
+        if False:
+            instrs.append( (7, 3, 2, 4, (0, 0)) )
+            instrs.append( (7, 6, 6, 2, (0, 0)) )
+            instrs.append( (1, 7, 2, 2, (0, 0)) )
+
+        if False:
+            instrs.append((2, 3, 3, 0, 0, 0, (0, 0)))
+            instrs.append((5, 3, 3, 1, 0, 0, (0, 0)))
+            instrs.append((3, 5, 5, 2, 0, 0, (0, 0)))
+            instrs.append((5, 3, 3, 3, 0, 0, (0, 0)))
+            instrs.append((3, 5, 5, 0, 0, 0, (0, 0)))
+
+        if False:
+            instrs.append( (3, 3, 4, 0, 0, 13979, (0, 0)))
+            instrs.append( (6, 4, 1, 2, 0, 40976, (0, 0)))
+            instrs.append( (1, 4, 7, 4, 1, 23652, (0, 0)))
+
+        if False:
+            instrs.append((5, 6, 2, 1))
+            instrs.append((2, 2, 4, 0))
+            #instrs.append((2, 2, 3, 1))
+
+        if False:
+            instrs.append((2, 1, 2, 3))
+
+        if False:
+            instrs.append((2, 6, 2, 1))
+            instrs.append((2, 1, 2, 0))
+
+        if False:
+            instrs.append((1, 2, 7, 2))
+            instrs.append((7, 1, 5, 0))
+            instrs.append((4, 4, 1, 1))
+
+        if False:
+            instrs.append((5, 6, 2, 2))
+            instrs.append((1, 1, 4, 1))
+            instrs.append((6, 5, 3, 0))
+
+        if False:
+            # Write-after-Write Hazard
+            instrs.append( (3, 6, 7, 2) )
+            instrs.append( (4, 4, 7, 1) )
+
+        if False:
+            # self-read/write-after-write followed by Read-after-Write
+            instrs.append((1, 1, 1, 1))
+            instrs.append((1, 5, 3, 0))
+
+        if False:
+            # Read-after-Write followed by self-read-after-write
+            instrs.append((5, 6, 1, 2))
+            instrs.append((1, 1, 1, 1))
+
+        if False:
+            # self-read-write sandwich
+            instrs.append((5, 6, 1, 2))
+            instrs.append((1, 1, 1, 1))
+            instrs.append((1, 5, 3, 0))
+
+        if False:
+            # very weird failure
+            instrs.append( (5, 2, 5, 2) )
+            instrs.append( (2, 6, 3, 0) )
+            instrs.append( (4, 2, 2, 1) )
+
+        if False:
+            v1 = 4
+            yield dut.intregs.regs[5].reg.eq(v1)
+            alusim.setval(5, v1)
+            yield dut.intregs.regs[3].reg.eq(5)
+            alusim.setval(3, 5)
+            instrs.append((5, 3, 3, 4, (0, 0)))
+            instrs.append((4, 2, 1, 2, (0, 1)))
+
+        if False:
+            v1 = 6
+            yield dut.intregs.regs[5].reg.eq(v1)
+            alusim.setval(5, v1)
+            yield dut.intregs.regs[3].reg.eq(5)
+            alusim.setval(3, 5)
+            instrs.append((5, 3, 3, 4, (0, 0)))
+            instrs.append((4, 2, 1, 2, (1, 0)))
+
+        if False:
+            instrs.append( (4, 3, 5, 1, 0, (0, 0)) )
+            instrs.append( (5, 2, 3, 1, 0, (0, 0)) )
+            instrs.append( (7, 1, 5, 2, 0, (0, 0)) )
+            instrs.append( (5, 6, 6, 4, 0, (0, 0)) )
+            instrs.append( (7, 5, 2, 2, 0, (1, 0)) )
+            instrs.append( (1, 7, 5, 0, 0, (0, 1)) )
+            instrs.append( (1, 6, 1, 2, 0, (1, 0)) )
+            instrs.append( (1, 6, 7, 3, 0, (0, 0)) )
+            instrs.append( (6, 7, 7, 0, 0, (0, 0)) )
+
+        # issue instruction(s), wait for issue to be free before proceeding
+        for i, instr in enumerate(instrs):
+            src1, src2, dest, op, opi, imm, (br_ok, br_fail) = instr
+
+            print ("instr %d: (%d, %d, %d, %d, %d, %d)" % \
+                    (i, src1, src2, dest, op, opi, imm))
+            alusim.op(op, opi, imm, src1, src2, dest)  # software model first
+            yield from instr_q(dut, op, opi, imm, src1, src2, dest,
+                               br_ok, br_fail)
+
+        # wait for all instructions to stop before checking
+        while True:
+            iqlen = yield dut.qlen_o  # keep polling until this reads 0
+            if iqlen == 0:
+                break
+            yield
+        yield  # four more bare yields (presumably settle cycles; confirm)
+        yield
+        yield
+        yield
+        yield from wait_for_busy_clear(dut)
+
+        # check status
+        yield from alusim.check(dut)
+        yield from alusim.dump(dut)
+
+
+def test_scoreboard():  # dump RTLIL, then run scoreboard_sim on the dut
+    dut = IssueToScoreboard(2, 1, 1, 16, 8, 8)
+    alusim = RegSim(16, 8)  # software model handed to scoreboard_sim
+    memsim = MemSim(16, 16)  # NB: created but not used in this function
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_scoreboard6600.il", "w") as f:
+        f.write(vl)
+    # simulate; the vcd_name argument is test_scoreboard6600.vcd
+    run_simulation(dut, scoreboard_sim(dut, alusim),
+                        vcd_name='test_scoreboard6600.vcd')
+    # alternative scoreboard_branch_sim run, currently commented out:
+    #run_simulation(dut, scoreboard_branch_sim(dut, alusim),
+    #                    vcd_name='test_scoreboard6600.vcd')
+
+
+def mem_sim(dut):  # generator process; test_mem_fus runs it on MemFunctionUnits
+    yield dut.ld_i.eq(0x1)  # set bit 0 of ld_i ...
+    yield dut.fn_issue_i.eq(0x1)  # ... and bit 0 of fn_issue_i
+    yield
+    #yield dut.ld_i.eq(0x0)
+    yield dut.st_i.eq(0x2)  # bit 1 of st_i; ld_i is left set (clear is disabled)
+    yield dut.fn_issue_i.eq(0x2)
+    yield
+    #yield dut.st_i.eq(0x0)
+    yield dut.fn_issue_i.eq(0x0)  # only fn_issue_i is cleared; st_i stays set
+    yield
+    # pulse load_hit_i (bit 0) then stwd_hit_i (bit 1): set, yield, clear, yield
+    yield dut.load_hit_i.eq(0x1)
+    yield
+    yield dut.load_hit_i.eq(0x0)
+    yield
+    yield dut.stwd_hit_i.eq(0x2)
+    yield
+    yield dut.stwd_hit_i.eq(0x0)
+    yield
+
+
+def test_mem_fus():  # dump RTLIL for MemFunctionUnits(3), then run mem_sim
+    dut = MemFunctionUnits(3)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_mem_fus.il", "w") as f:
+        f.write(vl)
+    # simulate; the vcd_name argument is test_mem_fus.vcd
+    run_simulation(dut, mem_sim(dut),
+                        vcd_name='test_mem_fus.vcd')
+
+
+if __name__ == '__main__':
+    test_mem_fus()  # NB: test_scoreboard() is defined above but not run here