Allow the formal engine to perform a same-cycle result in the ALU master
authorCesar Strauss <cestrauss@gmail.com>
Sun, 5 Nov 2023 14:18:40 +0000 (11:18 -0300)
committerCesar Strauss <cestrauss@gmail.com>
Sun, 5 Nov 2023 14:18:40 +0000 (11:18 -0300)
This adds an exception to holding o_valid low, when the ALU is idle.
If a write to the ALU just occurred, allow o_valid to become high, in
the same cycle.

140 files changed:
.gitignore
.gitlab-ci.yml
Makefile
conf.py
mkpinmux.sh
pinmux
pyproject.toml [new file with mode: 0644]
setup.py
src/soc/bus/external_core.py [new file with mode: 0644]
src/soc/bus/opencores_ethmac.py [new file with mode: 0644]
src/soc/bus/sdr_ctrl.py [new file with mode: 0644]
src/soc/bus/sram.py
src/soc/bus/syscon.py [new file with mode: 0644]
src/soc/bus/tercel.py [new file with mode: 0644]
src/soc/bus/test/wb_rw.py
src/soc/bus/uart_16550.py [new file with mode: 0644]
src/soc/bus/wb_async.py [new file with mode: 0644]
src/soc/bus/wb_downconvert.py
src/soc/config/ifetch.py
src/soc/config/pinouts.py
src/soc/config/test/test_fetch.py
src/soc/config/test/test_pi2ls.py
src/soc/debug/.gitignore [new file with mode: 0644]
src/soc/debug/dmi.py
src/soc/debug/test/test_jtag_tap.py
src/soc/experiment/alu_hier.py
src/soc/experiment/cache_ram.py
src/soc/experiment/compalu_multi.py
src/soc/experiment/compldst_multi.py
src/soc/experiment/dcache.py
src/soc/experiment/formal/proof_compalu_multi.py [new file with mode: 0644]
src/soc/experiment/icache.py
src/soc/experiment/l0_cache.py
src/soc/experiment/mmu.py
src/soc/experiment/pi2ls.py
src/soc/experiment/pimem.py
src/soc/experiment/plru.py
src/soc/experiment/radix_walk_example.txt
src/soc/experiment/test/pagetables.py
src/soc/experiment/test/test_compalu_multi.py
src/soc/experiment/test/test_compldst_multi.py
src/soc/experiment/test/test_compldst_multi_mmu.py
src/soc/experiment/test/test_compldst_multi_mmu_fsm.py
src/soc/experiment/test/test_dcache.py
src/soc/experiment/test/test_dcache_tlb.py
src/soc/experiment/test/test_dcbz_pi.py
src/soc/experiment/test/test_l0_cache_buffer2.py
src/soc/experiment/test/test_ldst_pi.py
src/soc/experiment/test/test_ldst_pi_misalign.py
src/soc/experiment/test/test_loadstore1.py
src/soc/experiment/test/test_mmu_dcache.py
src/soc/experiment/test/test_mmu_dcache_pi.py
src/soc/experiment/test/test_wishbone.py
src/soc/fu/alu/formal/proof_input_stage.py
src/soc/fu/alu/formal/proof_main_stage.py
src/soc/fu/alu/formal/proof_output_stage.py
src/soc/fu/alu/main_stage.py
src/soc/fu/alu/pipe_data.py
src/soc/fu/alu/pipeline.py
src/soc/fu/alu/test/test_pipe_caller.py
src/soc/fu/branch/formal/proof_input_stage.py
src/soc/fu/branch/formal/proof_main_stage.py
src/soc/fu/branch/pipe_data.py
src/soc/fu/branch/pipeline.py
src/soc/fu/branch/test/test_pipe_caller.py
src/soc/fu/common_output_stage.py
src/soc/fu/compunits/compunits.py
src/soc/fu/compunits/test/test_compunit.py
src/soc/fu/cr/formal/proof_main_stage.py
src/soc/fu/cr/pipe_data.py
src/soc/fu/cr/test/test_pipe_caller.py
src/soc/fu/div/experiment/__init__.py [new file with mode: 0644]
src/soc/fu/div/experiment/goldschmidt_div_sqrt.py [new file with mode: 0644]
src/soc/fu/div/experiment/test/__init__.py [new file with mode: 0644]
src/soc/fu/div/experiment/test/test_goldschmidt_div_sqrt.py [new file with mode: 0644]
src/soc/fu/div/pipe_data.py
src/soc/fu/div/pipeline.py
src/soc/fu/div/setup_stage.py
src/soc/fu/div/test/helper.py
src/soc/fu/div/test/test_pipe_ilang.py
src/soc/fu/ldst/ldst_input_record.py
src/soc/fu/ldst/loadstore.py
src/soc/fu/ldst/pipe_data.py
src/soc/fu/logical/bpermd.py
src/soc/fu/logical/formal/proof_input_stage.py
src/soc/fu/logical/formal/proof_main_stage.py
src/soc/fu/logical/main_stage.py
src/soc/fu/logical/pipe_data.py
src/soc/fu/logical/pipeline.py
src/soc/fu/logical/popcount.py
src/soc/fu/logical/test/test_pipe_caller.py
src/soc/fu/mmu/fsm.py
src/soc/fu/mmu/mmu_input_record.py
src/soc/fu/mmu/pipe_data.py
src/soc/fu/mmu/test/test_issuer_mmu_data_path.py
src/soc/fu/mmu/test/test_non_production_core.py
src/soc/fu/mmu/test/test_pipe_caller.py
src/soc/fu/mul/formal/proof_main_stage.py
src/soc/fu/mul/pipe_data.py
src/soc/fu/mul/pre_stage.py
src/soc/fu/mul/test/helper.py
src/soc/fu/mul/test/test_pipe_caller_long.py
src/soc/fu/mul/test/test_pipe_ilang.py
src/soc/fu/pipe_data.py
src/soc/fu/regspec.py
src/soc/fu/shift_rot/formal/proof_main_stage.py
src/soc/fu/shift_rot/main_stage.py
src/soc/fu/shift_rot/pipe_data.py
src/soc/fu/shift_rot/pipeline.py
src/soc/fu/shift_rot/rotator.py
src/soc/fu/shift_rot/test/test_maskgen.py
src/soc/fu/shift_rot/test/test_pipe_caller.py
src/soc/fu/spr/formal/proof_main_stage.py
src/soc/fu/spr/main_stage.py
src/soc/fu/spr/pipe_data.py
src/soc/fu/spr/test/test_pipe_caller.py
src/soc/fu/trap/formal/proof_main_stage.py
src/soc/fu/trap/main_stage.py
src/soc/fu/trap/pipe_data.py
src/soc/fu/trap/test/test_pipe_caller.py
src/soc/fu/trap/trap_input_record.py
src/soc/interrupts/xics.py
src/soc/litex/florent
src/soc/minerva/wishbone.py
src/soc/regfile/regfile.py
src/soc/regfile/regfiles.py
src/soc/regfile/sram_wrapper.py [new file with mode: 0644]
src/soc/simple/core.py
src/soc/simple/core_data.py
src/soc/simple/inorder.py [new file with mode: 0644]
src/soc/simple/issuer.py
src/soc/simple/issuer_verilog.py
src/soc/simple/test/test_core.py
src/soc/simple/test/test_issuer.py
src/soc/simple/test/test_issuer_linux_5_7.py [new file with mode: 0644]
src/soc/simple/test/test_issuer_mmu_ifetch.py [new file with mode: 0644]
src/soc/simple/test/test_issuer_mmu_microwatt.py [new file with mode: 0644]
src/soc/simple/test/test_microwatt.py
src/soc/simple/test/test_runner.py
src/soc/simple/test/teststate.py

index d48dc7ff1f94bf7d776c91bb0774f1be6d66c98f..916979dfa880dc64cffd5b66439e5c1cf0602c8b 100644 (file)
@@ -10,9 +10,10 @@ Waveforms
 *.il
 **/*.gtkw
 .eggs
-
+formal_test_temp
 .vscode/*
 build
 gen
 .noseids
 nosetests.xml
+test-out
index 867411fa4ddce7beef88c6b704c9f3f8df997219..c57c2d547bf19eb76421deb3cb99e33e9d883a32 100644 (file)
@@ -10,6 +10,7 @@ cache:
 variables:
     PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
     GIT_SUBMODULE_STRATEGY: recursive
+    GIT_DEPTH: "500"
 
 build:
     stage: build
@@ -19,11 +20,29 @@ build:
         - apt-get -o dir::cache::archives="$(pwd)/apt-cache" update
         - >-
             apt-get -o dir::cache::archives="$(pwd)/apt-cache" -y install
-            build-essential git python3-dev python3-pip
-            python3-setuptools python3-wheel pkg-config tcl-dev
-            libreadline-dev bison flex libffi-dev ccache python3-venv
-            binutils-powerpc64-linux-gnu binutils-powerpc64le-linux-gnu
-            autoconf gperf libgmp-dev libmpfr-dev libssl-dev curl
+            build-essential
+            git
+            python3-dev
+            python3-pip
+            python3-setuptools
+            python3-setuptools-scm
+            python3-wheel
+            pkg-config
+            tcl-dev
+            libreadline-dev
+            bison
+            flex
+            libffi-dev
+            ccache
+            python3-venv
+            binutils-powerpc64-linux-gnu
+            binutils-powerpc64le-linux-gnu
+            autoconf
+            gperf
+            libgmp-dev
+            libmpfr-dev
+            libssl-dev
+            curl
         - export PATH="/usr/lib/ccache:$PATH"
         - export CCACHE_BASEDIR="$PWD"
         - export CCACHE_DIR="$PWD/ccache"
@@ -32,43 +51,65 @@ build:
         - ccache --show-stats || true
         - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
         - source $HOME/.cargo/env
-    after_script:
-        - export CCACHE_DIR="$PWD/ccache"
-        - ccache --show-stats
     script:
-        - python3 -m venv .env
+        - python3 -m venv --system-site-packages .env
         - . .env/bin/activate
-        - pip install nose
+        - pip install pytest-xdist==3.3.1 pytest==7.3.1
+
+        - git clone --depth 1 -b v0.1.1 https://github.com/cocotb/cocotb-bus.git cocotb-bus
+        - pushd cocotb-bus
+        - pip install . --no-deps
+        - popd
+
+        - git clone --depth 1 -b v1.5.2 https://github.com/cocotb/cocotb.git cocotb
+        - pushd cocotb
+        - pip install .
+        - popd
+
+        - git clone --depth 1 https://git.libre-soc.org/git/pytest-output-to-files.git pytest-output-to-files
+        - pushd pytest-output-to-files
+        - git rev-parse HEAD
+        - python3 setup.py develop
+        - popd
 
-        - git clone --depth 1 https://github.com/SRI-CSL/yices2.git yices2
+        - git clone --depth 1 -b Yices-2.6.4 https://github.com/SRI-CSL/yices2.git yices2
         - pushd yices2
         - autoconf
         - ./configure
-        - make -j$(nproc) > /dev/null
+        - make -j$(nproc)
         - make install
         - popd
 
-        - git clone --depth 1 https://github.com/YosysHQ/yosys.git yosys
+        - git clone --depth 1 -b yosys-0.17 https://github.com/YosysHQ/yosys.git yosys
         - pushd yosys
         - make config-gcc
-        - make -j$(nproc) > /dev/null
+        - make -j$(nproc)
         - make install
         - popd
         - yosys -V
 
-        - git clone --depth 1 https://github.com/YosysHQ/SymbiYosys.git SymbiYosys
+        - git clone https://github.com/YosysHQ/SymbiYosys.git SymbiYosys
         - pushd SymbiYosys
-        - make install > /dev/null
+        - git checkout d10e472edf4ea9be3aa6347b264ba575fbea933a
+        - make install
         - popd
 
-        - git clone --depth 1 https://github.com/nmigen/nmigen.git nmigen
+        - git clone --depth 1 https://gitlab.com/nmigen/nmigen.git nmigen
         - pushd nmigen
-        - python setup.py develop
+        - git rev-parse HEAD
+        - python3 setup.py develop
+        - popd
+
+        - git clone --depth 1 https://git.libre-soc.org/git/mdis.git mdis
+        - pushd mdis
+        - git rev-parse HEAD
+        - python3 setup.py develop
         - popd
 
         - git clone --depth 1 https://git.libre-soc.org/git/nmutil.git nmutil
         - pushd nmutil
-        - python setup.py develop
+        - git rev-parse HEAD
+        - python3 setup.py develop
         - popd
 
         - git clone --depth 1 https://git.libre-soc.org/git/nmigen-soc.git nmigen-soc
@@ -85,9 +126,7 @@ build:
         - git clone --depth 1 https://git.libre-soc.org/git/openpower-isa.git openpower-isa
         - pushd openpower-isa
         - python3 setup.py develop
-        - make -j$(nproc) svanalysis > /dev/null
-        - make -j$(nproc) pyfnwriter > /dev/null 2>&1
-        - make -j$(nproc) pywriter > /dev/null 2>&1
+        - if ! out="$(make 2>&1)"; then echo "$out"; exit 1; fi
         - popd
 
         - git clone --depth 1 https://git.libre-soc.org/git/c4m-jtag.git c4m-jtag
@@ -96,8 +135,9 @@ build:
         - popd
 
         - IEEE754FPU_PATH="$(pwd)"/ieee754fpu
-        - git clone --depth 1 --recursive https://github.com/billzorn/sfpy.git sfpy
+        - git clone --depth 1 --recursive -b v0.6.0 https://github.com/billzorn/sfpy.git sfpy
         - pushd sfpy
+        - git apply "$IEEE754FPU_PATH"/sfpy.patch
         - pushd berkeley-softfloat-3
         - git apply "$IEEE754FPU_PATH"/berkeley-softfloat.patch
         - popd
@@ -105,11 +145,11 @@ build:
         - git apply ../softposit_sfpy_build.patch
         - git apply "$IEEE754FPU_PATH"/SoftPosit.patch
         - popd
-        - pip install --upgrade -r requirements.txt
+        - pip install -r requirements.txt
         - make lib -j$(nproc)
         - make cython -j$(nproc)
         - make wheel -j$(nproc)
-        - pip install dist/sfpy*.whl
+        - pip install --force-reinstall dist/sfpy*.whl
         - popd
 
         - python3 -m pip install 'maturin>=0.11,<0.12'
@@ -120,4 +160,4 @@ build:
         - popd
 
         - python setup.py develop
-        - nosetests -v --processes=-1 --process-timeout=120 -w src/
+        - SILENCELOG='!*,default' pytest -v --maxfail=20
index 3d4ea62db5a779f896d1f59665014783681f0523..15670cf8b3babf7f8e0991cd3e5100fecb68d273 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -56,6 +56,43 @@ ls180_4k_verilog:
                --enable-xics --enable-sram4x4kblock --disable-svp64 \
                        src/soc/litex/florent/libresoc/libresoc.v
 
+# build microwatt "external core", note that the TLB set size is set to 16
+# for I/D-Cache which needs a corresponding alteration of the device-tree
+# entries for linux
+microwatt_external_core:
+       python3 src/soc/simple/issuer_verilog.py --microwatt-compat --enable-mmu \
+            external_core_top.v
+
+# build microwatt "external core" with fixed 64-bit width SVP64
+# note that the TLB set size is set to 16
+# for I/D-Cache which needs a corresponding alteration of the device-tree
+# entries for linux
+microwatt_external_core_svp64:
+       python3 src/soc/simple/issuer_verilog.py --microwatt-compat-svp64 --enable-mmu \
+            external_core_top.v
+
+microwatt_external_core_spi:
+       python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+            --small-cache \
+            --enable-mmu \
+            --pc-reset 0x10000000 \
+            external_core_top.v
+
+# microwatt-compatible core with smaller cache size (quick. VERSA_ECP5. just)
+microwatt_external_core_bram:
+       python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+            --small-cache \
+            --enable-mmu \
+            --pc-reset 0xFF000000 \
+            external_core_top.v
+
+# microwatt-compatible core with larger cache size (experiment on arty)
+microwatt_external_core_bram_arty:
+       python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+            --enable-mmu \
+            --pc-reset 0xFF000000 \
+            external_core_top.v
+
 # build the litex libresoc SoC without 4k SRAMs
 ls180_verilog_build: ls180_verilog
        make -C soc/soc/litex/florent ls180
diff --git a/conf.py b/conf.py
index 12b29a4fb10659843b17c069255fcc3199cc77ba..d752f59ef042ac4b8d5f42bcebdf50308f0ee585 100644 (file)
--- a/conf.py
+++ b/conf.py
@@ -47,7 +47,7 @@ extensions = [
     'sphinx.ext.coverage',
     'recommonmark',
     #'symbolator_sphinx',
-    'sphinxcontrib_verilog_diagrams',
+    #'sphinxcontrib_verilog_diagrams', # XXX now spinxcontrib-hdl-diagrams
     'sphinx_rtd_theme',
     #'sphinx_tabs.tabs',
 ]
index f79c6e0a093cc3ab4b421e8349abf12d97efb29d..c98e48044dfcf9019930997720ac5b431be7ac53 100755 (executable)
@@ -1,4 +1,5 @@
 #!/bin/sh
 cd pinmux
 python2 src/pinmux_generator.py -v -s ls180 -o ls180
-python2 src/pinmux_generator.py -v -s ngi_router -o ngi_router
+# temporary - return to older version of pinmux
+#python2 src/pinmux_generator.py -v -s ngi_router -o ngi_router
diff --git a/pinmux b/pinmux
index 20ca612b2600530ce901009b3d1b9ef0e05b7438..7cbf0e2a54448f549243cd602ebafd10de8d32f0 160000 (submodule)
--- a/pinmux
+++ b/pinmux
@@ -1 +1 @@
-Subproject commit 20ca612b2600530ce901009b3d1b9ef0e05b7438
+Subproject commit 7cbf0e2a54448f549243cd602ebafd10de8d32f0
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644 (file)
index 0000000..e21c7c4
--- /dev/null
@@ -0,0 +1,11 @@
+[tool.pytest.ini_options]
+minversion = "6.0"
+python_classes = ""
+python_functions = ""
+testpaths = ["src/soc"]
+required_plugins = ["pytest-xdist>=1.0.0", "pytest-output-to-files>=0.1.0"]
+addopts = [
+    "-n",
+    "auto",
+    "--shorten-output-dir=test-out",
+]
index 14cd4c6e9508c1a6b18bd5ace84897b9c9db6bb1..ddbdf8b4402fc7316616bf30bdad22b4386d4a88 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -8,16 +8,45 @@ NEWS = open(os.path.join(here, 'NEWS.txt')).read()
 
 version = '0.0.1'
 
+# the only reason this is added is because it's become a part of python 3.9.
+# the project standard is python 3.7 however in future that will be updated.
+# for now, cached_property is RELUCTANTLY added but a *copy* is added so
+# that the generation of HDL is not critically dependent on random crap
+# off the internet. you're spending USD 16 *MILLION* on masks, you better
+# be absolutely paranoid-level certain you know where every piece of the
+# chain creating the HDL comes from.
+cprop = "git+https://git.libre-soc.org/git/cached-property.git@1.5.2" \
+        "#egg=cached-property-1.5.2"
+
 # using pip3 for ongoing development is a royal pain.  seriously not
 # recommended.  therefore a number of these dependencies have been
 # commented out.  *they are still required* - they will need installing
 # manually.
 
+# XXX UNDER NO CIRCUMSTANCES ADD ARBITRARY DEPENDENCIES HERE. XXX
+# as this is HDL, not software, every dependency added is
+# a serious maintenance and reproducible-build problem.
+# dropping USD 16 million on 7nm Mask Charges when the
+# HDL can be compromised - accidentally or deliberately -
+# by pip3 going out and randomly downloading complete
+# shite is not going to do anyone any favours.
+
+# TODO: make *all* of these be from libre-soc git repo only
+# (which means updating the nmigen-soc one to mirror gitlab)
+
 install_requires = [
     #    'sfpy',    # needs manual patching
     'libresoc-ieee754fpu',   # uploaded (successfully, whew) to pip
     'libresoc-openpower-isa',  # uploaded (successfully, whew) to pip
     # 'nmigen-soc', # install manually from git.libre-soc.org
+
+    # git url needed for having `pip3 install -e .` install from libre-soc git
+    "cached-property@"+cprop,
+]
+
+# git url needed for having `setup.py develop` install from libre-soc git
+dependency_links = [
+    cprop,
 ]
 
 test_requires = [
@@ -34,7 +63,8 @@ setup(
     long_description_content_type='text/markdown',
     classifiers=[
         "Topic :: Software Development",
-        "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)",
+        "License :: OSI Approved :: " \
+            "GNU Lesser General Public License v3 or later (LGPLv3+)",
         "Programming Language :: Python :: 3",
         "Operating System :: OS Independent",
     ],
@@ -48,6 +78,7 @@ setup(
     include_package_data=True,
     zip_safe=False,
     install_requires=install_requires,
+    dependency_links=dependency_links,
     tests_require=test_requires,
     test_suite='nose.collector',
 )
diff --git a/src/soc/bus/external_core.py b/src/soc/bus/external_core.py
new file mode 100644 (file)
index 0000000..102e66c
--- /dev/null
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the external_core_top.v verilog module
+# which allows for faster development iteration (oh and microwatt or
+# other core to be dropped into a peripheral fabric)
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal, Const)
+from nmigen.cli import rtlil, verilog
+
+from soc.debug.dmi import DMIInterface
+from nmigen_soc.wishbone.bus import Interface
+import os
+
+__all__ = ["ExternalCore"]
+
+
+class ExternalCore(Elaboratable):
+    """External Core verilog wrapper for microwatt and libre-soc
+   (actually, anything prepared to map to the Signals defined below)
+   remember to call ExternalCore.add_verilog_source
+    """
+
+    def __init__(self, ibus=None, dbus=None, features=None, name=None):
+
+        # set up the icache wishbone bus
+        if features is None:
+            features = frozenset(("stall",))
+        if ibus is None:
+            ibus = Interface(addr_width=32,
+                            data_width=64,
+                            features=features,
+                            granularity=8,
+                            name="core_ibus")
+        if dbus is None:
+            dbus = Interface(addr_width=32,
+                            data_width=64,
+                            features=features,
+                            granularity=8,
+                            name="core_dbus")
+        self.dmi = DMIInterface(name="dmi")
+        self.ibus = ibus
+        self.dbus = dbus
+
+        assert len(self.ibus.dat_r) == 64, "bus width must be 64"
+        assert len(self.dbus.dat_r) == 64, "bus width must be 64"
+
+        # IRQ for data buffer receive/xmit
+        self.irq = Signal() 
+
+        # debug monitoring signals
+        self.nia = Signal(64)
+        self.nia_req = Signal()
+        self.msr = Signal(64)
+        self.ldst_addr = Signal(64)
+        self.ldst_req = Signal()
+
+        # alternative reset and termination indicator
+        self.alt_reset = Signal()
+        self.terminated_o = Signal()
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in ['external_core_top.v',
+                     ]:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # create definition of external core here, so that
+        # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        ibus, dbus, dmi = self.ibus, self.dbus, self.dmi
+
+        # sigh, microwatt wishbone address is borked, it contains the 3 LSBs
+        ibus_adr = Signal(32)
+        dbus_adr = Signal(32)
+        m.d.comb += ibus.adr.eq(ibus_adr[3:])
+        m.d.comb += dbus.adr.eq(dbus_adr[3:])
+
+        kwargs = {
+            # clock/reset signals
+            'i_clk': ClockSignal(),
+            'i_rst': ResetSignal(),
+            # DMI interface
+            'i_dmi_addr': dmi.addr_i,
+            'i_dmi_req': dmi.req_i,
+            'i_dmi_wr': dmi.we_i,
+            'i_dmi_din': dmi.din,
+            'o_dmi_dout': dmi.dout,
+            'o_dmi_ack': dmi.ack_o,
+            # debug/monitor signals
+            'o_nia': self.nia,
+            'o_nia_req': self.nia_req,
+            'o_msr_o': self.msr,
+            'o_ldst_addr': self.ldst_addr,
+            'o_ldst_req': self.ldst_req,
+            'i_alt_reset': self.alt_reset,
+            'o_terminated_out': self.terminated_o,
+            # wishbone instruction bus
+            'o_wishbone_insn_out.adr': ibus_adr,
+            'o_wishbone_insn_out.dat': ibus.dat_w,
+            'o_wishbone_insn_out.sel': ibus.sel,
+            'o_wishbone_insn_out.cyc': ibus.cyc,
+            'o_wishbone_insn_out.stb': ibus.stb,
+            'o_wishbone_insn_out.we': ibus.we,
+            'i_wishbone_insn_in.dat': ibus.dat_r,
+            'i_wishbone_insn_in.ack': ibus.ack,
+            'i_wishbone_insn_in.stall': ibus.stall,
+            # wishbone data bus
+            'o_wishbone_data_out.adr': dbus_adr,
+            'o_wishbone_data_out.dat': dbus.dat_w,
+            'o_wishbone_data_out.sel': dbus.sel,
+            'o_wishbone_data_out.cyc': dbus.cyc,
+            'o_wishbone_data_out.stb': dbus.stb,
+            'o_wishbone_data_out.we': dbus.we,
+            'i_wishbone_data_in.dat': dbus.dat_r,
+            'i_wishbone_data_in.ack': dbus.ack,
+            'i_wishbone_data_in.stall': dbus.stall,
+            # external interrupt request
+            'i_ext_irq': self.irq,
+        }
+        core = Instance("external_core_top", **kwargs)
+        m.submodules['core_top'] = core
+
+        return m
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    core = ExternalCore(name="core")
+    create_ilang(core, [
+                        core.ibus.cyc, core.ibus.stb, core.ibus.ack,
+                        core.ibus.dat_r, core.ibus.dat_w, core.ibus.adr,
+                        core.ibus.we, core.ibus.sel, core.ibus.stall,
+                        core.dbus.cyc, core.dbus.stb, core.dbus.ack,
+                        core.dbus.dat_r, core.dbus.dat_w, core.dbus.adr,
+                        core.dbus.we, core.dbus.sel,
+                        core.irq, core.alt_reset, core.terminated_o,
+                        core.msr, core.nia, core.nia_req,
+                        core.ldst_addr, core.ldst_req,
+                        core.dmi.addr_i, core.dmi.req_i, core.dmi.we_i,
+                        core.dmi.din, core.dmi.dout, core.dmi.ack_o,
+                       ], "core_0")
+
diff --git a/src/soc/bus/opencores_ethmac.py b/src/soc/bus/opencores_ethmac.py
new file mode 100644 (file)
index 0000000..cad4919
--- /dev/null
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2020-2022 Raptor Engineering LLC <support@raptorengineering.com>
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog 10/100 MAC
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+import os
+
+__all__ = ["EthMAC"]
+
+
+class EthMAC(Elaboratable):
+    """Ethernet MAC from opencores, nmigen wrapper.
+    remember to call EthMAC.add_verilog_source
+    """
+
+    def __init__(self, master_bus=None, slave_bus=None, name=None,
+                       irq=None, pins=None):
+        if name is not None:
+            # convention: give the name in the format "name_number"
+            self.idx = int(name.split("_")[-1])
+        else:
+            self.idx = 0
+            name = "eth_0"
+        self.granularity = 8
+        self.data_width = 32
+        self.dsize = log2_int(self.data_width//self.granularity)
+
+        # set up the wishbone busses
+        features = frozenset()
+        if master_bus is None:
+            master_bus = Interface(addr_width=30,
+                            data_width=32,
+                            features=features,
+                            granularity=8,
+                            name=name+"_wb_%d_0" % self.idx)
+        if slave_bus is None:
+            slave_bus = Interface(addr_width=12,
+                            data_width=32,
+                            features=features,
+                            granularity=8,
+                            name=name+"_wb_%d_1" % self.idx)
+        self.master_bus = master_bus
+        self.slave_bus = slave_bus
+        if irq is None:
+            irq = Signal()
+        self.irq = irq
+
+        slave_mmap = MemoryMap(addr_width=12+self.dsize,
+                        data_width=self.granularity)
+
+        self.slave_bus.memory_map = slave_mmap
+
+        # RMII TX signals
+        self.mtx_clk = Signal()
+        self.mtxd = Signal(4)
+        self.mtxen = Signal()
+        self.mtxerr = Signal()
+
+        # RMII RX signals
+        self.mrx_clk = Signal()
+        self.mrxd = Signal(4)
+        self.mrxdv = Signal()
+        self.mrxerr = Signal()
+
+        # RMII common signals
+        self.mcoll = Signal()
+        self.mcrs = Signal()
+
+        # RMII management interface signals
+        self.mdc = Signal()
+        self.md_in = Signal()
+        self.md_out = Signal()
+        self.md_direction = Signal()
+
+        # pins resource
+        self.pins = pins
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in ['eth_clockgen.v', 'eth_cop.v', 'eth_crc.v',
+                    'eth_fifo.v', 'eth_maccontrol.v', 'ethmac_defines.v',
+                    'eth_macstatus.v', 'ethmac.v', 'eth_miim.v',
+                    'eth_outputcontrol.v', 'eth_random.v',
+                    'eth_receivecontrol.v', 'eth_registers.v',
+                    'eth_register.v', 'eth_rxaddrcheck.v',
+                    'eth_rxcounters.v', 'eth_rxethmac.v',
+                    'eth_rxstatem.v', 'eth_shiftreg.v',
+                    'eth_spram_256x32.v', 'eth_top.v',
+                    'eth_transmitcontrol.v', 'eth_txcounters.v',
+                    'eth_txethmac.v', 'eth_txstatem.v', 'eth_wishbone.v',
+                    'timescale.v']:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        idx = self.idx
+
+        # Calculate arbiter bus address
+        wb_master_bus_adr = Signal(32)
+        # arbiter address is in words, ethernet master address is in bytes
+        comb += self.master_bus.adr.eq(wb_master_bus_adr >> 2)
+
+        # create definition of external verilog EthMAC code here, so that
+        # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        ethmac = Instance("eth_top",
+                            # Clock/reset (use DomainRenamer if needed)
+                            i_wb_clk_i=ClockSignal(),
+                            i_wb_rst_i=ResetSignal(),
+
+                            # Master Wishbone bus signals
+                            o_m_wb_adr_o=wb_master_bus_adr,
+                            i_m_wb_dat_i=self.master_bus.dat_r,
+                            o_m_wb_sel_o=self.master_bus.sel,
+                            o_m_wb_dat_o=self.master_bus.dat_w,
+                            o_m_wb_we_o=self.master_bus.we,
+                            o_m_wb_stb_o=self.master_bus.stb,
+                            o_m_wb_cyc_o=self.master_bus.cyc,
+                            i_m_wb_ack_i=self.master_bus.ack,
+
+                            # Slave Wishbone bus signals
+                            i_wb_adr_i=self.slave_bus.adr,
+                            i_wb_dat_i=self.slave_bus.dat_w,
+                            i_wb_sel_i=self.slave_bus.sel,
+                            o_wb_dat_o=self.slave_bus.dat_r,
+                            i_wb_we_i=self.slave_bus.we,
+                            i_wb_stb_i=self.slave_bus.stb,
+                            i_wb_cyc_i=self.slave_bus.cyc,
+                            o_wb_ack_o=self.slave_bus.ack,
+
+                            o_int_o=self.irq,
+
+                            # RMII TX
+                            i_mtx_clk_pad_i=self.mtx_clk,
+                            o_mtxd_pad_o=self.mtxd,
+                            o_mtxen_pad_o=self.mtxen,
+                            o_mtxerr_pad_o=self.mtxerr,
+
+                            # RMII RX
+                            i_mrx_clk_pad_i=self.mrx_clk,
+                            i_mrxd_pad_i=self.mrxd,
+                            i_mrxdv_pad_i=self.mrxdv,
+                            i_mrxerr_pad_i=self.mrxerr,
+
+                            # RMII common
+                            i_mcoll_pad_i=self.mcoll,
+                            i_mcrs_pad_i=self.mcrs,
+
+                            # Management Interface
+                            o_mdc_pad_o=self.mdc,
+                            i_md_pad_i=self.md_in,
+                            o_md_pad_o=self.md_out,
+                            o_md_padoe_o=self.md_direction
+                            );
+
+        m.submodules['ethmac_%d' % self.idx] = ethmac
+
+        if self.pins is not None:
+            comb += self.mtx_clk.eq(self.pins.mtx_clk.i)
+            comb += self.pins.mtxd.o.eq(self.mtxd)
+            comb += self.pins.mtxen.o.eq(self.mtxen)
+            comb += self.pins.mtxerr.o.eq(self.mtxerr)
+
+            comb += self.mrx_clk.eq(self.pins.mrx_clk.i)
+            comb += self.mrxd.eq(self.pins.mrxd.i)
+            comb += self.mrxdv.eq(self.pins.mrxdv.i)
+            comb += self.mrxerr.eq(self.pins.mrxerr.i)
+            comb += self.mcoll.eq(self.pins.mcoll.i)
+            comb += self.mcrs.eq(self.pins.mcrs.i)
+
+            comb += self.pins.mdc.o.eq(self.mdc)
+
+            comb += self.pins.md.o.eq(self.md_out)
+            comb += self.pins.md.oe.eq(self.md_direction)
+            comb += self.md_in.eq(self.pins.md.i)
+        return m
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+if __name__ == "__main__":
+    ethmac = EthMAC(name="eth_0")
+    create_ilang(ethmac, [ethmac.master_bus.cyc, ethmac.master_bus.stb,
+                        ethmac.master_bus.ack, ethmac.master_bus.dat_r,
+                        ethmac.master_bus.dat_w, ethmac.master_bus.adr,
+                        ethmac.master_bus.we, ethmac.master_bus.sel,
+                        ethmac.slave_bus.cyc, ethmac.slave_bus.stb,
+                        ethmac.slave_bus.ack,
+                        ethmac.slave_bus.dat_r, ethmac.slave_bus.dat_w,
+                        ethmac.slave_bus.adr,
+                        ethmac.slave_bus.we, ethmac.slave_bus.sel,
+                        ethmac.mtx_clk, ethmac.mtxd, ethmac.mtxen,
+                        ethmac.mtxerr, ethmac.mrx_clk, ethmac.mrxd,
+                        ethmac.mrxdv, ethmac.mrxerr, ethmac.mcoll,
+                        ethmac.mcrs, ethmac.mdc, ethmac.md_in,
+                        ethmac.md_out, ethmac.md_direction
+                       ], "eth_0")
+
diff --git a/src/soc/bus/sdr_ctrl.py b/src/soc/bus/sdr_ctrl.py
new file mode 100644 (file)
index 0000000..4b6799f
--- /dev/null
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog uart16550 module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal, Record)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen.cli import rtlil, verilog
+import os
+
+__all__ = ["SDRAM", "SDRAMConfig"]
+
+        """
+        class MT48LC16M16(SDRModule):
+            # geometry
+            nbanks = 4
+            nrows  = 8192
+            ncols  = 512
+            # timings
+            technology_timings = _TechnologyTimings(tREFI=64e6/8192,
+                                                    tWTR=(2, None),
+                                                    tCCD=(1, None),
+                                                    tRRD=(None, 15))
+            speedgrade_timings = {"default": _SpeedgradeTimings(tRP=20,
+                                                    tRCD=20,
+                                                    tWR=15,
+                                                    tRFC=(None, 66),
+                                                    tFAW=None,
+                                                    tRAS=44)}
+            # for MT48LC16M16-75 part
+            comb += self.cfg.sdr_en.eq(1)
+            comb += self.cfg.sdr_mode_reg.eq(0x033)
+            comb += self.cfg.req_depth.eq(3)    # max 
+            comb += self.cfg.sdr_tras_d.eq(44)  # Active to precharge delay
+            comb += self.cfg.sdr_trp_d.eq(20)   # Precharge to active delay
+            comb += self.cfg.sdr_trcd_d.eq(20)  # Active to R/W delay
+            comb += self.cfg.sdr_cas.eq(3)      # CAS latency
+            comb += self.cfg.sdr_trcar_d.eq(66) # tRFC auto-refresh period
+            comb += self.cfg.sdr_twr_d.eq(15) # clock + 7.5ns
+            comb += self.cfg.sdr_rfsh.eq(0x100)
+            comb += self.cfg.sdr_rfmax.eq(6)
+        """
+
+
+class SDRAMConfig(Record):
+    def __init__(self, refresh_timer_sz, refresh_row_count, name=None):
+        super().__init__(name=name, layout=[
+        # configuration parameters, these need to match the SDRAM IC datasheet
+                        ('req_depth', 2),       # max request accepted
+                        ('sdr_en', 1),          # Enable SDRAM controller
+                        ('sdr_mode_reg', 13),
+                        ('sdr_tras_d', 4),      # Active to precharge delay
+                        ('sdr_trp_d', 4),       # Precharge to active delay
+                        ('sdr_trcd_d', 4),      # Active to R/W delay
+                        ('sdr_cas', 3),         # SDRAM CAS Latency
+                        ('sdr_trcar_d', 4),     # Auto-refresh period
+                        ('sdr_twr_d', 4),       # Write recovery delay
+                        ('sdr_rfsh', refresh_timer_sz),
+                        ('sdr_rfmax', refresh_row_count)
+                    ])
+
+
+class SDRAM(Elaboratable):
+    """SDRAM controller from opencores, nmigen wrapper.  remember to call
+       SDRAM.add_verilog_source.
+
+    * the SDRAM IC will be accessible over the Wishbone Bus
+    * sdr_* signals must be wired to the IC
+    * cfg parameters must match those listed in the SDRAM IC's datasheet
+    """
+
+    def __init__(self, bus=None, features=None, name=None,
+                       data_width=32, addr_width=26,
+                       sdr_data_width=16,
+                       cfg=None,
+                       pins=None):
+        if name is not None:
+            name = "sdram"
+        self.data_width = data_width
+        self.sdr_data_width = sdr_data_width
+        self.addr_width = addr_width
+        self.refresh_timer_sz = 12
+        self.refresh_row_count = 3
+
+        # set up the wishbone bus
+        if features is None:
+            features = frozenset({'cti'})
+        if bus is None:
+            bus = Interface(addr_width=addr_width,
+                            data_width=data_width,
+                            features=features,
+                            granularity=8,
+                            name=name)
+        self.bus = bus
+        assert len(self.bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+
+        byte_width = sdr_data_width // 8 # for individual byte masks/enables
+
+        # SDRAM signals
+        self.sdram_clk     = Signal()           # sdram phy clock
+        self.sdram_resetn  = Signal(reset_less=True) # sdram reset (low)
+        self.sdr_cs_n      = Signal()           # chip select
+        self.sdr_cke       = Signal()           # clock-enable
+        self.sdr_ras_n     = Signal()           # read-address strobe
+        self.sdr_cas_n     = Signal()           # cas
+        self.sdr_we_n      = Signal()           # write-enable
+        self.sdr_dqm       = Signal(byte_width) # data mask
+        self.sdr_ba        = Signal(2)          # bank enable
+        self.sdr_addr      = Signal(13)         # sdram address, 13 bits
+        # these combine to create a bi-direction inout, sdr_dq
+        # note, each bit of sdr_den_n covers a *byte* of sdr_din/sdr_dout
+        self.sdr_den_n     = Signal(byte_width)
+        self.sdr_din       = Signal(data_width)
+        self.sdr_dout      = Signal(data_width)
+
+        # configuration parameters, these need to match the SDRAM IC datasheet
+        self.sdr_init_done       = Signal()  # Indicate SDRAM init Done
+        if cfg is None:
+            cfg = SDRAMConfig(self.refresh_timer_sz,
+                                   self.refresh_row_count, name="sdr_cfg")
+
+        # config and pins resource
+        self.pins = pins
+        self.cfg = cfg
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in [ './core/sdrc_bank_ctl.v', './core/sdrc_bank_fsm.v',
+                        './core/sdrc_bs_convert.v', './core/sdrc_core.v',
+                        './core/sdrc_req_gen.v', './core/sdrc_xfr_ctl.v',
+                        './core/sdrc_define.v',
+                        './lib/async_fifo.v', './lib/sync_fifo.v',
+                        './top/sdrc_top.v', './wb2sdrc/wb2sdrc.v',
+                     ]:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # create definition of external verilog 16550 uart here, so that                # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        bus = self.bus
+
+        params = {
+            # clock/reset (use DomainRenamer if needed)
+            'i_wb_clk_i' : ClockSignal(),
+            'i_wb_rst_i' : ResetSignal(),
+
+            # wishbone bus signals
+            'i_wb_adr_i' : bus.adr,
+            'i_wb_dat_i' : bus.dat_w,
+            'i_wb_sel_i' : bus.sel,
+            'o_wb_dat_o' : bus.dat_r,
+            'i_wb_we_i' : bus.we,
+            'i_wb_stb_i' : bus.stb,
+            'i_wb_cyc_i' : bus.cyc,
+            'o_wb_ack_o' : bus.ack,
+
+            # SDRAM signals
+            'i_sdram_clk'      :  self.sdram_clk,
+            'i_sdram_resetn'   :  self.sdram_resetn,
+            'o_sdr_cs_n'       :  self.sdr_cs_n,
+            'o_sdr_cke'        :  self.sdr_cke,
+            'o_sdr_ras_n'      :  self.sdr_ras_n,
+            'o_sdr_cas_n'      :  self.sdr_cas_n,
+            'o_sdr_we_n'       :  self.sdr_we_n,
+            'o_sdr_dqm'        :  self.sdr_dqm,
+            'o_sdr_ba'         :  self.sdr_ba,
+            'o_sdr_addr'       :  self.sdr_addr,
+            'o_sdr_den_n'      : self.sdr_den_n,
+            'i_sdr_din'        : self.sdr_din,
+            'o_sdr_dout'       : self.sdr_dout,
+
+            # configuration parameters (from the SDRAM IC datasheet)
+            'o_sdr_init_done'      : self.sdr_init_done       ,
+            'i_cfg_req_depth'      : self.cfg.req_depth       ,
+            'i_cfg_sdr_en'         : self.cfg.sdr_en          ,
+            'i_cfg_sdr_mode_reg'   : self.cfg.sdr_mode_reg    ,
+            'i_cfg_sdr_tras_d'     : self.cfg.sdr_tras_d      ,
+            'i_cfg_sdr_trp_d'      : self.cfg.sdr_trp_d       ,
+            'i_cfg_sdr_trcd_d'     : self.cfg.sdr_trcd_d      ,
+            'i_cfg_sdr_cas'        : self.cfg.sdr_cas         ,
+            'i_cfg_sdr_trcar_d'    : self.cfg.sdr_trcar_d     ,
+            'i_cfg_sdr_twr_d'      : self.cfg.sdr_twr_d       ,
+            'i_cfg_sdr_rfsh'       : self.cfg.sdr_rfsh        ,
+            'i_cfg_sdr_rfmax'      : self.cfg.sdr_rfmax,
+
+            # verilog parameters
+            'p_APP_AW'   : self.addr_width,    # Application Address Width
+            'p_APP_DW'   : self.data_width,    # Application Data Width
+            'p_APP_BW'   : self.addr_width//8, # Application Byte Width
+            'p_APP_RW'   : 9,                  # Application Request Width
+            'p_SDR_DW'   : self.sdr_data_width,    # SDR Data Width
+            'p_SDR_BW'   : self.sdr_data_width//8, # SDR Byte Width
+            'p_dw'       : self.data_width,    # data width
+            'p_tw'       : 8,   # tag id width
+            'p_bl'       : 9,   # burst_length_width
+        }
+        m.submodules['sdrc_top'] = Instance("sdrc_top", **params)
+
+        return m
+
+        if self.pins is not None:
+            comb += self.pins.tx.eq(self.tx_o)
+            comb += self.rx_i.eq(self.pins.rx)
+
+        return m
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    sdram = SDRAM(name="sdram", data_width=8)
+    create_ilang(sdram, [sdram.bus.cyc, sdram.bus.stb, sdram.bus.ack,
+                         sdram.bus.dat_r, sdram.bus.dat_w, sdram.bus.adr,
+                         sdram.bus.we, sdram.bus.sel,
+                         sdram.sdram_clk, sdram.sdram_resetn,
+                         sdram.sdr_cs_n, sdram.sdr_cke,
+                         sdram.sdr_ras_n, sdram.sdr_cas_n, sdram.sdr_we_n,
+                         sdram.sdr_dqm, sdram.sdr_ba, sdram.sdr_addr,
+                         sdram.sdr_den_n, sdram.sdr_din, sdram.sdr_dout,
+                         sdram.sdr_init_done, sdram.cfg.req_depth,
+                         sdram.cfg.sdr_en, sdram.cfg.sdr_mode_reg,
+                         sdram.cfg.sdr_tras_d, sdram.cfg.sdr_trp_d,
+                         sdram.cfg.sdr_trcd_d, sdram.cfg.sdr_cas,
+                         sdram.cfg.sdr_trcar_d, sdram.cfg.sdr_twr_d,
+                         sdram.cfg.sdr_rfsh, sdram.cfg.sdr_rfmax,
+                       ], "sdram")
+
index 9819302ff80e2ed2492efc6406bbe055dcecd901..f025211417ff28d42456b7b4f75f0b236cd6ba3b 100644 (file)
@@ -60,7 +60,7 @@ class SRAM(Elaboratable):
                             data_width=self.memory.width,
                             granularity=granularity,
                             features=features,
-                            alignment=0,
+                            #alignment=0,
                             name=None)
         self.bus = bus
         self.granularity = bus.granularity
diff --git a/src/soc/bus/syscon.py b/src/soc/bus/syscon.py
new file mode 100644 (file)
index 0000000..f3dcfc0
--- /dev/null
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Copyright (C) 2022 Raptor Engineering, LLC <support@raptorengineering.com>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a System Console peripheral compatible with microwatt
+# https://github.com/antonblanchard/microwatt/blob/master/syscon.vhdl
+
+from nmigen import (Elaboratable, Cat, Module, Signal)
+from nmigen.cli import rtlil, verilog
+
+from lambdasoc.periph import Peripheral
+
+__all__ = ["MicrowattSYSCON"]
+
+
+class MicrowattSYSCON(Peripheral, Elaboratable):
+    """Microwatt-compatible (Sys)tem (Con)figuration module
+    """
+
+    def __init__(self, *, sys_clk_freq=100e6,
+                          core_clk_freq=100e6,
+                          mem_clk_freq=100e6,
+                          spi_offset=None,
+                          dram_addr=None,
+                          has_uart=True,
+                          uart_is_16550=True
+                          ):
+        super().__init__(name="syscon")
+        self.sys_clk_freq = sys_clk_freq
+        self.core_clk_freq = core_clk_freq
+        self.mem_clk_freq = mem_clk_freq
+        self.has_uart = has_uart
+        self.spi_offset = spi_offset
+        self.dram_addr = dram_addr
+        self.uart_is_16550 = uart_is_16550
+
+        # System control ports
+        self.dram_at_0 = Signal()
+        self.core_reset = Signal()
+        self.soc_reset = Signal()
+
+        # set up a CSR Bank and associated bridge. has to be in this order
+        # (declare bank, declare bridge) for some unknown reason.
+        # (r)ead regs will have a r_stb and r_data Record entry
+        # (w)rite regs will have a w_stb and w_data Record entry
+        bank = self.csr_bank()
+        self._reg_sig_r       = bank.csr(64, "r") # signature
+        self._reg_info_r      = bank.csr(64, "r") # info
+        self._bram_info_r     = bank.csr(64, "r") # bram info
+        self._dram_info_r     = bank.csr(64, "r") # dram info
+        self._clk_info_r      = bank.csr(64, "r") # nest clock frequency
+        self._ctrl_info_r     = bank.csr(64, "rw") # control info
+        self._dram_init_r     = bank.csr(64, "r") # dram initialisation info
+        self._spiflash_info_r = bank.csr(64, "r") # spi flash info
+        self._uart0_info_r    = bank.csr(64, "r") # UART0 info (baud etc.)
+        self._uart1_info_r    = bank.csr(64, "r") # UART1 info (baud etc.)
+        self._bram_bootaddr_r = bank.csr(64, "r") # BRAM boot address
+        self._core_clk_info_r = bank.csr(64, "r") # core clock frequency
+        self._mem_clk_info_r  = bank.csr(64, "r") # memory clock frequency
+
+        # bridge the above-created CSRs over wishbone.  ordering and size
+        # above mattered, the bridge automatically packs them together
+        # as memory-addressable "things" for us
+        self._bridge = self.bridge(data_width=32, granularity=8, alignment=3)
+        self.bus = self._bridge.bus
+
+    def elaborate(self, platform):
+        m = Module()
+        comb, sync = m.d.comb, m.d.comb
+        m.submodules.bridge = self._bridge
+
+        # enter data into the CSRs. r_data can be left live all the time,
+        # w_data obviously has to be set only when w_stb triggers.
+
+        # identifying signature
+        comb += self._reg_sig_r.r_data.eq(0xf00daa5500010001)
+
+        # nest clock rate (hz)
+        comb += self._clk_info_r.r_data.eq(int(self.sys_clk_freq)) # in hz
+
+        # core clock rate (hz)
+        comb += self._core_clk_info_r.r_data.eq(int(self.core_clk_freq)) # in hz
+
+        # memory clock rate (hz)
+        comb += self._mem_clk_info_r.r_data.eq(int(self.mem_clk_freq)) # in hz
+
+        # detect peripherals
+        has_spi = self.spi_offset is not None
+        has_dram = self.dram_addr is not None
+
+        # uart peripheral clock rate, currently assumed to be system clock
+        # 0 ..31  : UART clock freq (in HZ)
+        #     32  : UART is 16550 (otherwise pp)
+        comb += self._uart0_info_r.r_data[0:32].eq(int(self.sys_clk_freq))
+        comb += self._uart0_info_r.r_data[32].eq(1)
+
+        # Reg Info, defines what peripherals and characteristics are present
+        comb += self._reg_info_r.r_data[0].eq(self.has_uart) # has UART0
+        comb += self._reg_info_r.r_data[1].eq(has_dram)      # has DDR DRAM
+        comb += self._reg_info_r.r_data[3].eq(has_spi)       # has SPI Flash
+        comb += self._reg_info_r.r_data[5].eq(1)             # Large SYSCON
+
+        # system control
+        sysctrl = Cat(self.dram_at_0, self.core_reset, self.soc_reset)
+        with m.If(self._ctrl_info_r.w_stb):
+            sync += sysctrl.eq(self._ctrl_info_r.w_data)
+        comb += self._ctrl_info_r.r_data.eq(sysctrl)
+
+        # SPI Flash Address
+        comb += self._spiflash_info_r.r_data.eq(self.spi_offset or 0)
+
+        return m
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    from nmigen_soc import wishbone
+    class QuickDemo(Elaboratable):
+        def elaborate(self, platform):
+            m = Module()
+            arbiter = wishbone.Arbiter(addr_width=30, data_width=32,
+                                       granularity=8)
+            decoder = wishbone.Decoder(addr_width=30, data_width=32,
+                                       granularity=8)
+            m.submodules.syscon = syscon = MicrowattSYSCON()
+            m.submodules.decoder = decoder
+            m.submodules.arbiter = arbiter
+            decoder.add(syscon.bus, addr=0xc0000000)
+            m.d.comb += arbiter.bus.connect(decoder.bus)
+            return m
+    m = QuickDemo()
+    create_ilang(m, None, "syscondemo")
+
diff --git a/src/soc/bus/tercel.py b/src/soc/bus/tercel.py
new file mode 100644 (file)
index 0000000..54ba925
--- /dev/null
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2020-2022 Raptor Engineering LLC <support@raptorengineering.com>
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog tercel module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal, Const)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+from nmutil.byterev import byte_reverse
+import os
+
+__all__ = ["Tercel"]
+
+
+class Tercel(Elaboratable):
+    """Tercel SPI controller from Raptor Engineering, nmigen wrapper.
+    remember to call Tercel.add_verilog_source
+    """
+
+    def __init__(self, bus=None, cfg_bus=None, features=None, name=None,
+                       data_width=32, spi_region_addr_width=28, pins=None,
+                       clk_freq=None,
+                       lattice_ecp5_usrmclk=False,
+                       adr_offset=0): # address offset (bytes)
+        if name is not None:
+            # convention: give the name in the format "name_number"
+            self.idx = int(name.split("_")[-1])
+        else:
+            self.idx = 0
+            name = "spi_0"
+        self.granularity = 8
+        self.data_width = data_width
+        self.dsize = log2_int(self.data_width//self.granularity)
+        self.adr_offset = adr_offset
+        self.lattice_ecp5_usrmclk = lattice_ecp5_usrmclk
+
+        # TODO, sort this out.
+        assert clk_freq is not None
+        clk_freq = round(clk_freq)
+        self.clk_freq = Const(clk_freq, 32) #clk_freq.bit_length())
+
+        # set up the wishbone busses
+        if features is None:
+            #features = frozenset({'err'}) # sigh
+            features = frozenset()
+        if bus is None:
+            bus = Interface(addr_width=spi_region_addr_width,
+                            data_width=data_width,
+                            features=features,
+                            granularity=8,
+                            name=name+"_wb_%d_0" % self.idx)
+        if cfg_bus is None:
+            cfg_bus = Interface(addr_width=6,
+                            data_width=data_width,
+                            features=features,
+                            granularity=8,
+                            name=name+"_wb_%d_1" % self.idx)
+        self.bus = bus
+        assert len(self.bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+        self.cfg_bus = cfg_bus
+        assert len(self.cfg_bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+
+        mmap = MemoryMap(addr_width=spi_region_addr_width+self.dsize,
+                        data_width=self.granularity)
+        cfg_mmap = MemoryMap(addr_width=6+self.dsize,
+                        data_width=self.granularity)
+
+        self.bus.memory_map = mmap
+        self.cfg_bus.memory_map = cfg_mmap
+
+        # QSPI signals
+        self.dq_out = Signal(4)       # Data
+        self.dq_direction = Signal(4)
+        self.dq_in = Signal(4)
+        self.cs_n_out = Signal()      # Slave select
+        self.spi_clk = Signal()       # Clock
+        self.dbg_port = Signal(8)     # debug info
+
+        # pins resource
+        self.pins = pins
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in ['wishbone_spi_master.v', 'phy.v']:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        pins, bus, cfg_bus = self.pins, self.bus, self.cfg_bus
+
+        # Calculate SPI flash address
+        spi_bus_adr = Signal(30)
+        # wb address is in words, offset is in bytes
+        comb += spi_bus_adr.eq(bus.adr - (self.adr_offset >> 2))
+
+        # urrr.... byte-reverse the config bus and data bus read/write
+        cdat_w = Signal.like(cfg_bus.dat_w)
+        cdat_r = Signal.like(cfg_bus.dat_r)
+        dat_w = Signal.like(bus.dat_w)
+        dat_r = Signal.like(bus.dat_r)
+        comb += cdat_w.eq(byte_reverse(m, "rv_cdat_w", cfg_bus.dat_w, 4))
+        comb += cfg_bus.dat_r.eq(byte_reverse(m, "rv_cdat_r", cdat_r, 4))
+        comb += dat_w.eq(byte_reverse(m, "rv_dat_w", bus.dat_w, 4))
+        comb += bus.dat_r.eq(byte_reverse(m, "rv_dat_r", dat_r, 4))
+
+        # create definition of external verilog Tercel code here, so that
+        # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        idx, bus = self.idx, self.bus
+        tercel = Instance("tercel_core",
+                            # System parameters
+                            i_sys_clk_freq = self.clk_freq,
+
+                            # Clock/reset (use DomainRenamer if needed)
+                            i_peripheral_clock=ClockSignal(),
+                            i_peripheral_reset=ResetSignal(),
+
+                            # SPI region Wishbone bus signals
+                            i_wishbone_adr=spi_bus_adr,
+                            i_wishbone_dat_w=dat_w,
+                            i_wishbone_sel=bus.sel,
+                            o_wishbone_dat_r=dat_r,
+                            i_wishbone_we=bus.we,
+                            i_wishbone_stb=bus.stb,
+                            i_wishbone_cyc=bus.cyc,
+                            o_wishbone_ack=bus.ack,
+                            #o_wishbone_err=bus.err,
+
+                            # Configuration region Wishbone bus signals
+                            i_cfg_wishbone_adr=cfg_bus.adr,
+                            i_cfg_wishbone_dat_w=cdat_w,
+                            i_cfg_wishbone_sel=cfg_bus.sel,
+                            o_cfg_wishbone_dat_r=cdat_r,
+                            i_cfg_wishbone_we=cfg_bus.we,
+                            i_cfg_wishbone_stb=cfg_bus.stb,
+                            i_cfg_wishbone_cyc=cfg_bus.cyc,
+                            o_cfg_wishbone_ack=cfg_bus.ack,
+                            #o_cfg_wishbone_err=cfg_bus.err,
+
+                            # QSPI signals
+                            o_spi_d_out=self.dq_out,
+                            o_spi_d_direction=self.dq_direction,
+                            i_spi_d_in=self.dq_in,
+                            o_spi_ss_n=self.cs_n_out,
+                            o_spi_clock=self.spi_clk,
+
+                            # debug port
+                            o_debug_port=self.dbg_port
+                            );
+
+        m.submodules['tercel_%d' % self.idx] = tercel
+
+        if pins is not None:
+            for i in range(4):
+                pad = getattr(pins, "dq%d" % i)
+                comb += pad.o.eq(self.dq_out[i])
+                comb += pad.oe.eq(self.dq_direction[i])
+                comb += self.dq_in[i].eq(pad.i)
+                # ECP5 needs special handling for the SPI clock, sigh.
+                if self.lattice_ecp5_usrmclk:
+                    comb += pad.o_clk.eq(ClockSignal())
+                    comb += pad.i_clk.eq(ClockSignal())
+            # XXX invert handled by SPIFlashResource
+            comb += pins.cs_n.eq(self.cs_n_out)
+            # ECP5 needs special handling for the SPI clock, sigh.
+            if self.lattice_ecp5_usrmclk:
+                m.submodules += Instance("USRMCLK",
+                    i_USRMCLKI  = self.spi_clk,
+                    i_USRMCLKTS = 0
+                )
+            else:
+                comb += pins.clk.eq(self.spi_clk)
+
+        return m
+
+    def ports(self):
+        return [self.bus.cyc, self.bus.stb, self.bus.ack,
+                        self.bus.dat_r, self.bus.dat_w, self.bus.adr,
+                        self.bus.we, self.bus.sel,
+                        self.cfg_bus.cyc, self.cfg_bus.stb,
+                        self.cfg_bus.ack,
+                        self.cfg_bus.dat_r, self.cfg_bus.dat_w,
+                        self.cfg_bus.adr,
+                        self.cfg_bus.we, self.cfg_bus.sel,
+                        self.dq_out, self.dq_direction, self.dq_in,
+                        self.cs_n_out, self.spi_clk
+                       ]
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    tercel = Tercel(name="spi_0", data_width=32, clk_freq=100e6)
+    create_ilang(tercel, tercel.ports(), "spi_0")
+
index 8ee79b0c03fc2e0ff216c3636835d77c428488d6..8a43e88be1b1a29e9472faceabb4630fbd1bfac1 100644 (file)
@@ -2,13 +2,14 @@
 """
 
 
-def wb_write(bus, addr, data, sel=True):
+def wb_write(bus, addr, data, sel=0b1111):
 
     # write wb
     yield bus.we.eq(1)
     yield bus.cyc.eq(1)
     yield bus.stb.eq(1)
-    yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+    #yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+    yield bus.sel.eq(sel)
     yield bus.adr.eq(addr)
     yield bus.dat_w.eq(data)
 
@@ -33,13 +34,14 @@ def wb_write(bus, addr, data, sel=True):
     yield bus.dat_w.eq(0)
 
 
-def wb_read(bus, addr, sel=True):
+def wb_read(bus, addr, sel=0b1111):
 
     # read wb
     yield bus.cyc.eq(1)
     yield bus.stb.eq(1)
     yield bus.we.eq(0)
-    yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+    #yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+    yield bus.sel.eq(sel)
     yield bus.adr.eq(addr)
 
     # wait for ack to go high
diff --git a/src/soc/bus/uart_16550.py b/src/soc/bus/uart_16550.py
new file mode 100644 (file)
index 0000000..1a900ee
--- /dev/null
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog uart16550 module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen.cli import rtlil, verilog
+import os
+import tempfile
+
+__all__ = ["UART16550"]
+
+
+class UART16550(Elaboratable):
+    """16550 UART from opencores, nmigen wrapper.  remember to call
+       UART16550.add_verilog_source
+    """
+
+    def __init__(self, bus=None, features=None, name=None, data_width=32,
+                       pins=None, irq=None):
+        if name is not None:
+            # convention: give the name in the format "name_number"
+            self.idx = int(name.split("_")[-1])
+        else:
+            self.idx = 0
+            name = "uart_0"
+        self.data_width = data_width
+
+        # set up the wishbone bus
+        if features is None:
+            features = frozenset()
+        if bus is None:
+            bus = Interface(addr_width=5,
+                            data_width=data_width,
+                            features=features,
+                            granularity=8,
+                            name=name+"_wb_%d" % self.idx)
+        self.bus = bus
+        assert len(self.bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+
+        # IRQ for data buffer receive/xmit
+        if irq is None:
+            irq = Signal()
+        self.irq = irq
+
+        # 9-pin UART signals (if anyone still remembers those...)
+        self.tx_o = Signal() # transmit
+        self.rx_i = Signal() # receive
+        self.rts_o = Signal() # ready to send
+        self.cts_i = Signal() # clear to send
+        self.dtr_o = Signal() # data terminal ready
+        self.dsr_i = Signal() # data send ready
+        self.ri_i = Signal() # can't even remember what this is!
+        self.dcd_i = Signal() # or this!
+
+        # pins resource
+        self.pins = pins
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # create a temp file containing "`define DATA_BUS_WIDTH_8"
+        t = tempfile.NamedTemporaryFile(delete=False, suffix=".v")
+        t.write("`define DATA_BUS_WIDTH_8\n".encode())
+        t.flush()
+        t.seek(0)
+        platform.add_file(t.name, t)
+
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in ['raminfr.v', 'uart_defines.v', 'uart_rfifo.v',
+                      'uart_top.v', 'timescale.v', 'uart_receiver.v',
+                      'uart_sync_flops.v', 'uart_transmitter.v',
+                      'uart_debug_if.v', 'uart_regs.v',
+                      'uart_tfifo.v', 'uart_wb.v'
+                     ]:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # create definition of external verilog 16550 uart here, so that                # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        idx, bus = self.idx, self.bus
+        uart = Instance("uart_top",
+                            # clock/reset (use DomainRenamer if needed)
+                            i_wb_clk_i=ClockSignal(),
+                            i_wb_rst_i=ResetSignal(),
+                            # wishbone bus signals
+                            i_wb_adr_i=bus.adr,
+                            i_wb_dat_i=bus.dat_w,
+                            i_wb_sel_i=bus.sel,
+                            o_wb_dat_o=bus.dat_r,
+                            i_wb_we_i=bus.we,
+                            i_wb_stb_i=bus.stb,
+                            i_wb_cyc_i=bus.cyc,
+                            o_wb_ack_o=bus.ack,
+                            # interrupt line
+                            o_int_o=self.irq,
+                            # 9-pin RS232/UART signals
+                            o_stx_pad_o=self.tx_o,
+                            i_srx_pad_i=self.rx_i,
+                            o_rts_pad_o=self.rts_o,
+                            i_cts_pad_i=self.cts_i,
+                            o_dtr_pad_o=self.dtr_o,
+                            i_dsr_pad_i=self.dsr_i,
+                            i_ri_pad_i=self.ri_i,
+                            i_dcd_pad_i=self.dcd_i
+                            );
+
+        m.submodules['uart16550_%d' % self.idx] = uart
+
+        if self.pins is not None:
+            comb += self.pins.tx.eq(self.tx_o)
+            comb += self.rx_i.eq(self.pins.rx)
+
+        return m
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    uart = UART16550(name="uart_0", data_width=8)
+    create_ilang(uart, [uart.bus.cyc, uart.bus.stb, uart.bus.ack,
+                        uart.bus.dat_r, uart.bus.dat_w, uart.bus.adr,
+                        uart.bus.we, uart.bus.sel,
+                        uart.irq,
+                        uart.tx_o, uart.rx_i, uart.rts_o, uart.cts_i,
+                        uart.dtr_o, uart.dsr_i, uart.ri_i, uart.dcd_i
+                       ], "uart_0")
+
diff --git a/src/soc/bus/wb_async.py b/src/soc/bus/wb_async.py
new file mode 100644 (file)
index 0000000..5e024c3
--- /dev/null
@@ -0,0 +1,179 @@
+# Copyright (C) 2022 Raptor Engineering, LLC <support@raptorengineering.com>
+#
+# Based partly on code from LibreSoC
+#
+# Modifications for the Libre-SOC Project funded by NLnet and NGI POINTER
+# under EU Grants 871528 and 957073, under the LGPLv3+ License
+#
+# this is a wrapper around the Verilog Wishbone Components wb_async_reg module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal, Const)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+from nmutil.byterev import byte_reverse
+import os
+
+__all__ = ["WBAsyncBridge"]
+
+
+class WBAsyncBridge(Elaboratable):
+    """Verilog Wishbone Components wb_async_reg module, nmigen wrapper.
+    remember to call WBAsyncBridge.add_verilog_source
+    """
+
+    def __init__(self, master_bus=None, slave_bus=None, master_features=None,
+                       slave_features=None, name=None,
+                       address_width=30, data_width=32, granularity=8,
+                       master_clock_domain=None, slave_clock_domain=None):
+        if name is not None:
+            # convention: give the name in the format "name_number"
+            self.idx = int(name.split("_")[-1])
+        else:
+            self.idx = 0
+            name = "wbasyncbridge_0"
+        self.address_width = address_width
+        self.data_width = data_width
+        self.granularity = granularity
+        self.dsize = log2_int(self.data_width//self.granularity)
+
+        # set up the clock domains
+        if master_clock_domain is None:
+            self.wb_mclk = ClockSignal()
+            self.wb_mrst = ResetSignal()
+        else:
+            self.wb_mclk = ClockSignal(master_clock_domain)
+            self.wb_mrst = ResetSignal(master_clock_domain)
+        if slave_clock_domain is None:
+            self.wb_sclk = ClockSignal()
+            self.wb_srst = ResetSignal()
+        else:
+            self.wb_sclk = ClockSignal(slave_clock_domain)
+            self.wb_srst = ResetSignal(slave_clock_domain)
+
+        # set up the wishbone busses
+        if master_features is None:
+            master_features = frozenset()
+        if slave_features is None:
+            slave_features = frozenset()
+        if master_bus is None:
+            master_bus = Interface(addr_width=self.address_width,
+                            data_width=self.data_width,
+                            features=master_features,
+                            granularity=self.granularity,
+                            name=name+"_wb_%d_master" % self.idx)
+        if slave_bus is None:
+            slave_bus = Interface(addr_width=self.address_width,
+                            data_width=self.data_width,
+                            features=slave_features,
+                            granularity=self.granularity,
+                            name=name+"_wb_%d_slave" % self.idx)
+        self.master_bus = master_bus
+        assert len(self.master_bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+        self.slave_bus = slave_bus
+        assert len(self.slave_bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in ['wb_async_reg.v']:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        master_bus, slave_bus = self.master_bus, self.slave_bus
+        slave_err = Signal()
+        slave_rty = Signal()
+
+        # create definition of external verilog bridge code here, so that
+        # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        idx = self.idx
+        wb_async_bridge = Instance("wb_async_reg",
+                            # Parameters
+                            p_ADDR_WIDTH=self.address_width,
+                            p_DATA_WIDTH=self.data_width,
+                            # width of select is the data width
+                            # *divided* by the data granularity.
+                            # data_width=32-bit, data granularity=8-bit,
+                            # select_width ==> 32/8 ==> 4
+                            p_SELECT_WIDTH=self.data_width//self.granularity,
+
+                            # Clocks/resets
+                            i_wbm_clk=self.wb_mclk,
+                            i_wbm_rst=self.wb_mrst,
+                            i_wbs_clk=self.wb_sclk,
+                            i_wbs_rst=self.wb_srst,
+
+                            # Master Wishbone bus signals
+                            i_wbm_adr_i=self.master_bus.adr,
+                            i_wbm_dat_i=self.master_bus.dat_w,
+                            o_wbm_dat_o=self.master_bus.dat_r,
+                            i_wbm_we_i=self.master_bus.we,
+                            i_wbm_sel_i=self.master_bus.sel,
+                            i_wbm_stb_i=self.master_bus.stb,
+                            i_wbm_cyc_i=self.master_bus.cyc,
+                            o_wbm_ack_o=self.master_bus.ack,
+                            #o_wbm_err=self.master_bus.err,
+                            #o_wbm_rty_i=self.master_bus.rty,
+
+                            # Slave Wishbone bus signals
+                            o_wbs_adr_o=self.slave_bus.adr,
+                            i_wbs_dat_i=self.slave_bus.dat_r,
+                            o_wbs_dat_o=self.slave_bus.dat_w,
+                            o_wbs_we_o=self.slave_bus.we,
+                            o_wbs_sel_o=self.slave_bus.sel,
+                            o_wbs_stb_o=self.slave_bus.stb,
+                            o_wbs_cyc_o=self.slave_bus.cyc,
+                            i_wbs_ack_i=self.slave_bus.ack,
+                            i_wbs_err_i=slave_err,
+                            i_wbs_rty_i=slave_rty
+                            );
+
+        # Wire unused signals to 0
+        comb += slave_err.eq(0)
+        comb += slave_rty.eq(0)
+
+        m.submodules['wb_async_bridge_%d' % self.idx] = wb_async_bridge
+
+        return m
+
+    def ports(self):
+        return [self.master_bus.adr, self.master_bus.dat_w,
+                        self.master_bus.dat_r,
+                        self.master_bus.we, self.master_bus.sel,
+                        self.master_bus.stb,
+                        self.master_bus.cyc, self.master_bus.ack,
+                        self.master_bus.err,
+                        self.master_bus.rty,
+                        self.slave_bus.adr, self.slave_bus.dat_w,
+                        self.slave_bus.dat_r,
+                        self.slave_bus.we, self.slave_bus.sel,
+                        self.slave_bus.stb,
+                        self.slave_bus.cyc, self.slave_bus.ack,
+                        self.slave_bus.err,
+                        self.slave_bus.rty
+                       ]
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    wbasyncbridge = WBAsyncBridge(name="wbasyncbridge_0", address_width=30, data_width=32, granularity=8)
+    create_ilang(wbasyncbridge, wbasyncbridge.ports(), "wbasyncbridge_0")
index 2fe2a921c4631a54ee67b06e3fab276ecbe36e92..fbf8239fe1c1ce157a8abb3e77ad0530c4058a3c 100644 (file)
@@ -47,20 +47,17 @@ class WishboneDownConvert(Elaboratable):
         shift_reg = Signal(dw_from)
 
         counter = Signal(log2_int(ratio, False))
-        counter_reset = Signal()
-        counter_ce = Signal()
-        with m.If(counter_reset):
-            sync += counter.eq(0)
-        with m.Elif(counter_ce):
-            sync += counter.eq(counter + 1)
+        cur_counter = Signal(log2_int(ratio, False))
 
         counter_done = Signal()
         comb += counter_done.eq(counter == ratio-1)
+        comb += cur_counter.eq(counter)
+        skip = Signal()
 
         # Main FSM
         with m.FSM() as fsm:
             with m.State("IDLE"):
-                comb += counter_reset.eq(1)
+                sync += counter.eq(0)
                 sync += cached_data.eq(0)
                 with m.If(master.stb & master.cyc):
                     with m.If(master.we):
@@ -70,12 +67,13 @@ class WishboneDownConvert(Elaboratable):
 
             with m.State("WRITE"):
                 comb += write.eq(1)
-                comb += slave.we.eq(1)
-                comb += slave.cyc.eq(1)
                 with m.If(master.stb & master.cyc):
+                    comb += skip.eq(slave.sel == 0)
+                    comb += slave.we.eq(1)
+                    comb += slave.cyc.eq(1)
                     comb += slave.stb.eq(1)
-                    with m.If(slave.ack):
-                        comb += counter_ce.eq(1)
+                    with m.If(slave.ack | skip):
+                        sync += counter.eq(counter + 1)
                         with m.If(counter_done):
                             comb += master.ack.eq(1)
                             m.next = "IDLE"
@@ -84,11 +82,13 @@ class WishboneDownConvert(Elaboratable):
 
             with m.State("READ"):
                 comb += read.eq(1)
-                comb += slave.cyc.eq(1)
                 with m.If(master.stb & master.cyc):
+                    comb += skip.eq(slave.sel == 0)
+                    comb += slave.cyc.eq(1)
                     comb += slave.stb.eq(1)
-                    with m.If(slave.ack):
-                        comb += counter_ce.eq(1)
+                    with m.If(slave.ack | skip):
+                        comb += cur_counter.eq(counter + 1) # TODO use Picker
+                        sync += counter.eq(cur_counter)
                         with m.If(counter_done):
                             comb += master.ack.eq(1)
                             comb += master.dat_r.eq(shift_reg)
@@ -102,7 +102,7 @@ class WishboneDownConvert(Elaboratable):
                 comb += slave.cti.eq(7) # indicate end of burst
             with m.Else():
                 comb += slave.cti.eq(2)
-        comb += slave.adr.eq(Cat(counter, master.adr))
+        comb += slave.adr.eq(Cat(cur_counter, master.adr))
 
         # write Datapath - select fragments of data, depending on "counter"
         with m.Switch(counter):
@@ -117,7 +117,7 @@ class WishboneDownConvert(Elaboratable):
         # read Datapath - uses cached_data and master.dat_r as a shift-register.
         # by the time "counter" is done (counter_done) this is complete
         comb += shift_reg.eq(Cat(cached_data[dw_to:], slave.dat_r))
-        with m.If(read & counter_ce):
+        with m.If(read & (slave.ack | skip)):
             sync += cached_data.eq(shift_reg)
 
 
index a73a89bc6c5b2fe8962d0072a0b2939010ffd3fe..35a9ddec0d230aa8f6354871faad6aa202dd7a33 100644 (file)
@@ -18,6 +18,18 @@ class ConfigFetchUnit:
                    'bare_wb': BareFetchUnit,
                    #'test_cache_wb': TestCacheFetchUnit
                   }
+        self.pspec = pspec
+        if self.pspec.imem_ifacetype in ['mmu_cache_wb', 'test_mmu_cache_wb']:
+            # XXX BLECH! use pspec to transfer the I-Cache which is
+            # created down inside LoadStore1!
+            self.fu = icache = pspec.icache # ICache already FetchUnitInterface
+            # tell I-Cache to connect up to its FetchUnitInterface
+            icache.use_fetch_interface()
+            return
+
         fukls = fudict[pspec.imem_ifacetype]
         self.fu = fukls(pspec)
 
+    def wb_bus(self):
+        return self.fu.ibus
+
index 95129b1999e733b44c5f46c265d9b1c478c4a8f4..9ebe4f7cd2ee18f0af9545fef246298a58401095 100644 (file)
@@ -98,7 +98,7 @@ def load_pinouts(chipname=None):
 
     # path is relative to this filename, in the pinmux submodule
     pinmux = os.getenv("PINMUX", "%s/../../../pinmux" % pth)
-    fname = "%s/%s/litex_pinpads.json" % (pinmux, chipname)
+    fname = "%s/%s/fabric_pinpads.json" % (pinmux, chipname)
     with open(fname) as f:
         txt = f.read()
 
index 5c4097a54fe963bb3b4f2ddf6ad66b90fcef8933..39437b3c94d63ee86785c8350438f72238e6b595 100644 (file)
@@ -13,13 +13,14 @@ import sys
 sys.setrecursionlimit(10**6)
 
 
-def read_from_addr(dut, addr):
+def read_from_addr(dut, addr, stall=True):
     yield dut.a_pc_i.eq(addr)
     yield dut.a_i_valid.eq(1)
     yield dut.f_i_valid.eq(1)
-    yield dut.a_stall_i.eq(1)
-    yield
-    yield dut.a_stall_i.eq(0)
+    if stall:
+        yield dut.a_stall_i.eq(1)
+        yield
+        yield dut.a_stall_i.eq(0)
     yield
     yield Settle()
     while (yield dut.f_busy_o):
index 1009d6f18a6d68415f83c143bd921dfb61645470..96cadef23e4c6fd3e4a7c177cc703a664c5d6c3c 100644 (file)
@@ -24,8 +24,9 @@ def wait_addr(port,debug=None):
     cnt = 0
     while True:
         addr_ok = yield port.addr_ok_o
-        print("addrok", addr_ok,cnt,debug)
-        if addr_ok:
+        exc_happened = yield port.exc_o.happened
+        print("addrok", addr_ok,cnt,debug,exc_happened)
+        if addr_ok or exc_happened:
             break
         yield
         cnt += 1
@@ -43,7 +44,7 @@ def wait_ldok(port):
         yield
 
 
-def pi_st(port1, addr, data, datalen, msr_pr=0, is_dcbz=0):
+def pi_st(port1, addr, data, datalen, msr, is_dcbz=0):
 
     # have to wait until not busy
     yield from wait_busy(port1,debug="pi_st_A") # wait while busy
@@ -52,13 +53,39 @@ def pi_st(port1, addr, data, datalen, msr_pr=0, is_dcbz=0):
     yield port1.is_dcbz_i.eq(is_dcbz)  # reset dcbz too
     yield port1.is_st_i.eq(1)  # indicate ST
     yield port1.data_len.eq(datalen)  # ST length (1/2/4/8)
-    yield port1.msr_pr.eq(msr_pr)  # MSR PR bit (1==>virt, 0==>real)
+    yield port1.priv_mode.eq(~msr.pr)
+    yield port1.virt_mode.eq(msr.dr)
+    yield port1.mode_32bit.eq(~msr.sf)
 
     yield port1.addr.data.eq(addr)  # set address
     yield port1.addr.ok.eq(1)  # set ok
     yield Settle()
+
+    # must check exception even before waiting for address.
+    # XXX TODO: wait_addr should check for exception
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        print("print fast ST exception happened")
+        yield # MUST wait for one clock cycle before de-asserting these
+        yield port1.is_st_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+        return "fast", exc_info
+
     yield from wait_addr(port1)             # wait until addr ok
 
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        print("print fast ST exception happened")
+        yield # MUST wait for one clock cycle before de-asserting these
+        yield port1.is_st_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+        return "fast", exc_info
+
+
     # yield # not needed, just for checking
     # yield # not needed, just for checking
     # assert "ST" for one cycle (required by the API)
@@ -67,7 +94,6 @@ def pi_st(port1, addr, data, datalen, msr_pr=0, is_dcbz=0):
     yield
     yield port1.st.ok.eq(0)
     exc_info = yield from get_exception_info(port1.exc_o)
-    dar_o = yield port1.dar_o
     exc_happened = exc_info.happened
     if exc_happened:
         print("print fast ST exception happened")
@@ -75,11 +101,10 @@ def pi_st(port1, addr, data, datalen, msr_pr=0, is_dcbz=0):
         yield port1.is_st_i.eq(0)  # end
         yield port1.addr.ok.eq(0)  # set !ok
         yield port1.is_dcbz_i.eq(0)  # reset dcbz too
-        return "fast", exc_info, dar_o
+        return "fast", exc_info
 
     yield from wait_busy(port1,debug="pi_st_E") # wait while busy
     exc_info = yield from get_exception_info(port1.exc_o)
-    dar_o = yield port1.dar_o
     exc_happened = exc_info.happened
     if exc_happened:
         yield  # needed if mmu/dache is used
@@ -87,7 +112,7 @@ def pi_st(port1, addr, data, datalen, msr_pr=0, is_dcbz=0):
         yield port1.addr.ok.eq(0)  # set !ok
         yield port1.is_dcbz_i.eq(0)  # reset dcbz too
         yield  # needed if mmu/dache is used
-        return "slow", exc_info, dar_o
+        return "slow", exc_info
 
     # can go straight to reset.
     yield port1.is_st_i.eq(0)  # end
@@ -95,7 +120,7 @@ def pi_st(port1, addr, data, datalen, msr_pr=0, is_dcbz=0):
     yield port1.is_dcbz_i.eq(0)  # reset dcbz too
     yield  # needed if mmu/dache is used
 
-    return None, None, None
+    return None, None
 
 def get_exception_info(exc_o):
     attrs = []
@@ -108,7 +133,7 @@ def get_exception_info(exc_o):
 
 # copy of pi_st removed
 
-def pi_ld(port1, addr, datalen, msr_pr=0):
+def pi_ld(port1, addr, datalen, msr):
 
     # have to wait until not busy
     yield from wait_busy(port1,debug="pi_ld_A") # wait while busy
@@ -116,27 +141,27 @@ def pi_ld(port1, addr, datalen, msr_pr=0):
     # set up a LD on the port.  address first:
     yield port1.is_ld_i.eq(1)  # indicate LD
     yield port1.data_len.eq(datalen)  # LD length (1/2/4/8)
-    yield port1.msr_pr.eq(msr_pr)  # MSR PR bit (1==>virt, 0==>real)
+    yield port1.priv_mode.eq(~msr.pr)
+    yield port1.virt_mode.eq(msr.dr)
+    yield port1.mode_32bit.eq(~msr.sf)
 
     yield port1.addr.data.eq(addr)  # set address
     yield port1.addr.ok.eq(1)  # set ok
     yield Settle()
     yield from wait_addr(port1)             # wait until addr ok
     exc_info = yield from get_exception_info(port1.exc_o)
-    dar_o = yield port1.dar_o
     exc_happened = exc_info.happened
     if exc_happened:
         print("print fast LD exception happened")
         yield # MUST wait for one clock cycle before de-asserting these
         yield port1.is_ld_i.eq(0)  # end
         yield port1.addr.ok.eq(0)  # set !ok
-        return None, "fast", exc_info, dar_o
+        return None, "fast", exc_info
 
     yield
     yield from wait_ldok(port1)             # wait until ld ok
     data = yield port1.ld.data
     exc_info = yield from get_exception_info(port1.exc_o)
-    dar_o = yield port1.dar_o
     exc_happened = yield port1.exc_o.happened
     exc_happened = exc_info.happened
 
@@ -144,20 +169,19 @@ def pi_ld(port1, addr, datalen, msr_pr=0):
     yield port1.is_ld_i.eq(0)  # end
     yield port1.addr.ok.eq(0)  # set !ok
     if exc_happened:
-        return None, "slow", exc_info, dar_o
+        return None, "slow", exc_info
 
     yield from wait_busy(port1, debug="pi_ld_E") # wait while busy
 
     exc_info = yield from get_exception_info(port1.exc_o)
-    dar_o = yield port1.dar_o
     exc_happened = exc_info.happened
     if exc_happened:
-        return None, "slow", exc_info, dar_o
+        return None, "slow", exc_info
 
-    return data, None, None, None
+    return data, None, None
 
 
-def pi_ldst(arg, dut, msr_pr=0):
+def pi_ldst(arg, dut, msr):
 
     # do two half-word stores at consecutive addresses, then two loads
     addr1 = 0x04
@@ -165,10 +189,10 @@ def pi_ldst(arg, dut, msr_pr=0):
     data = 0xbeef
     data2 = 0xf00f
     #data = 0x4
-    assert(yield from pi_st(dut, addr1, data, 2, msr_pr) is None)
-    assert(yield from pi_st(dut, addr2, data2, 2, msr_pr) is None)
-    result, exc = yield from pi_ld(dut, addr1, 2, msr_pr)
-    result2, exc2 = yield from pi_ld(dut, addr2, 2, msr_pr)
+    assert(yield from pi_st(dut, addr1, data, 2, msr) is None)
+    assert(yield from pi_st(dut, addr2, data2, 2, msr) is None)
+    result, exc = yield from pi_ld(dut, addr1, 2, msr)
+    result2, exc2 = yield from pi_ld(dut, addr2, 2, msr)
     assert(exc is None)
     assert(exc2 is None)
     arg.assertEqual(data, result, "data %x != %x" % (result, data))
@@ -176,7 +200,7 @@ def pi_ldst(arg, dut, msr_pr=0):
 
     # now load both in a 32-bit load to make sure they're really consecutive
     data3 = data | (data2 << 16)
-    result3, exc3 = yield from pi_ld(dut, addr1, 4, msr_pr)
+    result3, exc3 = yield from pi_ld(dut, addr1, 4, msr)
     assert(exc3 is None)
     arg.assertEqual(data3, result3, "data3 %x != %x" % (result3, data3))
 
@@ -187,7 +211,7 @@ def tst_config_pi(testcls, ifacetype):
     dut = Module()
     pspec = TestMemPspec(ldst_ifacetype=ifacetype,
                          imem_ifacetype='',
-                         addr_wid=48,
+                         addr_wid=64,
                          mask_wid=8,
                          reg_wid=64)
     cmpi = ConfigMemoryPortInterface(pspec)
@@ -202,8 +226,9 @@ def tst_config_pi(testcls, ifacetype):
                    vcd_name='test_pi_%s.vcd' % ifacetype)
 
 
+# FIXME: TypeError: pi_ldst() missing 1 required positional argument: 'msr'
+@unittest.skip('broken')
 class TestPIMem(unittest.TestCase):
-
     def test_pi_mem(self):
         tst_config_pi(self, 'testpi')
 
diff --git a/src/soc/debug/.gitignore b/src/soc/debug/.gitignore
new file mode 100644 (file)
index 0000000..8edaee0
--- /dev/null
@@ -0,0 +1 @@
+ls180_pins.py
index 8a6686df5c26af6d26052598552d0e7ad3fb4921..03bd8dc8eabcde75a191147a666ad9086b120ac5 100644 (file)
@@ -11,12 +11,13 @@ from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
 from nmigen.cli import rtlil
 from soc.config.state import CoreState
+from openpower.consts import FastRegsEnum
 
 
 # DMI register addresses
 class DBGCore:
-    CTRL         = 0b0000
-    STAT         = 0b0001
+    CTRL         = 0b0000 # Control: start/stop/reset
+    STAT         = 0b0001 # Status (read started/stopped/stopping)
     NIA          = 0b0010 # NIA register (read only for now)
     MSR          = 0b0011 # MSR (read only)
     GSPR_IDX     = 0b0100 # GSPR register index
@@ -26,6 +27,7 @@ class DBGCore:
     CR           = 0b1000 # CR (read only)
     XER          = 0b1001 # XER (read only) - note this is a TEMPORARY hack
     SVSTATE      = 0b1010 # SVSTATE register (read only for now)
+    STOPADDR     = 0b1011 # Address at which the core automatically stops
 
 
 # CTRL register (direct actions, write 1 to act, read back 0)
@@ -105,14 +107,10 @@ class CoreDebug(Elaboratable):
         self.core_stopped_i = Signal()
         self.state = CoreState("core_dbg")
 
-        # GSPR register read port
-        self.d_gpr = DbgReg("d_gpr")
-
-        # CR register read port
-        self.d_cr = DbgReg("d_cr")
-
-        # XER register read port
-        self.d_xer = DbgReg("d_xer")
+        self.d_gpr = DbgReg("d_gpr") # GSPR register read port
+        self.d_fast = DbgReg("d_fast") # GSPR register read port
+        self.d_cr = DbgReg("d_cr")   # CR register read port
+        self.d_xer = DbgReg("d_xer") # XER register read port
 
         # Core logging data
         self.log_data_i        = Signal(256)
@@ -120,6 +118,10 @@ class CoreDebug(Elaboratable):
         self.log_read_data_o   = Signal(64)
         self.log_write_addr_o  = Signal(32)
 
+        # address at which the processor stops automatically
+        # set to 0xffffffffffffffff by default (impossible to reach)
+        self.stop_addr_o = Signal(64, reset=-1)
+
         # Misc
         self.terminated_o  = Signal()
 
@@ -128,6 +130,7 @@ class CoreDebug(Elaboratable):
         m = Module()
         comb, sync = m.d.comb, m.d.sync
         dmi, d_gpr, d_cr, d_xer, = self.dmi, self.d_gpr, self.d_cr, self.d_xer
+        d_fast = self.d_fast
 
         # DMI needs fixing... make a one clock pulse
         dmi_req_i_1 = Signal()
@@ -142,7 +145,11 @@ class CoreDebug(Elaboratable):
         do_icreset   = Signal()
         terminated   = Signal()
         do_gspr_rd   = Signal()
+        # select either GPRs or FAST regs to read, based on GSPR_IDX
         gspr_index   = Signal.like(d_gpr.addr)
+        fast_index   = Signal.like(d_gpr.addr)
+        gspr_en      = Signal()
+        fast_en      = Signal()
 
         log_dmi_addr = Signal(32)
         log_dmi_data = Signal(64)
@@ -152,11 +159,15 @@ class CoreDebug(Elaboratable):
 
         LOG_INDEX_BITS = log2_int(self.LOG_LENGTH)
 
-        # Single cycle register accesses on DMI except for GSPR data
+        # Single cycle register accesses on DMI except for registers
         with m.Switch(dmi.addr_i):
             with m.Case(DBGCore.GSPR_DATA):
-                comb += dmi.ack_o.eq(d_gpr.ack)
-                comb += d_gpr.req.eq(dmi.req_i)
+                with m.If(gspr_en): # GPR requested, acknowledge GPR
+                    comb += dmi.ack_o.eq(d_gpr.ack)
+                    comb += d_gpr.req.eq(dmi.req_i)
+                with m.If(fast_en): # FAST requested
+                    comb += dmi.ack_o.eq(d_fast.ack)
+                    comb += d_fast.req.eq(dmi.req_i)
             with m.Case(DBGCore.CR):
                 comb += dmi.ack_o.eq(d_cr.ack)
                 comb += d_cr.req.eq(dmi.req_i)
@@ -164,6 +175,7 @@ class CoreDebug(Elaboratable):
                 comb += dmi.ack_o.eq(d_xer.ack)
                 comb += d_xer.req.eq(dmi.req_i)
             with m.Default():
+                # everything else is immediate-acknowledgement (combinatorial)
                 comb += dmi.ack_o.eq(dmi.req_i)
 
         # Status register read composition (DBUG_CORE_STAT_xxx)
@@ -173,24 +185,29 @@ class CoreDebug(Elaboratable):
 
         # DMI read data mux
         with m.Switch(dmi.addr_i):
-            with m.Case( DBGCore.STAT):
+            with m.Case( DBGCore.STAT):               # Status register
                 comb += dmi.dout.eq(stat_reg)
-            with m.Case( DBGCore.NIA):
+            with m.Case( DBGCore.NIA):                # NIA (PC)
                 comb += dmi.dout.eq(self.state.pc)
-            with m.Case( DBGCore.MSR):
+            with m.Case( DBGCore.MSR):                # MSR
                 comb += dmi.dout.eq(self.state.msr)
-            with m.Case( DBGCore.SVSTATE):
+            with m.Case( DBGCore.SVSTATE):            # SVSTATE
                 comb += dmi.dout.eq(self.state.svstate)
-            with m.Case( DBGCore.GSPR_DATA):
-                comb += dmi.dout.eq(d_gpr.data)
-            with m.Case( DBGCore.LOG_ADDR):
+            with m.Case( DBGCore.GSPR_DATA):          # GPR/FAST regs
+                with m.If(gspr_en):
+                    comb += dmi.dout.eq(d_gpr.data)   # GPR data selected
+                with m.If(fast_en):
+                    comb += dmi.dout.eq(d_fast.data)  # FAST reg read selected
+            with m.Case( DBGCore.LOG_ADDR):           # Logging
                 comb += dmi.dout.eq(Cat(log_dmi_addr, self.log_write_addr_o))
             with m.Case( DBGCore.LOG_DATA):
                 comb += dmi.dout.eq(log_dmi_data)
-            with m.Case(DBGCore.CR):
+            with m.Case(DBGCore.CR):                  # CR
                 comb += dmi.dout.eq(d_cr.data)
-            with m.Case(DBGCore.XER):
+            with m.Case(DBGCore.XER):                 # XER
                 comb += dmi.dout.eq(d_xer.data)
+            with m.Case(DBGCore.STOPADDR):            # Halt PC
+                comb += dmi.dout.eq(self.stop_addr_o)
 
         # DMI writes
         # Reset the 1-cycle "do" signals
@@ -225,12 +242,31 @@ class CoreDebug(Elaboratable):
 
                 # GSPR address
                 with m.Elif(dmi.addr_i == DBGCore.GSPR_IDX):
-                    sync += gspr_index.eq(dmi.din)
+                    sync += gspr_index.eq(0)
+                    sync += fast_index.eq(0)
+                    sync += gspr_en.eq(0)
+                    sync += fast_en.eq(0)
+                    with m.If(dmi.din <= 31):
+                        sync += gspr_index.eq(dmi.din)
+                        sync += gspr_en.eq(1)
+                    # cover the FastRegs LR, CTR, SRR0, SRR1 etc.
+                    # numbering is from microwatt
+                    for x, i in FastRegsEnum.__dict__.items():
+                        if not isinstance(i, int) or x == 'N_REGS':
+                            continue
+                        with m.If(dmi.din == 32+i):
+                            sync += fast_index.eq(i)
+                            sync += fast_en.eq(1)
 
                 # Log address
                 with m.Elif(dmi.addr_i == DBGCore.LOG_ADDR):
                     sync += log_dmi_addr.eq(dmi.din)
                     sync += do_dmi_log_rd.eq(1)
+
+                # set PC Halt address
+                with m.Elif(dmi.addr_i == DBGCore.STOPADDR):
+                    sync += self.stop_addr_o.eq(dmi.din)
+
             with m.Else():
                 # sync += Display("DMI read from " & to_string(dmi_addr))
                 pass
@@ -253,12 +289,16 @@ class CoreDebug(Elaboratable):
             sync += terminated.eq(1)
 
         comb += d_gpr.addr.eq(gspr_index)
+        comb += d_fast.addr.eq(fast_index)
 
         # Core control signals generated by the debug module
-        comb += self.core_stop_o.eq(stopping & ~do_step)
+        # Note: make stop and terminated synchronous, to help with timing
+        # however this *may* interfere with some of the DMI-based unit tests
+        # so has to be kept an eye on
+        sync += self.core_stop_o.eq((stopping & ~do_step) | self.terminate_i)
+        sync += self.terminated_o.eq(terminated | self.terminate_i)
         comb += self.core_rst_o.eq(do_reset)
         comb += self.icache_rst_o.eq(do_icreset)
-        comb += self.terminated_o.eq(terminated)
 
         # Logging RAM (none)
 
@@ -357,6 +397,7 @@ class CoreDebug(Elaboratable):
         yield from self.d_gpr
         yield from self.d_cr
         yield from self.d_xer
+        yield from self.d_fast
         yield self.log_data_i
         yield self.log_read_addr_i
         yield self.log_read_data_o
index 3981904a1a88bacaa9f65c0976dd675aa7104978..6c984ed34a6c776f5f2f5a4d89889c2533c36e18 100644 (file)
@@ -25,11 +25,16 @@ def tms_state_set(dut, bits):
         yield
     yield dut.bus.tms.eq(0)
 
+def tms_data_getset(dut, tms, d_len, d_in=0, reverse=False):
+    if reverse:
+        # Reverse the for loop to transmit MSB-first
+        bit_range = range(d_len-1, -1, -1)
+    else:
+        bit_range = range(d_len)
 
-def tms_data_getset(dut, tms, d_len, d_in=0):
     res = 0
     yield dut.bus.tms.eq(tms)
-    for i in range(d_len):
+    for i in bit_range:
         tdi = 1 if (d_in & (1<<i)) else 0
         yield dut.bus.tck.eq(1)
         res |= (1<<i) if (yield dut.bus.tdo) else 0
@@ -58,14 +63,14 @@ def jtag_set_idle(dut):
     yield from tms_state_set(dut, [1, 1, 0])
 
 
-def jtag_read_write_reg(dut, addr, d_len, d_in=0):
+def jtag_read_write_reg(dut, addr, d_len, d_in=0, reverse=False):
     yield from jtag_set_run(dut)
     yield from jtag_set_shift_ir(dut)
     yield from tms_data_getset(dut, 0, dut._ir_width, addr)
     yield from jtag_set_idle(dut)
 
     yield from jtag_set_shift_dr(dut)
-    result = yield from tms_data_getset(dut, 0, d_len, d_in)
+    result = yield from tms_data_getset(dut, 0, d_len, d_in, reverse)
     yield from jtag_set_idle(dut)
     return result
 
index c88689e7275c3f13c24af7d3636b3e83412677c6..459bbd951cb41a35e5f06089162e365fd8b03d9b 100644 (file)
@@ -188,10 +188,13 @@ class DummyALU(Elaboratable):
 #####################
 
 # input (and output) for logical initial stage (common input)
+
+
 class ALUInputData(FUBaseData):
-    regspec = [('INT', 'a', '0:63'), # RA
-               ('INT', 'b', '0:63'), # RB/immediate
+    regspec = [('INT', 'a', '0:63'),  # RA
+               ('INT', 'b', '0:63'),  # RB/immediate
                ]
+
     def __init__(self, pspec):
         super().__init__(pspec, False)
 
@@ -200,6 +203,7 @@ class ALUInputData(FUBaseData):
 class ALUOutputData(FUBaseData):
     regspec = [('INT', 'o', '0:63'),        # RT
                ]
+
     def __init__(self, pspec):
         super().__init__(pspec, True)
 
@@ -211,11 +215,11 @@ class ALUPipeSpec(CommonPipeSpec):
 
 
 class ALUFunctionUnit(FunctionUnitBaseSingle):
-#class ALUFunctionUnit(FunctionUnitBaseMulti):
+    # class ALUFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.ALU
 
-    def __init__(self, idx):
-        super().__init__(ALUPipeSpec, ALU, 1)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(ALUPipeSpec, ALU, 1, parent_pspec)
 
 
 class ALU(Elaboratable):
index 50ee1367cc84301bcf9cecf0f6cae51d13273227..784b9a8151fac8578dca8a1b95404480d9e509f3 100644 (file)
@@ -1,7 +1,8 @@
 # TODO: replace with Memory at some point
-from nmigen import Elaboratable, Signal, Array, Module
+from nmigen import Elaboratable, Signal, Array, Module, Memory
 from nmutil.util import Display
 
+
 class CacheRam(Elaboratable):
 
     def __init__(self, ROW_BITS=16, WIDTH = 64, TRACE=True, ADD_BUF=False,
@@ -28,30 +29,52 @@ class CacheRam(Elaboratable):
         ADD_BUF = self.ADD_BUF
         SIZE = 2**ROW_BITS
      
-        ram = Array(Signal(WIDTH) for i in range(SIZE))
+        # set up the Cache RAM Memory and create one read and one write port
+        # the read port is *not* transparent (does not pass write-thru-read)
         #attribute ram_style of ram : signal is "block";
-     
-        rd_data0 = Signal(WIDTH)
-     
+        ram = Memory(depth=SIZE, width=WIDTH,
+                     attrs={'syn_ramstyle': "block_ram"})
+        m.submodules.rdport = rdport = ram.read_port(transparent=False)
+        m.submodules.wrport = wrport = ram.write_port(granularity=8)
+
         with m.If(TRACE):
             with m.If(self.wr_sel.bool()):
                 sync += Display( "write ramno %d a: %%x "
                                  "sel: %%x dat: %%x" % self.ram_num,
                                 self.wr_addr,
                                 self.wr_sel, self.wr_data)
-        for i in range(WIDTH//8):
-            lbit = i * 8;
-            mbit = lbit + 8;
-            with m.If(self.wr_sel[i]):
-                sync += ram[self.wr_addr][lbit:mbit].eq(self.wr_data[lbit:mbit])
-        with m.If(self.rd_en):
-            sync += rd_data0.eq(ram[self.rd_addr])
-            if TRACE:
+
+        # read data output and a latched copy. behaves like microwatt cacheram
+        rd_data0 = Signal(WIDTH)
+        rd_data0l = Signal(WIDTH)
+
+        # delay on read address/en
+        rd_delay = Signal()
+        rd_delay_addr = Signal.like(self.rd_addr)
+        sync += rd_delay_addr.eq(self.rd_addr)
+        sync += rd_delay.eq(self.rd_en)
+
+        # write port
+        comb += wrport.addr.eq(self.wr_addr)
+        comb += wrport.en.eq(self.wr_sel)
+        comb += wrport.data.eq(self.wr_data)
+
+        # read port (include a latch on the output, for microwatt compatibility)
+        comb += rdport.addr.eq(self.rd_addr)
+        comb += rdport.en.eq(self.rd_en)
+        with m.If(rd_delay):
+            comb += rd_data0.eq(rdport.data)
+            sync += rd_data0l.eq(rd_data0)   # preserve latched data
+        with m.Else():
+            comb += rd_data0.eq(rd_data0l)   # output latched (last-read)
+
+        if TRACE:
+            with m.If(rd_delay):
                 sync += Display("read ramno %d a: %%x dat: %%x" % self.ram_num,
-                                self.rd_addr, ram[self.rd_addr])
+                                rd_delay_addr, rd_data0)
                 pass
 
-
+        # extra delay requested?
         if ADD_BUF:
             sync += self.rd_data_o.eq(rd_data0)
         else:
index f76e40660ac3181bda5e4af48ef8aba97f952964..23ef36ea03077bef1b73e75c1d4b169b454ee99b 100644 (file)
@@ -182,6 +182,10 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
             rw_domain = m.d.sync
         else:
             rw_domain = m.d.comb
+        # generate a pulse on system reset, to reset any latches, if needed
+        system_reset = Signal(reset=1)
+        m.d.sync += system_reset.eq(0)
+
         # add the ALU to the MultiCompUnit only if it is a "real" ALU
         # see AllFunctionUnits as to why: a FunctionUnitBaseMulti
         # only has one "real" ALU but multiple pseudo front-ends,
@@ -198,12 +202,12 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         # ALU only proceeds when all src are ready.  rd_rel_o is delayed
         # so combine it with go_rd_i.  if all bits are set we're good
         all_rd = Signal(reset_less=True)
-        m.d.comb += all_rd.eq(self.busy_o & rok_l.q &
+        m.d.comb += all_rd.eq(self.busy_o & # rok_l.q & # XXX LOOP
                               (((~self.rd.rel_o) | self.rd.go_i).all()))
 
         # generate read-done pulse
         all_rd_pulse = Signal(reset_less=True)
-        m.d.comb += all_rd_pulse.eq(rising_edge(m, all_rd))
+        m.d.comb += all_rd_pulse.eq(rising_edge(m, all_rd)) # XXX LOOP
 
         # create rising pulse from alu valid condition.
         alu_done = self.cu.alu_done_o
@@ -241,11 +245,11 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         m.d.comb += reset.eq(req_done | self.go_die_i)
         m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
         m.d.comb += reset_w.eq(self.wr.go_i | Repl(self.go_die_i, self.n_dst))
-        m.d.comb += reset_r.eq(self.rd.go_i | Repl(self.go_die_i, self.n_src))
+        m.d.comb += reset_r.eq(self.rd.go_i | Repl(rst_r, self.n_src))
 
         # read-done,wr-proceed latch
         rw_domain += rok_l.s.eq(self.issue_i)  # set up when issue starts
-        rw_domain += rok_l.r.eq(self.alu.n.o_valid & self.busy_o)  # ALU done
+        rw_domain += rok_l.r.eq(self.alu.n.o_valid & self.busy_o) # ALUdone LOOP
 
         # wr-done, back-to-start latch
         rw_domain += rst_l.s.eq(all_rd)     # set when read-phase is fully done
@@ -258,12 +262,13 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         # src operand latch (not using go_wr_i) ANDed with rdmask
         rdmaskn = Signal(self.n_src)
         latchregister(m, self.rdmaskn, rdmaskn, self.issue_i, name="rdmask_l")
-        m.d.comb += src_l.s.eq(Repl(self.issue_i, self.n_src) & ~rdmaskn)
+        m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src) & ~rdmaskn)
         m.d.sync += src_l.r.eq(reset_r)
 
         # dest operand latch (not using issue_i)
         rw_domain += req_l.s.eq(alu_pulsem & self.wrmask)
-        m.d.comb += req_l.r.eq(reset_w | prev_wr_go)
+        m.d.comb += req_l.r.eq(reset_w | prev_wr_go |
+                               Repl(system_reset, self.n_dst))
 
         # pass operation to the ALU (sync: plenty time to wait for src reads)
         op = self.get_op()
@@ -355,7 +360,7 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         m.submodules.alu_l = alu_l = SRLatch(False, name="alu")
         m.d.comb += self.alu.n.i_ready.eq(alu_l.q)
         m.d.sync += alu_l.r.eq(self.alu.n.o_valid & alu_l.q)
-        m.d.comb += alu_l.s.eq(all_rd_pulse)
+        m.d.comb += alu_l.s.eq(all_rd_pulse) # XXX LOOP
 
         # -----
         # outputs
index 2baedc29f03cdb4a49431d9b69f21ee8cdd901bb..2a54e51bf0caacddd0af4df62b7facb45c1a6395 100644 (file)
@@ -87,7 +87,7 @@ Terminology:
 
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl
+from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl, C
 from nmigen.hdl.rec import Record, Layout
 
 from nmutil.latch import SRLatch, latchregister
@@ -189,7 +189,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
     TODO: use one module for the byte-reverse as it's quite expensive in gates
     """
 
-    def __init__(self, pi=None, rwid=64, awid=48, opsubset=CompLDSTOpSubset,
+    def __init__(self, pi=None, rwid=64, awid=64, opsubset=CompLDSTOpSubset,
                  debugtest=False, name=None):
         super().__init__(rwid)
         self.awid = awid
@@ -199,7 +199,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         # POWER-compliant LD/ST has index and update: *fixed* number of ports
         self.n_src = n_src = 3   # RA, RB, RT/RS
-        self.n_dst = n_dst = 2  # RA, RT/RS
+        self.n_dst = n_dst = 3  # RA, RT/RS, CR0
 
         # set up array of src and dest signals
         for i in range(n_src):
@@ -245,6 +245,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         self.o_data = Data(self.data_wid, name="o")  # Dest1 out: RT
         self.addr_o = Data(self.data_wid, name="ea")  # Addr out: Update => RA
+        self.cr_o = Data(4, name="cr0")  # CR0 (for stdcx etc)
         self.exc_o = cu.exc_o
         self.done_o = cu.done_o
         self.busy_o = cu.busy_o
@@ -265,7 +266,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         #####################
         # latches for the FSM.
-        m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
+        m.submodules.opc_l = opc_l = SRLatch(sync=True, name="opc")
         m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
         m.submodules.alu_l = alu_l = SRLatch(sync=False, name="alu")
         m.submodules.adr_l = adr_l = SRLatch(sync=False, name="adr")
@@ -273,6 +274,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         m.submodules.sto_l = sto_l = SRLatch(sync=False, name="sto")
         m.submodules.wri_l = wri_l = SRLatch(sync=False, name="wri")
         m.submodules.upd_l = upd_l = SRLatch(sync=False, name="upd")
+        m.submodules.cr0_l = cr0_l = SRLatch(sync=False, name="cr0")
         m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
         m.submodules.lsd_l = lsd_l = SRLatch(sync=False, name="lsd") # done
 
@@ -284,6 +286,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         op_is_st = Signal(reset_less=True)
         op_is_dcbz = Signal(reset_less=True)
         op_is_st_or_dcbz = Signal(reset_less=True)
+        op_is_atomic = Signal(reset_less=True)
 
         # ALU/LD data output control
         alu_valid = Signal(reset_less=True)  # ALU operands are valid
@@ -295,6 +298,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         rd_done = Signal(reset_less=True)   # all *necessary* operands read
         wr_reset = Signal(reset_less=True)  # final reset condition
         canceln = Signal(reset_less=True)   # cancel (active low)
+        store_done = Signal(reset_less=True) # store has been actioned
 
         # LD and ALU out
         alu_o = Signal(self.data_wid, reset_less=True)
@@ -307,6 +311,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         reset_o = Signal(reset_less=True)             # reset opcode
         reset_w = Signal(reset_less=True)             # reset write
         reset_u = Signal(reset_less=True)             # reset update
+        reset_c = Signal(reset_less=True)             # reset cr0
         reset_a = Signal(reset_less=True)             # reset adr latch
         reset_i = Signal(reset_less=True)             # issue|die (use a lot)
         reset_r = Signal(self.n_src, reset_less=True)  # reset src
@@ -322,6 +327,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         comb += reset_o.eq(self.done_o | terminate)      # opcode reset
         comb += reset_w.eq(self.wr.go_i[0] | terminate)  # write reg 1
         comb += reset_u.eq(self.wr.go_i[1] | terminate)  # update (reg 2)
+        comb += reset_c.eq(self.wr.go_i[2] | terminate)  # cr0 (reg 3)
         comb += reset_s.eq(self.go_st_i | terminate)  # store reset
         comb += reset_r.eq(self.rd.go_i | Repl(terminate, self.n_src))
         comb += reset_a.eq(self.go_ad_i | terminate)
@@ -334,6 +340,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         comb += op_is_st.eq(oper_r.insn_type == MicrOp.OP_STORE)   # ST
         comb += op_is_ld.eq(oper_r.insn_type == MicrOp.OP_LOAD)    # LD
         comb += op_is_dcbz.eq(oper_r.insn_type == MicrOp.OP_DCBZ)  # DCBZ
+        comb += op_is_atomic.eq(oper_r.reserve) # atomic LR/SC
         comb += op_is_st_or_dcbz.eq(op_is_st | op_is_dcbz)
         # dcbz is special case of store
         #uncomment if needed
@@ -354,6 +361,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         #       - alu_l : looks after add of src1/2/imm (EA)
         #       - adr_l : waits for add (EA)
         #       - upd_l : waits for adr and Regfile (port 2)
+        #       - cr0_l : waits for Rc=1 and CR0 Regfile (port 3)
         #    - src_l[2] : ST
         # - lod_l       : waits for adr (EA) and for LD Data
         # - wri_l       : waits for LD Data and Regfile (port 1)
@@ -364,8 +372,8 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         # opcode latch - inverted so that busy resets to 0
         # note this MUST be sync so as to avoid a combinatorial loop
         # between busy_o and issue_i on the reset latch (rst_l)
-        sync += opc_l.s.eq(issue_i)  # XXX NOTE: INVERTED FROM book!
-        sync += opc_l.r.eq(reset_o)  # XXX NOTE: INVERTED FROM book!
+        comb += opc_l.s.eq(issue_i)  # XXX NOTE: INVERTED FROM book!
+        comb += opc_l.r.eq(reset_o)  # XXX NOTE: INVERTED FROM book!
 
         # src operand latch
         sync += src_l.s.eq(Repl(issue_i, self.n_src) & ~self.rdmaskn)
@@ -392,6 +400,11 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
                             #self.done_o | (self.pi.busy_o & op_is_update),
                                           self.n_dst))
 
+        # CR0 operand latch (CR0 written to reg 3 if Rc=1)
+        op_is_rc1 = self.oper_i.rc.rc & self.oper_i.rc.ok
+        comb += cr0_l.s.eq(issue_i & op_is_rc1)
+        sync += cr0_l.r.eq(reset_c)
+
         # update-mode operand latch (EA written to reg 2)
         sync += upd_l.s.eq(reset_i)
         sync += upd_l.r.eq(reset_u)
@@ -414,10 +427,15 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         with m.If(self.done_o | terminate):
             sync += oper_r.eq(0)
 
-        # and for LD
+        # and for LD and store-done
         ldd_r = Signal(self.data_wid, reset_less=True)  # Dest register
         latchregister(m, ldd_o, ldd_r, ld_ok, name="ldo_r")
 
+        # store actioned, communicate through CR0 (for atomic LR/SC)
+        latchregister(m, self.pi.store_done.data, store_done,
+                         self.pi.store_done.ok,
+                         name="std_r")
+
         # and for each input from the incoming src operands
         srl = []
         for i in range(self.n_src):
@@ -494,12 +512,15 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         comb += self.wr.rel_o[1].eq(upd_l.q & busy_o & op_is_update &
                                   alu_valid & canceln)
 
+        # request write of CR0 result only in reserve and Rc=1
+        comb += self.wr.rel_o[2].eq(cr0_l.q & busy_o & op_is_atomic &
+                                  alu_valid & canceln)
+
         # provide "done" signal: select req_rel for non-LD/ST, adr_rel for LD/ST
         comb += wr_any.eq(self.st.go_i | p_st_go |
-                          self.wr.go_i[0] | self.wr.go_i[1])
+                          self.wr.go_i.bool())
         comb += wr_reset.eq(rst_l.q & busy_o & canceln &
-                            ~(self.st.rel_o | self.wr.rel_o[0] |
-                              self.wr.rel_o[1]) &
+                            ~(self.st.rel_o | self.wr.rel_o.bool()) &
                             (lod_l.qn | op_is_st_or_dcbz)
                             )
         comb += self.done_o.eq(wr_reset & (~self.pi.busy_o | op_is_ld))
@@ -509,17 +530,26 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         # put the LD-output register directly onto the output bus on a go_write
         comb += self.o_data.data.eq(self.dest[0])
+        comb += self.o_data.ok.eq(self.wr.rel_o[0])
         with m.If(self.wr.go_i[0]):
             comb += self.dest[0].eq(ldd_r)
 
         # "update" mode, put address out on 2nd go-write
         comb += self.addr_o.data.eq(self.dest[1])
+        comb += self.addr_o.ok.eq(self.wr.rel_o[1])
         with m.If(op_is_update & self.wr.go_i[1]):
             comb += self.dest[1].eq(addr_r)
 
+        # fun-fun-fun, calculate CR0 when Rc=1 requested.
+        cr0 = self.dest[2]
+        comb += self.cr_o.data.eq(cr0)
+        comb += self.cr_o.ok.eq(self.wr.rel_o[2])
+        with m.If(cr0_l.q):
+            comb += cr0.eq(Cat(C(0, 1), store_done, C(0, 2)))
+
         # need to look like MultiCompUnit: put wrmask out.
         # XXX may need to make this enable only when write active
-        comb += self.wrmask.eq(bro & Cat(op_is_ld, op_is_update))
+        comb += self.wrmask.eq(bro & Cat(op_is_ld, op_is_update, cr0_l.q))
 
         ###########################
         # PortInterface connections
@@ -527,8 +557,10 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         # connect to LD/ST PortInterface.
         comb += pi.is_ld_i.eq(op_is_ld & busy_o)  # decoded-LD
+        comb += pi.is_nc.eq(op_is_cix & busy_o)  # cache-inhibited
         comb += pi.is_st_i.eq(op_is_st_or_dcbz & busy_o)  # decoded-ST
         comb += pi.is_dcbz_i.eq(op_is_dcbz & busy_o)  # decoded-DCBZ
+        comb += pi.reserve.eq(oper_r.reserve & busy_o)  # atomic LR/SC
         comb += pi.data_len.eq(oper_r.data_len)  # data_len
         # address: use sync to avoid long latency
         sync += pi.addr.data.eq(addr_r)           # EA from adder
@@ -538,10 +570,16 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         sync += pi.addr.ok.eq(alu_ok & lsd_l.q)  # "do address stuff" (once)
         comb += self.exc_o.eq(pi.exc_o)  # exception occurred
         comb += addr_ok.eq(self.pi.addr_ok_o)  # no exc, address fine
-        # connect MSR.PR for priv/virt operation
-        comb += pi.msr_pr.eq(oper_r.msr[MSR.PR])
-        comb += Display("LDSTCompUnit: oper_r.msr %x pi.msr_pr=%x",
-                                      oper_r.msr, oper_r.msr[MSR.PR])
+        # connect MSR.PR etc. for priv/virt operation
+        comb += pi.priv_mode.eq(~oper_r.msr[MSR.PR])
+        comb += pi.virt_mode.eq(oper_r.msr[MSR.DR])
+        comb += pi.mode_32bit.eq(~oper_r.msr[MSR.SF])
+        with m.If(self.issue_i): # display this only once
+            sync += Display("LDSTCompUnit: oper_r.msr %x pr=%x dr=%x sf=%x",
+                                      oper_r.msr,
+                                      oper_r.msr[MSR.PR],
+                                      oper_r.msr[MSR.DR],
+                                      oper_r.msr[MSR.SF])
 
         # byte-reverse on LD
         revnorev = Signal(64, reset_less=True)
@@ -575,6 +613,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
             comb += pi.st.data.eq(stdata_r)
         with m.Else():
             comb += pi.st.data.eq(op3)
+
         # store - data goes in based on go_st
         comb += pi.st.ok.eq(self.st.go_i)  # go store signals st data valid
 
@@ -588,6 +627,8 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
             return self.o_data # LDSTOutputData.regspec o
         if i == 1:
             return self.addr_o # LDSTOutputData.regspec o1
+        if i == 2:
+            return self.cr_o # LDSTOutputData.regspec cr_a
         # return self.dest[i]
 
     def get_fu_out(self, i):
@@ -610,6 +651,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         yield self.wr.rel_o
         yield from self.o_data.ports()
         yield from self.addr_o.ports()
+        yield from self.cr_o.ports()
         yield self.load_mem_o
         yield self.stwd_mem_o
 
@@ -798,7 +840,7 @@ def test_scoreboard():
     units = {}
     pspec = TestMemPspec(ldst_ifacetype='bare_wb',
                          imem_ifacetype='bare_wb',
-                         addr_wid=48,
+                         addr_wid=64,
                          mask_wid=8,
                          reg_wid=64,
                          units=units)
@@ -834,7 +876,7 @@ def test_scoreboard_regspec():
     units = {}
     pspec = TestMemPspec(ldst_ifacetype='bare_wb',
                          imem_ifacetype='bare_wb',
-                         addr_wid=48,
+                         addr_wid=64,
                          mask_wid=8,
                          reg_wid=64,
                          units=units)
index 4c79cd31a22fb431f6f9f5971466b126cd4e9d0b..eae0bc7582866b702318609491f974f9cc9e8e38 100644 (file)
@@ -1,3 +1,17 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2020,2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Copyright (C) 2020 Cole Poirier
+# Copyright (C) 2020,2021 Cesar Strauss
+# Copyright (C) 2021 Tobias Platen
+#
+# Original dcache.vhdl Copyright of its authors and licensed
+# by IBM under CC-BY 4.0
+# https://github.com/antonblanchard/microwatt
+#
+# Conversion to nmigen funded by NLnet and NGI POINTER under EU Grants
+# 871528 and 957073, under the LGPL-v3+ License
+
 """DCache
 
 based on Anton Blanchard microwatt dcache.vhdl
@@ -13,6 +27,8 @@ Links:
 
 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
 * https://bugs.libre-soc.org/show_bug.cgi?id=469
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+  (discussion about brams for ECP5)
 
 """
 
@@ -24,12 +40,16 @@ sys.setrecursionlimit(1000000)
 
 from enum import Enum, unique
 
-from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
+from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
+                    Record, Memory)
 from nmutil.util import Display
+from nmigen.lib.coding import Decoder
 
 from copy import deepcopy
 from random import randint, seed
 
+from nmigen_soc.wishbone.bus import Interface
+
 from nmigen.cli import main
 from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
@@ -45,8 +65,8 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBIOMasterOut, WBIOSlaveOut)
 
 from soc.experiment.cache_ram import CacheRam
-#from soc.experiment.plru import PLRU
-from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
+#from nmutil.plru import PLRU, PLRUs
 
 # for test
 from soc.bus.sram import SRAM
@@ -59,224 +79,248 @@ from nmutil.sim_tmp_alternative import Simulator
 
 from nmutil.util import wrap
 
-
-# TODO: make these parameters of DCache at some point
-LINE_SIZE = 64    # Line size in bytes
-NUM_LINES = 16    # Number of lines in a set
-NUM_WAYS = 4      # Number of ways
-TLB_SET_SIZE = 64 # L1 DTLB entries per set
-TLB_NUM_WAYS = 2  # L1 DTLB number of sets
-TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
 LOG_LENGTH = 0    # Non-zero to enable log data collection
 
-# BRAM organisation: We never access more than
-#     -- WB_DATA_BITS at a time so to save
-#     -- resources we make the array only that wide, and
-#     -- use consecutive indices to make a cache "line"
-#     --
-#     -- ROW_SIZE is the width in bytes of the BRAM
-#     -- (based on WB, so 64-bits)
-ROW_SIZE = WB_DATA_BITS // 8;
-
-# ROW_PER_LINE is the number of row (wishbone
-# transactions) in a line
-ROW_PER_LINE = LINE_SIZE // ROW_SIZE
-
-# BRAM_ROWS is the number of rows in BRAM needed
-# to represent the full dcache
-BRAM_ROWS = NUM_LINES * ROW_PER_LINE
-
-print ("ROW_SIZE", ROW_SIZE)
-print ("ROW_PER_LINE", ROW_PER_LINE)
-print ("BRAM_ROWS", BRAM_ROWS)
-print ("NUM_WAYS", NUM_WAYS)
-
-# Bit fields counts in the address
-
-# REAL_ADDR_BITS is the number of real address
-# bits that we store
-REAL_ADDR_BITS = 56
-
-# ROW_BITS is the number of bits to select a row
-ROW_BITS = log2_int(BRAM_ROWS)
-
-# ROW_LINE_BITS is the number of bits to select
-# a row within a line
-ROW_LINE_BITS = log2_int(ROW_PER_LINE)
-
-# LINE_OFF_BITS is the number of bits for
-# the offset in a cache line
-LINE_OFF_BITS = log2_int(LINE_SIZE)
-
-# ROW_OFF_BITS is the number of bits for
-# the offset in a row
-ROW_OFF_BITS = log2_int(ROW_SIZE)
-
-# INDEX_BITS is the number if bits to
-# select a cache line
-INDEX_BITS = log2_int(NUM_LINES)
-
-# SET_SIZE_BITS is the log base 2 of the set size
-SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
-
-# TAG_BITS is the number of bits of
-# the tag part of the address
-TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
-
-# TAG_WIDTH is the width in bits of each way of the tag RAM
-TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
-
-# WAY_BITS is the number of bits to select a way
-WAY_BITS = log2_int(NUM_WAYS)
-
-# Example of layout for 32 lines of 64 bytes:
-layout = """\
-  ..  tag    |index|  line  |
-  ..         |   row   |    |
-  ..         |     |---|    | ROW_LINE_BITS  (3)
-  ..         |     |--- - --| LINE_OFF_BITS (6)
-  ..         |         |- --| ROW_OFF_BITS  (3)
-  ..         |----- ---|    | ROW_BITS      (8)
-  ..         |-----|        | INDEX_BITS    (5)
-  .. --------|              | TAG_BITS      (45)
-"""
-print (layout)
-print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
-            (TAG_BITS, INDEX_BITS, ROW_BITS,
-             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
-print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
-print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
-print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
-
-TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
-
-print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
-
-def CacheTagArray():
-    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
-                        for x in range(NUM_LINES))
-
-def CacheValidBitsArray():
-    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
-                        for x in range(NUM_LINES))
-
-def RowPerLineValidArray():
-    return Array(Signal(name="rows_valid%d" % x) \
-                        for x in range(ROW_PER_LINE))
-
-# L1 TLB
-TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
-TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
-TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
-TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
-TLB_PTE_BITS     = 64
-TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
-
 def ispow2(x):
     return (1<<log2_int(x, False)) == x
 
-assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
-assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
-assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
-assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
-assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
-assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
-        "geometry bits don't add up"
-assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
-        "geometry bits don't add up"
-assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
-         "geometry bits don't add up"
-assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
-assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
-
-
-def TLBValidBitsArray():
-    return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
-                for x in range(TLB_SET_SIZE))
-
-def TLBTagEAArray():
-    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
-                for x in range (TLB_NUM_WAYS))
-
-def TLBTagsArray():
-    return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
-                for x in range (TLB_SET_SIZE))
-
-def TLBPtesArray():
-    return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
-                for x in range(TLB_SET_SIZE))
-
-def HitWaySet():
-    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
-                        for x in range(TLB_NUM_WAYS))
-
-# Cache RAM interface
-def CacheRamOut():
-    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
-                 for x in range(NUM_WAYS))
-
-# PLRU output interface
-def PLRUOut():
-    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
-                for x in range(NUM_LINES))
-
-# TLB PLRU output interface
-def TLBPLRUOut():
-    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
-                for x in range(TLB_SET_SIZE))
-
-# Helper functions to decode incoming requests
-#
-# Return the cache line index (tag index) for an address
-def get_index(addr):
-    return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 
-# Return the cache row index (data memory) for an address
-def get_row(addr):
-    return addr[ROW_OFF_BITS:SET_SIZE_BITS]
+class DCacheConfig:
+    def __init__(self, LINE_SIZE = 64,    # Line size in bytes
+                       NUM_LINES = 64,    # Number of lines in a set
+                       NUM_WAYS = 2,      # Number of ways
+                       TLB_SET_SIZE = 64, # L1 DTLB entries per set
+                       TLB_NUM_WAYS = 2,  # L1 DTLB number of sets
+                       TLB_LG_PGSZ = 12): # L1 DTLB log_2(page_size)
+        self.LINE_SIZE = LINE_SIZE
+        self.NUM_LINES = NUM_LINES
+        self.NUM_WAYS = NUM_WAYS
+        self.TLB_SET_SIZE = TLB_SET_SIZE
+        self.TLB_NUM_WAYS = TLB_NUM_WAYS
+        self.TLB_LG_PGSZ = TLB_LG_PGSZ
+
+        # BRAM organisation: We never access more than
+        #     -- WB_DATA_BITS at a time so to save
+        #     -- resources we make the array only that wide, and
+        #     -- use consecutive indices to make a cache "line"
+        #     --
+        #     -- ROW_SIZE is the width in bytes of the BRAM
+        #     -- (based on WB, so 64-bits)
+        self.ROW_SIZE = WB_DATA_BITS // 8;
+
+        # ROW_PER_LINE is the number of row (wishbone
+        # transactions) in a line
+        self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
+
+        # BRAM_ROWS is the number of rows in BRAM needed
+        # to represent the full dcache
+        self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
+
+        print ("ROW_SIZE", self.ROW_SIZE)
+        print ("ROW_PER_LINE", self.ROW_PER_LINE)
+        print ("BRAM_ROWS", self.BRAM_ROWS)
+        print ("NUM_WAYS", self.NUM_WAYS)
+
+        # Bit fields counts in the address
+
+        # REAL_ADDR_BITS is the number of real address
+        # bits that we store
+        self.REAL_ADDR_BITS = 56
+
+        # ROW_BITS is the number of bits to select a row
+        self.ROW_BITS = log2_int(self.BRAM_ROWS)
+
+        # ROW_LINE_BITS is the number of bits to select
+        # a row within a line
+        self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
+
+        # LINE_OFF_BITS is the number of bits for
+        # the offset in a cache line
+        self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
+
+        # ROW_OFF_BITS is the number of bits for
+        # the offset in a row
+        self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
+
+        # INDEX_BITS is the number if bits to
+        # select a cache line
+        self.INDEX_BITS = log2_int(self.NUM_LINES)
+
+        # SET_SIZE_BITS is the log base 2 of the set size
+        self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
+
+        # TAG_BITS is the number of bits of
+        # the tag part of the address
+        self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
+
+        # TAG_WIDTH is the width in bits of each way of the tag RAM
+        self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
+
+        # WAY_BITS is the number of bits to select a way
+        self.WAY_BITS = log2_int(self.NUM_WAYS)
+
+        # Example of layout for 32 lines of 64 bytes:
+        layout = f"""\
+          DCache Layout:
+         |.. -----------------------| REAL_ADDR_BITS ({self.REAL_ADDR_BITS})
+          ..         |--------------| SET_SIZE_BITS ({self.SET_SIZE_BITS})
+          ..  tag    |index|  line  |
+          ..         |   row   |    |
+          ..         |     |---|    | ROW_LINE_BITS ({self.ROW_LINE_BITS})
+          ..         |     |--- - --| LINE_OFF_BITS ({self.LINE_OFF_BITS})
+          ..         |         |- --| ROW_OFF_BITS  ({self.ROW_OFF_BITS})
+          ..         |----- ---|    | ROW_BITS      ({self.ROW_BITS})
+          ..         |-----|        | INDEX_BITS    ({self.INDEX_BITS})
+          .. --------|              | TAG_BITS      ({self.TAG_BITS})
+        """
+        print (layout)
+        print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
+                    (self.TAG_BITS, self.INDEX_BITS, self.ROW_BITS,
+                     self.ROW_OFF_BITS, self.LINE_OFF_BITS, self.ROW_LINE_BITS))
+        print ("index @: %d-%d" % (self.LINE_OFF_BITS, self.SET_SIZE_BITS))
+        print ("row @: %d-%d" % (self.LINE_OFF_BITS, self.ROW_OFF_BITS))
+        print ("tag @: %d-%d width %d" % (self.SET_SIZE_BITS,
+                                          self.REAL_ADDR_BITS, self.TAG_WIDTH))
+
+        self.TAG_RAM_WIDTH = self.TAG_WIDTH * self.NUM_WAYS
+
+        print ("TAG_RAM_WIDTH", self.TAG_RAM_WIDTH)
+        print ("    TAG_WIDTH", self.TAG_WIDTH)
+        print ("     NUM_WAYS", self.NUM_WAYS)
+        print ("    NUM_LINES", self.NUM_LINES)
+
+        # L1 TLB
+        self.TLB_SET_BITS     = log2_int(self.TLB_SET_SIZE)
+        self.TLB_WAY_BITS     = log2_int(self.TLB_NUM_WAYS)
+        self.TLB_EA_TAG_BITS  = 64 - (self.TLB_LG_PGSZ + self.TLB_SET_BITS)
+        self.TLB_TAG_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_EA_TAG_BITS
+        self.TLB_PTE_BITS     = 64
+        self.TLB_PTE_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_PTE_BITS;
+
+        assert (self.LINE_SIZE % self.ROW_SIZE) == 0, \
+                "LINE_SIZE not multiple of ROW_SIZE"
+        assert ispow2(self.LINE_SIZE), "LINE_SIZE not power of 2"
+        assert ispow2(self.NUM_LINES), "NUM_LINES not power of 2"
+        assert ispow2(self.ROW_PER_LINE), "ROW_PER_LINE not power of 2"
+        assert self.ROW_BITS == \
+                (self.INDEX_BITS + self.ROW_LINE_BITS), \
+                "geometry bits don't add up"
+        assert (self.LINE_OFF_BITS == \
+                self.ROW_OFF_BITS + self.ROW_LINE_BITS), \
+                "geometry bits don't add up"
+        assert self.REAL_ADDR_BITS == \
+                (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS), \
+                "geometry bits don't add up"
+        assert self.REAL_ADDR_BITS == \
+                (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS), \
+                 "geometry bits don't add up"
+        assert 64 == WB_DATA_BITS, \
+                "Can't yet handle wb width that isn't 64-bits"
+        assert self.SET_SIZE_BITS <= self.TLB_LG_PGSZ, \
+                "Set indexed by virtual address"
+
+    def CacheTagArray(self):
+        return Array(Signal(self.TAG_RAM_WIDTH, name="tag%d" % x) \
+                       for x in range(self.NUM_LINES))
+
+    def CacheValidsArray(self):
+        return Array(Signal(self.NUM_WAYS, name="tag_valids%d" % x)
+                     for x in range(self.NUM_LINES))
+
+    def RowPerLineValidArray(self):
+        return Array(Signal(name="rows_valid%d" % x) \
+                            for x in range(self.ROW_PER_LINE))
+
+    def TLBHit(self, name):
+        return Record([('valid', 1),
+                       ('way', self.TLB_WAY_BITS)], name=name)
+
+    def TLBTagEAArray(self):
+        return Array(Signal(self.TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
+                    for x in range (self.TLB_NUM_WAYS))
+
+    def TLBRecord(self, name):
+        tlb_layout = [('valid', self.TLB_NUM_WAYS),
+                      ('tag', self.TLB_TAG_WAY_BITS),
+                      ('pte', self.TLB_PTE_WAY_BITS)
+                     ]
+        return Record(tlb_layout, name=name)
+
+    def TLBValidArray(self):
+        return Array(Signal(self.TLB_NUM_WAYS, name="tlb_valid%d" % x)
+                            for x in range(self.TLB_SET_SIZE))
+
+    def HitWaySet(self):
+        return Array(Signal(self.WAY_BITS, name="hitway_%d" % x) \
+                            for x in range(self.TLB_NUM_WAYS))
+
+    # Cache RAM interface
+    def CacheRamOut(self):
+        return Array(Signal(self.WB_DATA_BITS, name="cache_out%d" % x) \
+                     for x in range(self.NUM_WAYS))
+
+    # PLRU output interface
+    def PLRUOut(self):
+        return Array(Signal(self.WAY_BITS, name="plru_out%d" % x) \
+                    for x in range(self.NUM_LINES))
+
+    # TLB PLRU output interface
+    def TLBPLRUOut(self):
+        return Array(Signal(self.TLB_WAY_BITS, name="tlbplru_out%d" % x) \
+                    for x in range(self.TLB_SET_SIZE))
+
+    # Helper functions to decode incoming requests
+    #
+    # Return the cache line index (tag index) for an address
+    def get_index(self, addr):
+        return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
+
+    # Return the cache row index (data memory) for an address
+    def get_row(self, addr):
+        return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
 
-# Return the index of a row within a line
-def get_row_of_line(row):
-    return row[:ROW_BITS][:ROW_LINE_BITS]
+    # Return the index of a row within a line
+    def get_row_of_line(self, row):
+        return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
 
-# Returns whether this is the last row of a line
-def is_last_row_addr(addr, last):
-    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
+    # Returns whether this is the last row of a line
+    def is_last_row_addr(self, addr, last):
+        return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
 
-# Returns whether this is the last row of a line
-def is_last_row(row, last):
-    return get_row_of_line(row) == last
+    # Returns whether this is the last row of a line
+    def is_last_row(self, row, last):
+        return self.get_row_of_line(row) == last
 
-# Return the next row in the current cache line. We use a
-# dedicated function in order to limit the size of the
-# generated adder to be only the bits within a cache line
-# (3 bits with default settings)
-def next_row(row):
-    row_v = row[0:ROW_LINE_BITS] + 1
-    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
+    # Return the next row in the current cache line. We use a
+    # dedicated function in order to limit the size of the
+    # generated adder to be only the bits within a cache line
+    # (3 bits with default settings)
+    def next_row(self, row):
+        row_v = row[0:self.ROW_LINE_BITS] + 1
+        return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
 
-# Get the tag value from the address
-def get_tag(addr):
-    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
+    # Get the tag value from the address
+    def get_tag(self, addr):
+        return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
 
-# Read a tag from a tag memory row
-def read_tag(way, tagset):
-    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
+    # Read a tag from a tag memory row
+    def read_tag(self, way, tagset):
+        return tagset.word_select(way, self.TAG_WIDTH)[:self.TAG_BITS]
 
-# Read a TLB tag from a TLB tag memory row
-def read_tlb_tag(way, tags):
-    return tags.word_select(way, TLB_EA_TAG_BITS)
+    # Read a TLB tag from a TLB tag memory row
+    def read_tlb_tag(self, way, tags):
+        return tags.word_select(way, self.TLB_EA_TAG_BITS)
 
-# Write a TLB tag to a TLB tag memory row
-def write_tlb_tag(way, tags, tag):
-    return read_tlb_tag(way, tags).eq(tag)
+    # Write a TLB tag to a TLB tag memory row
+    def write_tlb_tag(self, way, tags, tag):
+        return self.read_tlb_tag(way, tags).eq(tag)
 
-# Read a PTE from a TLB PTE memory row
-def read_tlb_pte(way, ptes):
-    return ptes.word_select(way, TLB_PTE_BITS)
+    # Read a PTE from a TLB PTE memory row
+    def read_tlb_pte(self, way, ptes):
+        return ptes.word_select(way, self.TLB_PTE_BITS)
 
-def write_tlb_pte(way, ptes, newpte):
-    return read_tlb_pte(way, ptes).eq(newpte)
+    def write_tlb_pte(self, way, ptes, newpte):
+        return self.read_tlb_pte(way, ptes).eq(newpte)
 
 
 # Record for storing permission, attribute, etc. bits from a PTE
@@ -347,15 +391,15 @@ class RegStage0(RecordObject):
 
 
 class MemAccessRequest(RecordObject):
-    def __init__(self, name=None):
+    def __init__(self, cfg, name=None):
         super().__init__(name=name)
         self.op        = Signal(Op)
         self.valid     = Signal()
         self.dcbz      = Signal()
-        self.real_addr = Signal(REAL_ADDR_BITS)
+        self.real_addr = Signal(cfg.REAL_ADDR_BITS)
         self.data      = Signal(64)
         self.byte_sel  = Signal(8)
-        self.hit_way   = Signal(WAY_BITS)
+        self.hit_way   = Signal(cfg.WAY_BITS)
         self.same_tag  = Signal()
         self.mmu_req   = Signal()
 
@@ -363,31 +407,30 @@ class MemAccessRequest(RecordObject):
 # First stage register, contains state for stage 1 of load hits
 # and for the state machine used by all other operations
 class RegStage1(RecordObject):
-    def __init__(self, name=None):
+    def __init__(self, cfg, name=None):
         super().__init__(name=name)
         # Info about the request
         self.full             = Signal() # have uncompleted request
         self.mmu_req          = Signal() # request is from MMU
-        self.req              = MemAccessRequest(name="reqmem")
+        self.req              = MemAccessRequest(cfg, name="reqmem")
 
         # Cache hit state
-        self.hit_way          = Signal(WAY_BITS)
+        self.hit_way          = Signal(cfg.WAY_BITS)
         self.hit_load_valid   = Signal()
-        self.hit_index        = Signal(INDEX_BITS)
+        self.hit_index        = Signal(cfg.INDEX_BITS)
         self.cache_hit        = Signal()
 
         # TLB hit state
-        self.tlb_hit          = Signal()
-        self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
-        self.tlb_hit_index    = Signal(TLB_WAY_BITS)
+        self.tlb_hit          = cfg.TLBHit("tlb_hit")
+        self.tlb_hit_index    = Signal(cfg.TLB_SET_BITS)
 
         # 2-stage data buffer for data forwarded from writes to reads
         self.forward_data1    = Signal(64)
         self.forward_data2    = Signal(64)
         self.forward_sel1     = Signal(8)
         self.forward_valid1   = Signal()
-        self.forward_way1     = Signal(WAY_BITS)
-        self.forward_row1     = Signal(ROW_BITS)
+        self.forward_way1     = Signal(cfg.WAY_BITS)
+        self.forward_row1     = Signal(cfg.ROW_BITS)
         self.use_forward1     = Signal()
         self.forward_sel      = Signal(8)
 
@@ -398,12 +441,12 @@ class RegStage1(RecordObject):
         self.write_tag        = Signal()
         self.slow_valid       = Signal()
         self.wb               = WBMasterOut("wb")
-        self.reload_tag       = Signal(TAG_BITS)
-        self.store_way        = Signal(WAY_BITS)
-        self.store_row        = Signal(ROW_BITS)
-        self.store_index      = Signal(INDEX_BITS)
-        self.end_row_ix       = Signal(ROW_LINE_BITS)
-        self.rows_valid       = RowPerLineValidArray()
+        self.reload_tag       = Signal(cfg.TAG_BITS)
+        self.store_way        = Signal(cfg.WAY_BITS)
+        self.store_row        = Signal(cfg.ROW_BITS)
+        self.store_index      = Signal(cfg.INDEX_BITS)
+        self.end_row_ix       = Signal(cfg.ROW_LINE_BITS)
+        self.rows_valid       = cfg.RowPerLineValidArray()
         self.acks_pending     = Signal(3)
         self.inc_acks         = Signal()
         self.dec_acks         = Signal()
@@ -421,94 +464,178 @@ class RegStage1(RecordObject):
 
 # Reservation information
 class Reservation(RecordObject):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, cfg, name=None):
+        super().__init__(name=name)
         self.valid = Signal()
-        self.addr  = Signal(64-LINE_OFF_BITS)
+        self.addr  = Signal(64-cfg.LINE_OFF_BITS)
 
 
 class DTLBUpdate(Elaboratable):
-    def __init__(self):
+    def __init__(self, cfg):
+        self.cfg = cfg
         self.tlbie    = Signal()
         self.tlbwe    = Signal()
         self.doall    = Signal()
-        self.updated  = Signal()
-        self.v_updated  = Signal()
-        self.tlb_hit    = Signal()
-        self.tlb_req_index = Signal(TLB_SET_BITS)
-
-        self.tlb_hit_way     = Signal(TLB_WAY_BITS)
-        self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
-        self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
-        self.repl_way        = Signal(TLB_WAY_BITS)
-        self.eatag           = Signal(TLB_EA_TAG_BITS)
-        self.pte_data        = Signal(TLB_PTE_BITS)
+        self.tlb_hit     = cfg.TLBHit("tlb_hit")
+        self.tlb_req_index = Signal(cfg.TLB_SET_BITS)
 
-        self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
+        self.repl_way        = Signal(cfg.TLB_WAY_BITS)
+        self.eatag           = Signal(cfg.TLB_EA_TAG_BITS)
+        self.pte_data        = Signal(cfg.TLB_PTE_BITS)
 
-        self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
-        self.db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
-        self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        # read from dtlb array
+        self.tlb_read       = Signal()
+        self.tlb_read_index = Signal(cfg.TLB_SET_BITS)
+        self.tlb_way        = cfg.TLBRecord("o_tlb_way")
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
         sync = m.d.sync
-
-        tagset   = Signal(TLB_TAG_WAY_BITS)
-        pteset   = Signal(TLB_PTE_WAY_BITS)
-
-        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
-        comb += db_out.eq(self.dv)
+        cfg = self.cfg
+
+        # there are 3 parts to this:
+        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
+        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
+        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
+        # be a Memory because they can all be cleared (tlbie, doall), i mean,
+        # we _could_, in theory, by overriding the Reset Signal of the Memory,
+        # hmmm....
+
+        dtlb_valid = cfg.TLBValidArray()
+        tlb_req_index = self.tlb_req_index
+
+        print ("TLB_TAG_WAY_BITS", cfg.TLB_TAG_WAY_BITS)
+        print ("     TLB_EA_TAG_BITS", cfg.TLB_EA_TAG_BITS)
+        print ("        TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
+        print ("TLB_PTE_WAY_BITS", cfg.TLB_PTE_WAY_BITS)
+        print ("    TLB_PTE_BITS", cfg.TLB_PTE_BITS)
+        print ("    TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
+
+        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
+        tagway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_TAG_WAY_BITS,
+                             attrs={'syn_ramstyle': "block_ram"})
+        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
+        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
+                                    granularity=cfg.TLB_EA_TAG_BITS)
+
+        pteway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_PTE_WAY_BITS,
+                             attrs={'syn_ramstyle': "block_ram"})
+        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
+        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
+                                    granularity=cfg.TLB_PTE_BITS)
+
+        # commented out for now, can be put in if Memory.reset can be
+        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
+        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
+        #m.submodules.rd_valid = rd_valid = validm.read_port()
+        #m.submodules.wr_valid = wr_valid = validm.write_port(
+                                    #granularity=1)
+
+        # connect up read and write addresses to Valid/PTE/TAG SRAMs
+        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
+        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
+        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
+        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
+        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
+        #m.d.comb += wr_valid.addr.eq(tlb_req_index)
+
+        updated  = Signal()
+        v_updated  = Signal()
+        tb_out = Signal(cfg.TLB_TAG_WAY_BITS) # tlb_way_tags_t
+        db_out = Signal(cfg.TLB_NUM_WAYS)     # tlb_way_valids_t
+        pb_out = Signal(cfg.TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        dv = Signal(cfg.TLB_NUM_WAYS) # tlb_way_valids_t
+
+        comb += dv.eq(dtlb_valid[tlb_req_index])
+        comb += db_out.eq(dv)
 
         with m.If(self.tlbie & self.doall):
-            pass # clear all back in parent
+            # clear all valid bits at once
+            # XXX hmmm, validm _could_ use Memory reset here...
+            for i in range(cfg.TLB_SET_SIZE):
+                sync += dtlb_valid[i].eq(0)
         with m.Elif(self.tlbie):
-            with m.If(self.tlb_hit):
-                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
-                comb += self.v_updated.eq(1)
-
+            # invalidate just the hit_way
+            with m.If(self.tlb_hit.valid):
+                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
+                comb += v_updated.eq(1)
         with m.Elif(self.tlbwe):
-
-            comb += tagset.eq(self.tlb_tag_way)
-            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
-            comb += tb_out.eq(tagset)
-
-            comb += pteset.eq(self.tlb_pte_way)
-            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
-            comb += pb_out.eq(pteset)
-
+            # write to the requested tag and PTE
+            comb += cfg.write_tlb_tag(self.repl_way, tb_out, self.eatag)
+            comb += cfg.write_tlb_pte(self.repl_way, pb_out, self.pte_data)
+            # set valid bit
             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 
-            comb += self.updated.eq(1)
-            comb += self.v_updated.eq(1)
+            comb += updated.eq(1)
+            comb += v_updated.eq(1)
+
+        # above, sometimes valid is requested to be updated but data not
+        # therefore split them out, here.  note the granularity thing matches
+        # with the shift-up of the eatag/pte_data into the correct TLB way.
+        # thus is it not necessary to write the entire lot, just the portion
+        # being altered: hence writing the *old* copy of the row is not needed
+        with m.If(updated): # PTE and TAG to be written
+            comb += wr_pteway.data.eq(pb_out)
+            comb += wr_pteway.en.eq(1<<self.repl_way)
+            comb += wr_tagway.data.eq(tb_out)
+            comb += wr_tagway.en.eq(1<<self.repl_way)
+        with m.If(v_updated): # Valid to be written
+            sync += dtlb_valid[tlb_req_index].eq(db_out)
+            #comb += wr_valid.data.eq(db_out)
+            #comb += wr_valid.en.eq(1<<self.repl_way)
+
+        # select one TLB way, use a register here
+        r_delay = Signal()
+        sync += r_delay.eq(self.tlb_read)
+        # first deal with the valids, which are not in a Memory.
+        # tlb way valid is output on a 1 clock delay with sync,
+        # but have to explicitly deal with "forwarding" here
+        with m.If(self.tlb_read):
+            with m.If(v_updated): # write *and* read in same cycle: forward
+                sync += self.tlb_way.valid.eq(db_out)
+            with m.Else():
+                sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
+        # now deal with the Memory-read case. the output must remain
+        # valid (stable) even when a read-request is not made, but stable
+        # on a one-clock delay, hence the register
+        r_tlb_way        = cfg.TLBRecord("r_tlb_way")
+        with m.If(r_delay):
+            # on one clock delay, capture the contents of the read port(s)
+            comb += self.tlb_way.tag.eq(rd_tagway.data)
+            comb += self.tlb_way.pte.eq(rd_pteway.data)
+            sync += r_tlb_way.tag.eq(rd_tagway.data)
+            sync += r_tlb_way.pte.eq(rd_pteway.data)
+        with m.Else():
+            # ... so that the register can output it when no read is requested
+            # it's rather overkill but better to be safe than sorry
+            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
+            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
+            #comb += self.tlb_way.eq(r_tlb_way)
 
         return m
 
 
 class DCachePendingHit(Elaboratable):
 
-    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
+    def __init__(self, cfg, tlb_way,
                       cache_i_validdx, cache_tag_set,
-                    req_addr,
-                    hit_set):
+                    req_addr):
 
         self.go          = Signal()
         self.virt_mode   = Signal()
         self.is_hit      = Signal()
-        self.tlb_hit     = Signal()
-        self.hit_way     = Signal(WAY_BITS)
+        self.tlb_hit     = cfg.TLBHit("tlb_hit")
+        self.hit_way     = Signal(cfg.WAY_BITS)
         self.rel_match   = Signal()
-        self.req_index   = Signal(INDEX_BITS)
-        self.reload_tag  = Signal(TAG_BITS)
+        self.req_index   = Signal(cfg.INDEX_BITS)
+        self.reload_tag  = Signal(cfg.TAG_BITS)
 
-        self.tlb_hit_way = tlb_hit_way
-        self.tlb_pte_way = tlb_pte_way
-        self.tlb_valid_way = tlb_valid_way
+        self.tlb_way = tlb_way
         self.cache_i_validdx = cache_i_validdx
         self.cache_tag_set = cache_tag_set
         self.req_addr = req_addr
-        self.hit_set = hit_set
+        self.cfg = cfg
 
     def elaborate(self, platform):
         m = Module()
@@ -518,22 +645,22 @@ class DCachePendingHit(Elaboratable):
         go = self.go
         virt_mode = self.virt_mode
         is_hit = self.is_hit
-        tlb_pte_way = self.tlb_pte_way
-        tlb_valid_way = self.tlb_valid_way
+        tlb_way = self.tlb_way
         cache_i_validdx = self.cache_i_validdx
         cache_tag_set = self.cache_tag_set
         req_addr = self.req_addr
-        tlb_hit_way = self.tlb_hit_way
         tlb_hit = self.tlb_hit
-        hit_set = self.hit_set
         hit_way = self.hit_way
         rel_match = self.rel_match
         req_index = self.req_index
         reload_tag = self.reload_tag
+        cfg = self.cfg
 
+        hit_set     = Array(Signal(name="hit_set_%d" % i) \
+                                  for i in range(cfg.TLB_NUM_WAYS))
         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
-                                    for i in range(TLB_NUM_WAYS))
-        hit_way_set = HitWaySet()
+                                    for i in range(cfg.TLB_NUM_WAYS))
+        hit_way_set = cfg.HitWaySet()
 
         # Test if pending request is a hit on any way
         # In order to make timing in virtual mode,
@@ -542,38 +669,38 @@ class DCachePendingHit(Elaboratable):
         # the TLB, and then decide later which match to use.
 
         with m.If(virt_mode):
-            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
-                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
-                s_hit       = Signal()
-                s_pte       = Signal(TLB_PTE_BITS)
-                s_ra        = Signal(REAL_ADDR_BITS)
-                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
-                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
-                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
-                comb += s_tag.eq(get_tag(s_ra))
-
-                for i in range(NUM_WAYS): # way_t
+            for j in range(cfg.TLB_NUM_WAYS): # tlb_num_way_t
+                s_tag       = Signal(cfg.TAG_BITS, name="s_tag%d" % j)
+                s_hit       = Signal(name="s_hit%d" % j)
+                s_pte       = Signal(cfg.TLB_PTE_BITS, name="s_pte%d" % j)
+                s_ra        = Signal(cfg.REAL_ADDR_BITS, name="s_ra%d" % j)
+                # read the PTE, calc the Real Address, get tge tag
+                comb += s_pte.eq(cfg.read_tlb_pte(j, tlb_way.pte))
+                comb += s_ra.eq(Cat(req_addr[0:cfg.TLB_LG_PGSZ],
+                                    s_pte[cfg.TLB_LG_PGSZ:cfg.REAL_ADDR_BITS]))
+                comb += s_tag.eq(cfg.get_tag(s_ra))
+                # for each way check tge tag against the cache tag set
+                for i in range(cfg.NUM_WAYS): # way_t
                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                     comb += is_tag_hit.eq(go & cache_i_validdx[i] &
-                                  (read_tag(i, cache_tag_set) == s_tag)
-                                  & tlb_valid_way[j])
+                                  (cfg.read_tag(i, cache_tag_set) == s_tag)
+                                  & (tlb_way.valid[j]))
                     with m.If(is_tag_hit):
                         comb += hit_way_set[j].eq(i)
                         comb += s_hit.eq(1)
                 comb += hit_set[j].eq(s_hit)
-                with m.If(s_tag == reload_tag):
-                    comb += rel_matches[j].eq(1)
-            with m.If(tlb_hit):
-                comb += is_hit.eq(hit_set[tlb_hit_way])
-                comb += hit_way.eq(hit_way_set[tlb_hit_way])
-                comb += rel_match.eq(rel_matches[tlb_hit_way])
+                comb += rel_matches[j].eq(s_tag == reload_tag)
+            with m.If(tlb_hit.valid):
+                comb += is_hit.eq(hit_set[tlb_hit.way])
+                comb += hit_way.eq(hit_way_set[tlb_hit.way])
+                comb += rel_match.eq(rel_matches[tlb_hit.way])
         with m.Else():
-            s_tag       = Signal(TAG_BITS)
-            comb += s_tag.eq(get_tag(req_addr))
-            for i in range(NUM_WAYS): # way_t
+            s_tag       = Signal(cfg.TAG_BITS)
+            comb += s_tag.eq(cfg.get_tag(req_addr))
+            for i in range(cfg.NUM_WAYS): # way_t
                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
-                          (read_tag(i, cache_tag_set) == s_tag))
+                          (cfg.read_tag(i, cache_tag_set) == s_tag))
                 with m.If(is_tag_hit):
                     comb += hit_way.eq(i)
                     comb += is_hit.eq(1)
@@ -583,7 +710,7 @@ class DCachePendingHit(Elaboratable):
         return m
 
 
-class DCache(Elaboratable):
+class DCache(Elaboratable, DCacheConfig):
     """Set associative dcache write-through
 
     TODO (in no specific order):
@@ -592,7 +719,7 @@ class DCache(Elaboratable):
       at the end of line (this requires dealing with requests coming in
       while not idle...)
     """
-    def __init__(self):
+    def __init__(self, pspec=None):
         self.d_in      = LoadStore1ToDCacheType("d_in")
         self.d_out     = DCacheToLoadStore1Type("d_out")
 
@@ -600,12 +727,54 @@ class DCache(Elaboratable):
         self.m_out     = DCacheToMMUType("m_out")
 
         self.stall_out = Signal()
-
-        self.wb_out    = WBMasterOut("wb_out")
-        self.wb_in     = WBSlaveOut("wb_in")
+        self.any_stall_out = Signal()
+        self.dreq_when_stall = Signal()
+        self.mreq_when_stall = Signal()
+
+        # standard naming (wired to non-standard for compatibility)
+        self.bus = Interface(addr_width=32,
+                            data_width=64,
+                            granularity=8,
+                            features={'stall'},
+                            #alignment=0,
+                            name="dcache")
 
         self.log_out   = Signal(20)
 
+        # test if small cache to be enabled
+        self.small_cache = (hasattr(pspec, "small_cache") and
+                                 (pspec.small_cache == True))
+        # test if microwatt compatibility is to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+        # test if fabric compatibility is to be enabled
+        self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+                                 (pspec.fabric_compat == True))
+
+        XLEN = pspec.XLEN
+        TLB_SET_SIZE = 8
+        TLB_NUM_WAYS = 2
+        NUM_LINES = 8
+        NUM_WAYS = 2
+
+        if self.small_cache:
+            # reduce way sizes and num lines to ridiculously small
+            TLB_SET_SIZE = 2
+            TLB_NUM_WAYS = 1
+            NUM_LINES = 2
+            NUM_WAYS = 1
+        if self.microwatt_compat or self.fabric_compat:
+            # reduce way sizes
+            NUM_WAYS = 1
+            TLB_NUM_WAYS = 1
+
+        super().__init__(TLB_SET_SIZE=TLB_SET_SIZE,
+                         # XLEN=XLEN, # TODO
+                         TLB_NUM_WAYS = TLB_NUM_WAYS,
+                         NUM_LINES = NUM_LINES,
+                         NUM_WAYS = NUM_WAYS
+                        )
+
     def stage_0(self, m, r0, r1, r0_full):
         """Latch the request in r0.req as long as we're not stalling
         """
@@ -634,6 +803,7 @@ class DCache(Elaboratable):
             comb += r.doall.eq(m_in.doall)
             comb += r.tlbld.eq(m_in.tlbld)
             comb += r.mmu_req.eq(1)
+            comb += r.d_valid.eq(1)
             m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                  m_in.addr, m_in.pte, r.req.load)
 
@@ -644,25 +814,25 @@ class DCache(Elaboratable):
             comb += r.doall.eq(0)
             comb += r.tlbld.eq(0)
             comb += r.mmu_req.eq(0)
+            comb += r.d_valid.eq(0)
+
+        sync += r0_full.eq(0)
         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
             sync += r0.eq(r)
             sync += r0_full.eq(r.req.valid)
+        with m.Elif(~r0.d_valid):
             # Sample data the cycle after a request comes in from loadstore1.
             # If another request has come in already then the data will get
             # put directly into req.data below.
-            with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
-                     ~r0.mmu_req):
-                sync += r0.req.data.eq(d_in.data)
-                sync += r0.d_valid.eq(1)
+            sync += r0.req.data.eq(d_in.data)
+            sync += r0.d_valid.eq(1)
         with m.If(d_in.valid):
             m.d.sync += Display("    DCACHE req cache "
                                 "virt %d addr %x data %x ld %d",
                                  r.req.virt_mode, r.req.addr,
                                  r.req.data, r.req.load)
 
-    def tlb_read(self, m, r0_stall, tlb_valid_way,
-                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
-                 dtlb_tags, dtlb_ptes):
+    def tlb_read(self, m, r0_stall, tlb_way):
         """TLB
         Operates in the second cycle on the request latched in r0.req.
         TLB updates write the entry at the end of the second cycle.
@@ -671,78 +841,76 @@ class DCache(Elaboratable):
         sync = m.d.sync
         m_in, d_in = self.m_in, self.d_in
 
-        index    = Signal(TLB_SET_BITS)
-        addrbits = Signal(TLB_SET_BITS)
+        addrbits = Signal(self.TLB_SET_BITS)
 
-        amin = TLB_LG_PGSZ
-        amax = TLB_LG_PGSZ + TLB_SET_BITS
+        amin = self.TLB_LG_PGSZ
+        amax = self.TLB_LG_PGSZ + self.TLB_SET_BITS
 
         with m.If(m_in.valid):
             comb += addrbits.eq(m_in.addr[amin : amax])
         with m.Else():
             comb += addrbits.eq(d_in.addr[amin : amax])
-        comb += index.eq(addrbits)
 
         # If we have any op and the previous op isn't finished,
         # then keep the same output for next cycle.
-        with m.If(~r0_stall):
-            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
-            sync += tlb_tag_way.eq(dtlb_tags[index])
-            sync += tlb_pte_way.eq(dtlb_ptes[index])
+        d = self.dtlb_update
+        comb += d.tlb_read_index.eq(addrbits)
+        comb += d.tlb_read.eq(~r0_stall)
+        comb += tlb_way.eq(d.tlb_way)
 
-    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
+    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
         """Generate TLB PLRUs
         """
         comb = m.d.comb
         sync = m.d.sync
 
-        if TLB_NUM_WAYS == 0:
+        if self.TLB_NUM_WAYS == 0:
             return
-        for i in range(TLB_SET_SIZE):
-            # TLB PLRU interface
-            tlb_plru        = PLRU(TLB_WAY_BITS)
-            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
-            tlb_plru_acc_en = Signal()
 
-            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
-            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
-            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
-            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
+        # suite of PLRUs with a selection and output mechanism
+        tlb_plrus = PLRUs("d_tlb", self.TLB_SET_SIZE, self.TLB_WAY_BITS)
+        m.submodules.tlb_plrus = tlb_plrus
+        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
+        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
+        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
+        comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
+        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 
     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
-                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
-                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
+                   tlb_way,
+                   pte, tlb_hit, valid_ra, perm_attr, ra):
 
         comb = m.d.comb
 
-        hitway = Signal(TLB_WAY_BITS)
+        hitway = Signal(self.TLB_WAY_BITS)
         hit    = Signal()
-        eatag  = Signal(TLB_EA_TAG_BITS)
+        eatag  = Signal(self.TLB_EA_TAG_BITS)
 
-        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
-        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
-        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
+        self.TLB_LG_END = self.TLB_LG_PGSZ + self.TLB_SET_BITS
+        r0_req_addr = r0.req.addr[self.TLB_LG_PGSZ : self.TLB_LG_END]
+        comb += tlb_req_index.eq(r0_req_addr)
+        comb += eatag.eq(r0.req.addr[self.TLB_LG_END : 64 ])
 
-        for i in range(TLB_NUM_WAYS):
+        for i in range(self.TLB_NUM_WAYS):
             is_tag_hit = Signal(name="is_tag_hit%d" % i)
-            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
-            comb += tlb_tag.eq(read_tlb_tag(i, tlb_tag_way))
-            comb += is_tag_hit.eq(tlb_valid_way[i] & (tlb_tag == eatag))
+            tlb_tag = Signal(self.TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
+            comb += tlb_tag.eq(self.read_tlb_tag(i, tlb_way.tag))
+            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
             with m.If(is_tag_hit):
                 comb += hitway.eq(i)
                 comb += hit.eq(1)
 
-        comb += tlb_hit.eq(hit & r0_valid)
-        comb += tlb_hit_way.eq(hitway)
+        comb += tlb_hit.valid.eq(hit & r0_valid)
+        comb += tlb_hit.way.eq(hitway)
 
-        with m.If(tlb_hit):
-            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
-        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
+        with m.If(tlb_hit.valid):
+            comb += pte.eq(self.read_tlb_pte(hitway, tlb_way.pte))
+        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
 
         with m.If(r0.req.virt_mode):
-            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
-                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
-                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
+            comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
+                              r0.req.addr[self.ROW_OFF_BITS:self.TLB_LG_PGSZ],
+                              pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))
             comb += perm_attr.reference.eq(pte[8])
             comb += perm_attr.changed.eq(pte[7])
             comb += perm_attr.nocache.eq(pte[5])
@@ -750,8 +918,8 @@ class DCache(Elaboratable):
             comb += perm_attr.rd_perm.eq(pte[2])
             comb += perm_attr.wr_perm.eq(pte[1])
         with m.Else():
-            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
-                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
+            comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
+                          r0.req.addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS]))
             comb += perm_attr.reference.eq(1)
             comb += perm_attr.changed.eq(1)
             comb += perm_attr.nocache.eq(0)
@@ -761,7 +929,7 @@ class DCache(Elaboratable):
 
         with m.If(valid_ra):
             m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
-                                r0.req.virt_mode, tlb_hit, ra, pte)
+                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
             m.d.sync += Display("       perm ref=%d", perm_attr.reference)
             m.d.sync += Display("       perm chg=%d", perm_attr.changed)
             m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
@@ -769,11 +937,8 @@ class DCache(Elaboratable):
             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 
-    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
-                    tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
-                    dtlb_tags, tlb_pte_way, dtlb_ptes):
-
-        dtlb_valids = TLBValidBitsArray()
+    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
+                    tlb_hit, tlb_plru_victim):
 
         comb = m.d.comb
         sync = m.d.sync
@@ -784,33 +949,19 @@ class DCache(Elaboratable):
         comb += tlbie.eq(r0_valid & r0.tlbie)
         comb += tlbwe.eq(r0_valid & r0.tlbld)
 
-        m.submodules.tlb_update = d = DTLBUpdate()
-        with m.If(tlbie & r0.doall):
-            # clear all valid bits at once
-            for i in range(TLB_SET_SIZE):
-                sync += dtlb_valid_bits[i].eq(0)
-        with m.If(d.updated):
-            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
-            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
-        with m.If(d.v_updated):
-            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
-
-        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
+        d = self.dtlb_update
 
         comb += d.tlbie.eq(tlbie)
         comb += d.tlbwe.eq(tlbwe)
         comb += d.doall.eq(r0.doall)
         comb += d.tlb_hit.eq(tlb_hit)
-        comb += d.tlb_hit_way.eq(tlb_hit_way)
-        comb += d.tlb_tag_way.eq(tlb_tag_way)
-        comb += d.tlb_pte_way.eq(tlb_pte_way)
         comb += d.tlb_req_index.eq(tlb_req_index)
 
-        with m.If(tlb_hit):
-            comb += d.repl_way.eq(tlb_hit_way)
+        with m.If(tlb_hit.valid):
+            comb += d.repl_way.eq(tlb_hit.way)
         with m.Else():
-            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
-        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
+            comb += d.repl_way.eq(tlb_plru_victim)
+        comb += d.eatag.eq(r0.req.addr[self.TLB_LG_PGSZ + self.TLB_SET_BITS:64])
         comb += d.pte_data.eq(r0.req.data)
 
     def maybe_plrus(self, m, r1, plru_victim):
@@ -819,44 +970,47 @@ class DCache(Elaboratable):
         comb = m.d.comb
         sync = m.d.sync
 
-        if TLB_NUM_WAYS == 0:
+        if self.TLB_NUM_WAYS == 0:
             return
 
-        for i in range(NUM_LINES):
-            # PLRU interface
-            plru        = PLRU(WAY_BITS)
-            setattr(m.submodules, "plru%d" % i, plru)
-            plru_acc_en = Signal()
+        # suite of PLRUs with a selection and output mechanism
+        m.submodules.plrus = plrus = PLRUs("dtag", self.NUM_LINES,
+                                                   self.WAY_BITS)
+        comb += plrus.way.eq(r1.hit_way)
+        comb += plrus.valid.eq(r1.cache_hit)
+        comb += plrus.index.eq(r1.hit_index)
+        comb += plrus.isel.eq(r1.store_index) # select victim
+        comb += plru_victim.eq(plrus.o_index) # selected victim
 
-            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
-            comb += plru.acc_en.eq(plru_acc_en)
-            comb += plru.acc_i.eq(r1.hit_way)
-            comb += plru_victim[i].eq(plru.lru_o)
-
-    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
+    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set):
         """Cache tag RAM read port
         """
         comb = m.d.comb
         sync = m.d.sync
+
         m_in, d_in = self.m_in, self.d_in
 
-        index = Signal(INDEX_BITS)
+        # synchronous tag read-port: NOT TRANSPARENT (cannot pass through
+        # write-to-a-read at the same time), seems to pass tests ok
+        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(transparent=False)
+
+        index = Signal(self.INDEX_BITS)
 
         with m.If(r0_stall):
             comb += index.eq(req_index)
         with m.Elif(m_in.valid):
-            comb += index.eq(get_index(m_in.addr))
+            comb += index.eq(self.get_index(m_in.addr))
         with m.Else():
-            comb += index.eq(get_index(d_in.addr))
-        sync += cache_tag_set.eq(cache_tags[index])
+            comb += index.eq(self.get_index(d_in.addr))
+        comb += rd_tag.addr.eq(index)
+        comb += cache_tag_set.eq(rd_tag.data) # read-port is a 1-clock delay
 
     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                        r0_valid, r1, cache_valids, replace_way,
                        use_forward1_next, use_forward2_next,
                        req_hit_way, plru_victim, rc_ok, perm_attr,
                        valid_ra, perm_ok, access_ok, req_op, req_go,
-                       tlb_pte_way,
-                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+                       tlb_hit, tlb_way, cache_tag_set,
                        cancel_store, req_same_tag, r0_stall, early_req_row):
         """Cache request parsing and hit detection
         """
@@ -865,19 +1019,17 @@ class DCache(Elaboratable):
         m_in, d_in = self.m_in, self.d_in
 
         is_hit      = Signal()
-        hit_way     = Signal(WAY_BITS)
+        hit_way     = Signal(self.WAY_BITS)
         op          = Signal(Op)
         opsel       = Signal(3)
         go          = Signal()
         nc          = Signal()
-        hit_set     = Array(Signal(name="hit_set_%d" % i) \
-                                  for i in range(TLB_NUM_WAYS))
-        cache_i_validdx = Signal(NUM_WAYS)
+        cache_i_validdx = Signal(self.NUM_WAYS)
 
         # Extract line, row and tag from request
-        comb += req_index.eq(get_index(r0.req.addr))
-        comb += req_row.eq(get_row(r0.req.addr))
-        comb += req_tag.eq(get_tag(ra))
+        comb += req_index.eq(self.get_index(r0.req.addr))
+        comb += req_row.eq(self.get_row(r0.req.addr))
+        comb += req_tag.eq(self.get_tag(ra))
 
         if False: # display on comb is a bit... busy.
             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
@@ -886,17 +1038,15 @@ class DCache(Elaboratable):
         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
         comb += cache_i_validdx.eq(cache_valids[req_index])
 
-        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
-                                tlb_valid_way, tlb_hit_way,
-                                cache_i_validdx, cache_tag_set,
-                                r0.req.addr,
-                                hit_set)
-
+        m.submodules.dcache_pend = dc = DCachePendingHit(self, tlb_way,
+                                            cache_i_validdx, cache_tag_set,
+                                            r0.req.addr)
         comb += dc.tlb_hit.eq(tlb_hit)
         comb += dc.reload_tag.eq(r1.reload_tag)
         comb += dc.virt_mode.eq(r0.req.virt_mode)
         comb += dc.go.eq(go)
         comb += dc.req_index.eq(req_index)
+
         comb += is_hit.eq(dc.is_hit)
         comb += hit_way.eq(dc.hit_way)
         comb += req_same_tag.eq(dc.rel_match)
@@ -907,14 +1057,14 @@ class DCache(Elaboratable):
             # For a store, consider this a hit even if the row isn't
             # valid since it will be by the time we perform the store.
             # For a load, check the appropriate row valid bit.
-            rrow = Signal(ROW_LINE_BITS)
+            rrow = Signal(self.ROW_LINE_BITS)
             comb += rrow.eq(req_row)
             valid = r1.rows_valid[rrow]
             comb += is_hit.eq((~r0.req.load) | valid)
             comb += hit_way.eq(replace_way)
 
         # Whether to use forwarded data for a load or not
-        with m.If((get_row(r1.req.real_addr) == req_row) &
+        with m.If((self.get_row(r1.req.real_addr) == req_row) &
                   (r1.req.hit_way == hit_way)):
             # Only need to consider r1.write_bram here, since if we
             # are writing refill data here, then we don't have a
@@ -933,7 +1083,7 @@ class DCache(Elaboratable):
 
         # The way to replace on a miss
         with m.If(r1.write_tag):
-            comb += replace_way.eq(plru_victim[r1.store_index])
+            comb += replace_way.eq(plru_victim)
         with m.Else():
             comb += replace_way.eq(r1.store_way)
 
@@ -945,6 +1095,7 @@ class DCache(Elaboratable):
                            (perm_attr.wr_perm |
                               (r0.req.load & perm_attr.rd_perm)))
         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
+
         # Combine the request and cache hit status to decide what
         # operation needs to be done
         comb += nc.eq(r0.req.nc | perm_attr.nocache)
@@ -979,9 +1130,9 @@ class DCache(Elaboratable):
         # row requested.
         with m.If(~r0_stall):
             with m.If(m_in.valid):
-                comb += early_req_row.eq(get_row(m_in.addr))
+                comb += early_req_row.eq(self.get_row(m_in.addr))
             with m.Else():
-                comb += early_req_row.eq(get_row(d_in.addr))
+                comb += early_req_row.eq(self.get_row(d_in.addr))
         with m.Else():
             comb += early_req_row.eq(req_row)
 
@@ -999,12 +1150,12 @@ class DCache(Elaboratable):
             with m.Else():
                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                 with m.If((~reservation.valid) |
-                         (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
+                         (r0.req.addr[self.LINE_OFF_BITS:64] !=
+                          reservation.addr)):
                     comb += cancel_store.eq(1)
 
     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                         reservation, r0):
-
         comb = m.d.comb
         sync = m.d.sync
 
@@ -1013,7 +1164,7 @@ class DCache(Elaboratable):
                 sync += reservation.valid.eq(0)
             with m.Elif(set_rsrv):
                 sync += reservation.valid.eq(1)
-                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
+                sync += reservation.addr.eq(r0.req.addr[self.LINE_OFF_BITS:64])
 
     def writeback_control(self, m, r1, cache_out_row):
         """Return data for loads & completion control logic
@@ -1041,6 +1192,7 @@ class DCache(Elaboratable):
                 dsel = data_fwd.word_select(i, 8)
                 comb += data_out.word_select(i, 8).eq(dsel)
 
+        # DCache output to LoadStore
         comb += d_out.valid.eq(r1.ls_valid)
         comb += d_out.data.eq(data_out)
         comb += d_out.store_done.eq(~r1.stcx_fail)
@@ -1115,62 +1267,80 @@ class DCache(Elaboratable):
         account by using 1-cycle delayed signals for load hits.
         """
         comb = m.d.comb
-        wb_in = self.wb_in
+        bus = self.bus
+
+        # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
+        # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
+        m.submodules.rams_replace_way_e = rwe = Decoder(self.NUM_WAYS)
+        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
+                   ~r1.write_bram))
+        comb += rwe.i.eq(replace_way)
+
+        m.submodules.rams_hit_way_e = hwe = Decoder(self.NUM_WAYS)
+        comb += hwe.i.eq(r1.hit_way)
+
+        # this one is gated with write_bram, and replace_way_e can never be
+        # set at the same time.  that means that do_write can OR the outputs
+        m.submodules.rams_hit_req_way_e = hre = Decoder(self.NUM_WAYS)
+        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
+        comb += hre.i.eq(r1.req.hit_way)
+
+        # common Signals
+        do_read  = Signal()
+        wr_addr  = Signal(self.ROW_BITS)
+        wr_data  = Signal(WB_DATA_BITS)
+        wr_sel   = Signal(self.ROW_SIZE)
+        rd_addr  = Signal(self.ROW_BITS)
+
+        comb += do_read.eq(1) # always enable
+        comb += rd_addr.eq(early_req_row)
+
+        # Write mux:
+        #
+        # Defaults to wishbone read responses (cache refill)
+        #
+        # For timing, the mux on wr_data/sel/addr is not
+        # dependent on anything other than the current state.
 
-        for i in range(NUM_WAYS):
-            do_read  = Signal(name="do_rd%d" % i)
-            rd_addr  = Signal(ROW_BITS, name="rd_addr_%d" % i)
+        with m.If(r1.write_bram):
+            # Write store data to BRAM.  This happens one
+            # cycle after the store is in r0.
+            comb += wr_data.eq(r1.req.data)
+            comb += wr_sel.eq(r1.req.byte_sel)
+            comb += wr_addr.eq(self.get_row(r1.req.real_addr))
+
+        with m.Else():
+            # Otherwise, we might be doing a reload or a DCBZ
+            with m.If(r1.dcbz):
+                comb += wr_data.eq(0)
+            with m.Else():
+                comb += wr_data.eq(bus.dat_r)
+            comb += wr_addr.eq(r1.store_row)
+            comb += wr_sel.eq(~0) # all 1s
+
+        # set up Cache Rams
+        for i in range(self.NUM_WAYS):
             do_write = Signal(name="do_wr%d" % i)
-            wr_addr  = Signal(ROW_BITS, name="wr_addr_%d" % i)
-            wr_data  = Signal(WB_DATA_BITS, name="din_%d" % i)
-            wr_sel   = Signal(ROW_SIZE)
-            wr_sel_m = Signal(ROW_SIZE)
-            _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
+            wr_sel_m = Signal(self.ROW_SIZE, name="wr_sel_m_%d" % i)
+            d_out= Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
 
-            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
-            setattr(m.submodules, "cacheram_%d" % i, way)
+            way = CacheRam(self.ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
+            m.submodules["cacheram_%d" % i] = way
 
             comb += way.rd_en.eq(do_read)
             comb += way.rd_addr.eq(rd_addr)
-            comb += _d_out.eq(way.rd_data_o)
+            comb += d_out.eq(way.rd_data_o)
             comb += way.wr_sel.eq(wr_sel_m)
             comb += way.wr_addr.eq(wr_addr)
             comb += way.wr_data.eq(wr_data)
 
             # Cache hit reads
-            comb += do_read.eq(1)
-            comb += rd_addr.eq(early_req_row)
-            with m.If(r1.hit_way == i):
-                comb += cache_out_row.eq(_d_out)
-
-            # Write mux:
-            #
-            # Defaults to wishbone read responses (cache refill)
-            #
-            # For timing, the mux on wr_data/sel/addr is not
-            # dependent on anything other than the current state.
-
-            with m.If(r1.write_bram):
-                # Write store data to BRAM.  This happens one
-                # cycle after the store is in r0.
-                comb += wr_data.eq(r1.req.data)
-                comb += wr_sel.eq(r1.req.byte_sel)
-                comb += wr_addr.eq(get_row(r1.req.real_addr))
-
-                with m.If(i == r1.req.hit_way):
-                    comb += do_write.eq(1)
-            with m.Else():
-                # Otherwise, we might be doing a reload or a DCBZ
-                with m.If(r1.dcbz):
-                    comb += wr_data.eq(0)
-                with m.Else():
-                    comb += wr_data.eq(wb_in.dat)
-                comb += wr_addr.eq(r1.store_row)
-                comb += wr_sel.eq(~0) # all 1s
+            with m.If(hwe.o[i]):
+                comb += cache_out_row.eq(d_out)
 
-                with m.If((r1.state == State.RELOAD_WAIT_ACK)
-                          & wb_in.ack & (replace_way == i)):
-                    comb += do_write.eq(1)
+            # these are mutually-exclusive via their Decoder-enablers
+            # (note: Decoder-enable is inverted)
+            comb += do_write.eq(hre.o[i] | rwe.o[i])
 
             # Mask write selects with do_write since BRAM
             # doesn't have a global write-enable
@@ -1182,8 +1352,7 @@ class DCache(Elaboratable):
     # It also handles error cases (TLB miss, cache paradox)
     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                         req_hit_way, req_index, req_tag, access_ok,
-                        tlb_hit, tlb_hit_way, tlb_req_index):
-
+                        tlb_hit, tlb_req_index):
         comb = m.d.comb
         sync = m.d.sync
 
@@ -1200,15 +1369,9 @@ class DCache(Elaboratable):
         sync += r1.hit_way.eq(req_hit_way)
         sync += r1.hit_index.eq(req_index)
 
-        with m.If(req_op == Op.OP_LOAD_HIT):
-            sync += r1.hit_load_valid.eq(1)
-        with m.Else():
-            sync += r1.hit_load_valid.eq(0)
-
-        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
-            sync += r1.cache_hit.eq(1)
-        with m.Else():
-            sync += r1.cache_hit.eq(0)
+        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
+        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
+                                (req_op == Op.OP_STORE_HIT))
 
         with m.If(req_op == Op.OP_BAD):
             sync += Display("Signalling ld/st error "
@@ -1217,20 +1380,15 @@ class DCache(Elaboratable):
             sync += r1.ls_error.eq(~r0.mmu_req)
             sync += r1.mmu_error.eq(r0.mmu_req)
             sync += r1.cache_paradox.eq(access_ok)
-
         with m.Else():
             sync += r1.ls_error.eq(0)
             sync += r1.mmu_error.eq(0)
             sync += r1.cache_paradox.eq(0)
 
-        with m.If(req_op == Op.OP_STCX_FAIL):
-            sync += r1.stcx_fail.eq(1)
-        with m.Else():
-            sync += r1.stcx_fail.eq(0)
+        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
 
         # Record TLB hit information for updating TLB PLRU
         sync += r1.tlb_hit.eq(tlb_hit)
-        sync += r1.tlb_hit_way.eq(tlb_hit_way)
         sync += r1.tlb_hit_index.eq(tlb_req_index)
 
     # Memory accesses are handled by this state machine:
@@ -1242,23 +1400,27 @@ class DCache(Elaboratable):
     # All wishbone requests generation is done here.
     # This machine operates at stage 1.
     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
-                    cache_valids, r0, replace_way,
+                    r0, replace_way,
                     req_hit_way, req_same_tag,
-                    r0_valid, req_op, cache_tags, req_go, ra):
+                    r0_valid, req_op, cache_valids, req_go, ra):
 
         comb = m.d.comb
         sync = m.d.sync
-        wb_in = self.wb_in
+        bus = self.bus
         d_in = self.d_in
 
-        req         = MemAccessRequest("mreq_ds")
+        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
+                                                    granularity=self.TAG_WIDTH)
 
-        req_row = Signal(ROW_BITS)
-        req_idx = Signal(INDEX_BITS)
-        req_tag = Signal(TAG_BITS)
-        comb += req_idx.eq(get_index(req.real_addr))
-        comb += req_row.eq(get_row(req.real_addr))
-        comb += req_tag.eq(get_tag(req.real_addr))
+        req         = MemAccessRequest(self, "mreq_ds")
+
+        r1_next_cycle = Signal()
+        req_row = Signal(self.ROW_BITS)
+        req_idx = Signal(self.INDEX_BITS)
+        req_tag = Signal(self.TAG_BITS)
+        comb += req_idx.eq(self.get_index(req.real_addr))
+        comb += req_row.eq(self.get_row(req.real_addr))
+        comb += req_tag.eq(self.get_tag(req.real_addr))
 
         sync += r1.use_forward1.eq(use_forward1_next)
         sync += r1.forward_sel.eq(0)
@@ -1273,13 +1435,13 @@ class DCache(Elaboratable):
             sync += r1.forward_data1.eq(r1.req.data)
             sync += r1.forward_sel1.eq(r1.req.byte_sel)
             sync += r1.forward_way1.eq(r1.req.hit_way)
-            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
+            sync += r1.forward_row1.eq(self.get_row(r1.req.real_addr))
             sync += r1.forward_valid1.eq(1)
         with m.Else():
             with m.If(r1.dcbz):
                 sync += r1.forward_data1.eq(0)
             with m.Else():
-                sync += r1.forward_data1.eq(wb_in.dat)
+                sync += r1.forward_data1.eq(bus.dat_r)
             sync += r1.forward_sel1.eq(~0) # all 1s
             sync += r1.forward_way1.eq(replace_way)
             sync += r1.forward_row1.eq(r1.store_row)
@@ -1296,24 +1458,21 @@ class DCache(Elaboratable):
         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
 
         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
-            with m.If(~r0.mmu_req):
-                sync += r1.ls_valid.eq(1)
-            with m.Else():
+            with m.If(r0.mmu_req):
                 sync += r1.mmu_done.eq(1)
+            with m.Else():
+                sync += r1.ls_valid.eq(1)
 
         with m.If(r1.write_tag):
             # Store new tag in selected way
-            for i in range(NUM_WAYS):
-                with m.If(i == replace_way):
-                    ct = Signal(TAG_RAM_WIDTH)
-                    comb += ct.eq(cache_tags[r1.store_index])
-                    """
-TODO: check this
-cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
-                    (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
-                    """
-                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
-                    sync += cache_tags[r1.store_index].eq(ct)
+            replace_way_onehot = Signal(self.NUM_WAYS)
+            comb += replace_way_onehot.eq(1<<replace_way)
+            ct = Signal(self.TAG_RAM_WIDTH)
+            comb += ct.eq(r1.reload_tag << (replace_way*self.TAG_WIDTH))
+            comb += wr_tag.en.eq(replace_way_onehot)
+            comb += wr_tag.addr.eq(r1.store_index)
+            comb += wr_tag.data.eq(ct)
+
             sync += r1.store_way.eq(replace_way)
             sync += r1.write_tag.eq(0)
 
@@ -1354,12 +1513,15 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                       | (req_op == Op.OP_STORE_HIT)):
                 sync += r1.req.eq(req)
                 sync += r1.full.eq(1)
+                # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
+                # destroy r1.req by overwriting r1.full back to zero
+                comb += r1_next_cycle.eq(1)
 
         # Main state machine
         with m.Switch(r1.state):
 
             with m.Case(State.IDLE):
-                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
+                sync += r1.wb.adr.eq(req.real_addr[self.ROW_OFF_BITS:])
                 sync += r1.wb.sel.eq(req.byte_sel)
                 sync += r1.wb.dat.eq(req.data)
                 sync += r1.dcbz.eq(req.dcbz)
@@ -1368,16 +1530,19 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                 # for subsequent stores.
                 sync += r1.store_index.eq(req_idx)
                 sync += r1.store_row.eq(req_row)
-                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
+                sync += r1.end_row_ix.eq(self.get_row_of_line(req_row)-1)
                 sync += r1.reload_tag.eq(req_tag)
                 sync += r1.req.same_tag.eq(1)
 
                 with m.If(req.op == Op.OP_STORE_HIT):
                     sync += r1.store_way.eq(req.hit_way)
 
+                #with m.If(r1.dec_acks):
+                #    sync += r1.acks_pending.eq(r1.acks_pending - 1)
+
                 # Reset per-row valid bits,
                 # ready for handling OP_LOAD_MISS
-                for i in range(ROW_PER_LINE):
+                for i in range(self.ROW_PER_LINE):
                     sync += r1.rows_valid[i].eq(0)
 
                 with m.If(req_op != Op.OP_NONE):
@@ -1413,12 +1578,13 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                             sync += r1.state.eq(State.STORE_WAIT_ACK)
                             sync += r1.acks_pending.eq(1)
                             sync += r1.full.eq(0)
+                            comb += r1_next_cycle.eq(0)
                             sync += r1.slow_valid.eq(1)
 
-                            with m.If(~req.mmu_req):
-                                sync += r1.ls_valid.eq(1)
-                            with m.Else():
+                            with m.If(req.mmu_req):
                                 sync += r1.mmu_done.eq(1)
+                            with m.Else():
+                                sync += r1.ls_valid.eq(1)
 
                             with m.If(req.op == Op.OP_STORE_HIT):
                                 sync += r1.write_bram.eq(1)
@@ -1445,30 +1611,25 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                         pass
 
             with m.Case(State.RELOAD_WAIT_ACK):
-                ld_stbs_done = Signal()
-                # Requests are all sent if stb is 0
-                comb += ld_stbs_done.eq(~r1.wb.stb)
 
                 # If we are still sending requests, was one accepted?
-                with m.If((~wb_in.stall) & r1.wb.stb):
-                    # That was the last word?  We are done sending.
-                    # Clear stb and set ld_stbs_done so we can handle an
-                    # eventual last ack on the same cycle.
+                with m.If((~bus.stall) & r1.wb.stb):
+                    # That was the last word?  We are done sending.  Clear stb
                     # sigh - reconstruct wb adr with 3 extra 0s at front
-                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
-                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
+                    wb_adr = Cat(Const(0, self.ROW_OFF_BITS), r1.wb.adr)
+                    with m.If(self.is_last_row_addr(wb_adr, r1.end_row_ix)):
                         sync += r1.wb.stb.eq(0)
-                        comb += ld_stbs_done.eq(1)
 
                     # Calculate the next row address in the current cache line
-                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
+                    rlen = self.LINE_OFF_BITS-self.ROW_OFF_BITS
+                    row = Signal(rlen)
                     comb += row.eq(r1.wb.adr)
-                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
+                    sync += r1.wb.adr[:rlen].eq(row+1)
 
                 # Incoming acks processing
-                sync += r1.forward_valid1.eq(wb_in.ack)
-                with m.If(wb_in.ack):
-                    srow = Signal(ROW_LINE_BITS)
+                sync += r1.forward_valid1.eq(bus.ack)
+                with m.If(bus.ack):
+                    srow = Signal(self.ROW_LINE_BITS)
                     comb += srow.eq(r1.store_row)
                     sync += r1.rows_valid[srow].eq(1)
 
@@ -1477,27 +1638,31 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                     # Compare the whole address in case the
                     # request in r1.req is not the one that
                     # started this refill.
-                    with m.If(req.valid & r1.req.same_tag &
-                              ((r1.dcbz & r1.req.dcbz) |
-                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
-                                (r1.store_row == get_row(req.real_addr))):
-                        sync += r1.full.eq(0)
+                    rowmatch = Signal()
+                    lastrow = Signal()
+                    comb += rowmatch.eq(r1.store_row ==
+                                        self.get_row(r1.req.real_addr))
+                    comb += lastrow.eq(self.is_last_row(r1.store_row,
+                                                      r1.end_row_ix))
+                    with m.If(r1.full & r1.req.same_tag &
+                              ((r1.dcbz & req.dcbz) |
+                               (r1.req.op == Op.OP_LOAD_MISS)) & rowmatch):
+                        sync += r1.full.eq(r1_next_cycle)
                         sync += r1.slow_valid.eq(1)
-                        with m.If(~r1.mmu_req):
-                            sync += r1.ls_valid.eq(1)
-                        with m.Else():
+                        with m.If(r1.mmu_req):
                             sync += r1.mmu_done.eq(1)
+                        with m.Else():
+                            sync += r1.ls_valid.eq(1)
                         sync += r1.forward_sel.eq(~0) # all 1s
                         sync += r1.use_forward1.eq(1)
 
                     # Check for completion
-                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
-                                                      r1.end_row_ix)):
+                    with m.If(lastrow):
                         # Complete wishbone cycle
                         sync += r1.wb.cyc.eq(0)
 
                         # Cache line is now valid
-                        cv = Signal(INDEX_BITS)
+                        cv = Signal(self.INDEX_BITS)
                         comb += cv.eq(cache_valids[r1.store_index])
                         comb += cv.bit_select(r1.store_way, 1).eq(1)
                         sync += cache_valids[r1.store_index].eq(cv)
@@ -1508,45 +1673,48 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                                          cv, r1.store_index, r1.store_way)
 
                     # Increment store row counter
-                    sync += r1.store_row.eq(next_row(r1.store_row))
+                    sync += r1.store_row.eq(self.next_row(r1.store_row))
 
             with m.Case(State.STORE_WAIT_ACK):
                 st_stbs_done = Signal()
-                acks        = Signal(3)
                 adjust_acks = Signal(3)
 
                 comb += st_stbs_done.eq(~r1.wb.stb)
-                comb += acks.eq(r1.acks_pending)
 
                 with m.If(r1.inc_acks != r1.dec_acks):
                     with m.If(r1.inc_acks):
-                        comb += adjust_acks.eq(acks + 1)
+                        comb += adjust_acks.eq(r1.acks_pending + 1)
                     with m.Else():
-                        comb += adjust_acks.eq(acks - 1)
+                        comb += adjust_acks.eq(r1.acks_pending - 1)
                 with m.Else():
-                    comb += adjust_acks.eq(acks)
+                    comb += adjust_acks.eq(r1.acks_pending)
 
                 sync += r1.acks_pending.eq(adjust_acks)
 
                 # Clear stb when slave accepted request
-                with m.If(~wb_in.stall):
+                with m.If(~bus.stall):
                     # See if there is another store waiting
                     # to be done which is in the same real page.
+                    # (this is when same_tsg is true)
                     with m.If(req.valid):
-                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
-                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
+                        _ra = req.real_addr[self.ROW_OFF_BITS:
+                                            self.SET_SIZE_BITS]
+                        alen = self.SET_SIZE_BITS-self.ROW_OFF_BITS
+                        sync += r1.wb.adr[0:alen].eq(_ra)
                         sync += r1.wb.dat.eq(req.data)
                         sync += r1.wb.sel.eq(req.byte_sel)
 
                     with m.If((adjust_acks < 7) & req.same_tag &
-                                ((req.op == Op.OP_STORE_MISS)
-                                 (req.op == Op.OP_STORE_HIT))):
+                                ((req.op == Op.OP_STORE_MISS) |
+                                 (req.op == Op.OP_STORE_HIT))):
                         sync += r1.wb.stb.eq(1)
                         comb += st_stbs_done.eq(0)
+                        sync += r1.store_way.eq(req.hit_way)
+                        sync += r1.store_row.eq(self.get_row(req.real_addr))
 
                         with m.If(req.op == Op.OP_STORE_HIT):
                             sync += r1.write_bram.eq(1)
-                        sync += r1.full.eq(0)
+                        sync += r1.full.eq(r1_next_cycle)
                         sync += r1.slow_valid.eq(1)
 
                         # Store requests never come from the MMU
@@ -1558,7 +1726,9 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                         comb += st_stbs_done.eq(1)
 
                 # Got ack ? See if complete.
-                with m.If(wb_in.ack):
+                sync += Display("got ack %d %d stbs %d adjust_acks %d",
+                                bus.ack, bus.ack, st_stbs_done, adjust_acks)
+                with m.If(bus.ack):
                     with m.If(st_stbs_done & (adjust_acks == 1)):
                         sync += r1.state.eq(State.IDLE)
                         sync += r1.wb.cyc.eq(0)
@@ -1567,55 +1737,51 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
 
             with m.Case(State.NC_LOAD_WAIT_ACK):
                 # Clear stb when slave accepted request
-                with m.If(~wb_in.stall):
+                with m.If(~bus.stall):
                     sync += r1.wb.stb.eq(0)
 
                 # Got ack ? complete.
-                with m.If(wb_in.ack):
+                with m.If(bus.ack):
                     sync += r1.state.eq(State.IDLE)
-                    sync += r1.full.eq(0)
+                    sync += r1.full.eq(r1_next_cycle)
                     sync += r1.slow_valid.eq(1)
 
-                    with m.If(~r1.mmu_req):
-                        sync += r1.ls_valid.eq(1)
-                    with m.Else():
+                    with m.If(r1.mmu_req):
                         sync += r1.mmu_done.eq(1)
+                    with m.Else():
+                        sync += r1.ls_valid.eq(1)
 
                     sync += r1.forward_sel.eq(~0) # all 1s
                     sync += r1.use_forward1.eq(1)
                     sync += r1.wb.cyc.eq(0)
                     sync += r1.wb.stb.eq(0)
 
-    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
+    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
 
         sync = m.d.sync
-        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
+        d_out, bus, log_out = self.d_out, self.bus, self.log_out
 
-        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
+        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                                stall_out, req_op[:3], d_out.valid, d_out.error,
-                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
+                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                                r1.real_adr[3:6]))
 
     def elaborate(self, platform):
 
         m = Module()
-        comb = m.d.comb
-        d_in = self.d_in
+        comb, sync = m.d.comb, m.d.sync
+        m_in, d_in = self.m_in, self.d_in
 
         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
-        cache_tags       = CacheTagArray()
-        cache_tag_set    = Signal(TAG_RAM_WIDTH)
-        cache_valids = CacheValidBitsArray()
+        cache_valids     = self.CacheValidsArray()
+        cache_tag_set    = Signal(self.TAG_RAM_WIDTH)
 
-        # TODO attribute ram_style : string;
-        # TODO attribute ram_style of cache_tags : signal is "distributed";
+        self.tagmem = Memory(depth=self.NUM_LINES, width=self.TAG_RAM_WIDTH,
+                             attrs={'syn_ramstyle': "block_ram"})
 
         """note: these are passed to nmigen.hdl.Memory as "attributes".
            don't know how, just that they are.
         """
-        dtlb_valid_bits = TLBValidBitsArray()
-        dtlb_tags       = TLBTagsArray()
-        dtlb_ptes       = TLBPtesArray()
         # TODO attribute ram_style of
         #  dtlb_tags : signal is "distributed";
         # TODO attribute ram_style of
@@ -1624,21 +1790,21 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
         r0      = RegStage0("r0")
         r0_full = Signal()
 
-        r1 = RegStage1("r1")
+        r1 = RegStage1(self, "r1")
 
-        reservation = Reservation()
+        reservation = Reservation(self, "rsrv")
 
         # Async signals on incoming request
-        req_index    = Signal(INDEX_BITS)
-        req_row      = Signal(ROW_BITS)
-        req_hit_way  = Signal(WAY_BITS)
-        req_tag      = Signal(TAG_BITS)
+        req_index    = Signal(self.INDEX_BITS)
+        req_row      = Signal(self.ROW_BITS)
+        req_hit_way  = Signal(self.WAY_BITS)
+        req_tag      = Signal(self.TAG_BITS)
         req_op       = Signal(Op)
         req_data     = Signal(64)
         req_same_tag = Signal()
         req_go       = Signal()
 
-        early_req_row     = Signal(ROW_BITS)
+        early_req_row     = Signal(self.ROW_BITS)
 
         cancel_store      = Signal()
         set_rsrv          = Signal()
@@ -1652,28 +1818,25 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
 
         cache_out_row     = Signal(WB_DATA_BITS)
 
-        plru_victim       = PLRUOut()
-        replace_way       = Signal(WAY_BITS)
+        plru_victim       = Signal(self.WAY_BITS)
+        replace_way       = Signal(self.WAY_BITS)
 
         # Wishbone read/write/cache write formatting signals
         bus_sel           = Signal(8)
 
         # TLB signals
-        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
-        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
-        tlb_valid_way = Signal(TLB_NUM_WAYS)
-        tlb_req_index = Signal(TLB_SET_BITS)
-        tlb_hit       = Signal()
-        tlb_hit_way   = Signal(TLB_WAY_BITS)
-        pte           = Signal(TLB_PTE_BITS)
-        ra            = Signal(REAL_ADDR_BITS)
+        tlb_way       = self.TLBRecord("tlb_way")
+        tlb_req_index = Signal(self.TLB_SET_BITS)
+        tlb_hit       = self.TLBHit("tlb_hit")
+        pte           = Signal(self.TLB_PTE_BITS)
+        ra            = Signal(self.REAL_ADDR_BITS)
         valid_ra      = Signal()
         perm_attr     = PermAttr("dc_perms")
         rc_ok         = Signal()
         perm_ok       = Signal()
         access_ok     = Signal()
 
-        tlb_plru_victim = TLBPLRUOut()
+        tlb_plru_victim = Signal(self.TLB_WAY_BITS)
 
         # we don't yet handle collisions between loadstore1 requests
         # and MMU requests
@@ -1683,37 +1846,50 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
         comb += self.stall_out.eq(r0_stall)
-
-        # Wire up wishbone request latch out of stage 1
-        comb += self.wb_out.eq(r1.wb)
+        # debugging: detect if any stall ever requested, which is fine,
+        # but if a request comes in when stall requested, that's bad.
+        with m.If(r0_stall):
+            sync += self.any_stall_out.eq(1)
+            with m.If(d_in.valid):
+                sync += self.dreq_when_stall.eq(1)
+            with m.If(m_in.valid):
+                sync += self.mreq_when_stall.eq(1)
 
         # deal with litex not doing wishbone pipeline mode
         # XXX in wrong way.  FIFOs are needed in the SRAM test
-        # so that stb/ack match up
-        comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
+        # so that stb/ack match up. same thing done in icache.py
+        if not self.microwatt_compat or self.fabric_compat:
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
+        # Wire up wishbone request latch out of stage 1
+        comb += self.bus.we.eq(r1.wb.we)
+        comb += self.bus.adr.eq(r1.wb.adr)
+        comb += self.bus.sel.eq(r1.wb.sel)
+        comb += self.bus.stb.eq(r1.wb.stb)
+        comb += self.bus.dat_w.eq(r1.wb.dat)
+        comb += self.bus.cyc.eq(r1.wb.cyc)
+
+        # create submodule TLBUpdate
+        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate(self)
 
         # call sub-functions putting everything together, using shared
         # signals established above
         self.stage_0(m, r0, r1, r0_full)
-        self.tlb_read(m, r0_stall, tlb_valid_way,
-                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
-                      dtlb_tags, dtlb_ptes)
+        self.tlb_read(m, r0_stall, tlb_way)
         self.tlb_search(m, tlb_req_index, r0, r0_valid,
-                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
-                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
-        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
-                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
-                        dtlb_tags, tlb_pte_way, dtlb_ptes)
+                        tlb_way,
+                        pte, tlb_hit, valid_ra, perm_attr, ra)
+        self.tlb_update(m, r0_valid, r0, tlb_req_index,
+                        tlb_hit, tlb_plru_victim)
         self.maybe_plrus(m, r1, plru_victim)
-        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
-        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
+        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
+        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set)
         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valids, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
-                           tlb_pte_way,
-                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+                           tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall, early_req_row)
         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                            r0_valid, r0, reservation)
@@ -1723,12 +1899,12 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                         req_hit_way, req_index, req_tag, access_ok,
-                        tlb_hit, tlb_hit_way, tlb_req_index)
+                        tlb_hit, tlb_req_index)
         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
-                    cache_valids, r0, replace_way,
+                    r0, replace_way,
                     req_hit_way, req_same_tag,
-                         r0_valid, req_op, cache_tags, req_go, ra)
-        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
+                         r0_valid, req_op, cache_valids, req_go, ra)
+        #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
 
         return m
 
diff --git a/src/soc/experiment/formal/proof_compalu_multi.py b/src/soc/experiment/formal/proof_compalu_multi.py
new file mode 100644 (file)
index 0000000..96b61a2
--- /dev/null
@@ -0,0 +1,211 @@
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Cesar Strauss <cestrauss@gmail.com>
+# Sponsored by NLnet under EU Grant and 957073
+# Part of the Libre-SOC Project.
+
+"""
+Formal proof of soc.experiment.compalu_multi.MultiCompUnit
+
+In short, MultiCompUnit:
+
+1) stores an opcode from Issue, when not "busy", and "issue" is pulsed
+2) signals "busy" high
+3) fetches its operand(s), if any (which are not masked or zero) from the
+Scoreboard (REL/GO protocol)
+4) starts the ALU (ready/valid protocol), as soon as all inputs are available
+5) captures result from ALU (again ready/valid)
+5) sends the result(s) back to the Scoreboard (again REL/GO)
+6) drops "busy"
+
+Note that, if the conditions are right, many of the above can occur together,
+on a single cycle.
+
+The formal proof involves ensuring that:
+1) the ALU gets the right opcode from Issue
+2) the ALU gets the right operands from the Scoreboard
+3) the Scoreboard receives the right result from the ALU
+4) no transactions are dropped or repeated
+
+This can be checked using holding registers and transaction counters.
+
+See https://bugs.libre-soc.org/show_bug.cgi?id=879 and
+https://bugs.libre-soc.org/show_bug.cgi?id=197
+"""
+
+import unittest
+
+from nmigen import Signal, Module
+from nmigen.hdl.ast import Cover, Const, Assume, Assert
+from nmutil.formaltest import FHDLTestCase
+from nmutil.singlepipe import ControlBase
+
+from soc.experiment.compalu_multi import MultiCompUnit
+from soc.fu.alu.alu_input_record import CompALUOpSubset
+
+
+# Formal model of a simple ALU, whose inputs and outputs are randomly
+# generated by the formal engine
+
+class ALUCtx:
+    def __init__(self):
+        self.op = CompALUOpSubset(name="op")
+
+
+class ALUInput:
+    def __init__(self):
+        self.a = Signal(16)
+        self.b = Signal(16)
+        self.ctx = ALUCtx()
+
+    def eq(self, i):
+        return [self.a.eq(i.a), self.b.eq(i.b)]
+
+
+class ALUOutput:
+    def __init__(self):
+        self.o1 = Signal(16)
+        self.o2 = Signal(16)
+
+    def eq(self, i):
+        return [self.o1.eq(i.o1), self.o2.eq(i.o2)]
+
+
+class ALU(ControlBase):
+    def __init__(self):
+        super().__init__(stage=self)
+        self.p.i_data, self.n.o_data = self.new_specs(None)
+        self.i, self.o = self.p.i_data, self.n.o_data
+
+    def setup(self, m, i):
+        pass
+
+    def ispec(self, name=None):
+        return ALUInput()
+
+    def ospec(self, name=None):
+        return ALUOutput()
+
+    def elaborate(self, platform):
+        m = super().elaborate(platform)
+        return m
+
+
+class CompALUMultiTestCase(FHDLTestCase):
+    def test_formal(self):
+        inspec = [('INT', 'a', '0:15'),
+                  ('INT', 'b', '0:15')]
+        outspec = [('INT', 'o1', '0:15'),
+                   ('INT', 'o2', '0:15')]
+        regspec = (inspec, outspec)
+        m = Module()
+        # Instantiate "random" ALU
+        alu = ALU()
+        m.submodules.dut = dut = MultiCompUnit(regspec, alu, CompALUOpSubset)
+        # TODO Test shadow / die
+        m.d.comb += [dut.shadown_i.eq(1), dut.go_die_i.eq(0)]
+        # Don't issue while busy
+        issue = Signal()
+        m.d.comb += dut.issue_i.eq(issue & ~dut.busy_o)
+        # Avoid toggling go_i when rel_o is low (rel / go protocol)
+        rd_go = Signal(dut.n_src)
+        m.d.comb += dut.cu.rd.go_i.eq(rd_go & dut.cu.rd.rel_o)
+        wr_go = Signal(dut.n_dst)
+        m.d.comb += dut.cu.wr.go_i.eq(wr_go & dut.cu.wr.rel_o)
+        # Transaction counters
+        do_issue = Signal()
+        m.d.comb += do_issue.eq(dut.issue_i & ~dut.busy_o)
+        cnt_issue = Signal(4)
+        m.d.sync += cnt_issue.eq(cnt_issue + do_issue)
+        do_read = Signal(dut.n_src)
+        m.d.comb += do_read.eq(dut.cu.rd.rel_o & dut.cu.rd.go_i)
+        cnt_read = []
+        for i in range(dut.n_src):
+            cnt = Signal(4, name="cnt_read_%d" % i)
+            m.d.sync += cnt.eq(cnt + do_read[i])
+            cnt_read.append(cnt)
+        do_write = Signal(dut.n_dst)
+        m.d.comb += do_write.eq(dut.cu.wr.rel_o & dut.cu.wr.go_i)
+        cnt_write = []
+        for i in range(dut.n_dst):
+            cnt = Signal(4, name="cnt_write_%d" % i)
+            m.d.sync += cnt.eq(cnt + do_write[i])
+            cnt_write.append(cnt)
+        do_alu_write = Signal()
+        m.d.comb += do_alu_write.eq(alu.p.i_valid & alu.p.o_ready)
+        cnt_alu_write = Signal(4)
+        m.d.sync += cnt_alu_write.eq(cnt_alu_write + do_alu_write)
+        do_alu_read = Signal()
+        m.d.comb += do_alu_read.eq(alu.n.o_valid & alu.n.i_ready)
+        cnt_alu_read = Signal(4)
+        m.d.sync += cnt_alu_read.eq(cnt_alu_read + do_alu_read)
+        cnt_masked_read = []
+        do_masked_read = Signal(dut.n_src)
+        for i in range(dut.n_src):
+            cnt = Signal(4, name="cnt_masked_read_%d" % i)
+            if i == 0:
+                extra = dut.oper_i.zero_a
+            elif i == 1:
+                extra = dut.oper_i.imm_data.ok
+            else:
+                extra = Const(0, 1)
+            m.d.comb += do_masked_read[i].eq(do_issue &
+                                             (dut.rdmaskn[i] | extra))
+            m.d.sync += cnt.eq(cnt + do_masked_read[i])
+            cnt_masked_read.append(cnt)
+        # If the ALU is idle, do not assert valid
+        with m.If((cnt_alu_read == cnt_alu_write) & ~do_alu_write):
+            m.d.comb += Assume(~alu.n.o_valid)
+        # Keep ALU valid high, until read
+        last_alu_valid = Signal()
+        m.d.sync += last_alu_valid.eq(alu.n.o_valid & ~alu.n.i_ready)
+        with m.If(last_alu_valid):
+            m.d.comb += Assume(alu.n.o_valid)
+
+        # Invariant checks
+
+        # For every instruction issued, at any point in time,
+        # each operand was either:
+        # 1) Already read
+        # 2) Not read yet, but the read is pending (rel_o high)
+        # 3) Masked
+        for i in range(dut.n_src):
+            sum_read = Signal(4)
+            m.d.comb += sum_read.eq(
+                cnt_read[i] + cnt_masked_read[i] + dut.cu.rd.rel_o[i])
+            m.d.comb += Assert(sum_read == cnt_issue)
+
+        # For every instruction, either:
+        # 1) The ALU is executing the instruction
+        # 2) Otherwise, execution is pending (alu.p.i_valid is high)
+        # 3) Otherwise, it is waiting for operands
+        #    (some dut.cu.rd.rel_o are still high)
+        # 4) ... unless all operands are masked, in which case there is a one
+        #    cycle delay
+        all_masked = Signal()
+        m.d.sync += all_masked.eq(do_masked_read.all())
+        sum_alu_write = Signal(4)
+        m.d.comb += sum_alu_write.eq(
+            cnt_alu_write +
+            (dut.cu.rd.rel_o.any() | all_masked | alu.p.i_valid))
+        m.d.comb += Assert(sum_alu_write == cnt_issue)
+
+        # Ask the formal engine to give an example
+        m.d.comb += Cover((cnt_issue == 2)
+                          & (cnt_read[0] == 1)
+                          & (cnt_read[1] == 0)
+                          & (cnt_write[0] == 1)
+                          & (cnt_write[1] == 1)
+                          & (cnt_alu_write == 1)
+                          & (cnt_alu_read == 1)
+                          & (cnt_masked_read[0] == 1)
+                          & (cnt_masked_read[1] == 1))
+        with self.subTest("cover"):
+            self.assertFormal(m, mode="cover", depth=10)
+
+        # Check assertions
+        with self.subTest("bmc"):
+            self.assertFormal(m, mode="bmc", depth=10)
+
+
+if __name__ == "__main__":
+    unittest.main()
index 1b8aa8586a761337cf5cb09359b807cd66576516..064f39b629e2388616a47be04726cd1c290b1853 100644 (file)
@@ -17,18 +17,28 @@ TODO (in no specific order):
   write TAG_BITS width which may not match full ram blocks and might
   cause muxes to be inferred for "partial writes".
 * Check if making the read size of PLRU a ROM helps utilization
+
+Links:
+
+* https://bugs.libre-soc.org/show_bug.cgi?id=485
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+  (discussion about brams for ECP5)
+
 """
 
 from enum import (Enum, unique)
-from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
+from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
+                    Record)
 from nmigen.cli import main, rtlil
 from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
+from nmigen.lib.coding import Decoder
 from nmutil.util import Display
+from nmutil.latch import SRLatch
 
 #from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
 from soc.experiment.cache_ram import CacheRam
-from soc.experiment.plru import PLRU
 
 from soc.experiment.mem_types import (Fetch1ToICacheType,
                                       ICacheToDecode1Type,
@@ -37,8 +47,11 @@ from soc.experiment.mem_types import (Fetch1ToICacheType,
 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                      WB_SEL_BITS, WBAddrType, WBDataType,
                                      WBSelType, WBMasterOut, WBSlaveOut,
-                                     WBMasterOutVector, WBSlaveOutVector,
-                                     WBIOMasterOut, WBIOSlaveOut)
+                                     )
+
+from nmigen_soc.wishbone.bus import Interface
+from soc.minerva.units.fetch import FetchUnitInterface
+
 
 # for test
 from soc.bus.sram import SRAM
@@ -50,225 +63,216 @@ from nmigen.cli import main, rtlil
 # Also, check out the cxxsim nmigen branch, and latest yosys from git
 from nmutil.sim_tmp_alternative import Simulator, Settle
 
+# from microwatt/utils.vhdl
+def ispow2(n):
+    return n != 0 and (n & (n - 1)) == 0
 
 SIM            = 0
-LINE_SIZE      = 64
-# BRAM organisation: We never access more than wishbone_data_bits
-# at a time so to save resources we make the array only that wide,
-# and use consecutive indices for to make a cache "line"
-#
-# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
-ROW_SIZE       = WB_DATA_BITS // 8
-# Number of lines in a set
-NUM_LINES      = 16
-# Number of ways
-NUM_WAYS       = 4
-# L1 ITLB number of entries (direct mapped)
-TLB_SIZE       = 64
-# L1 ITLB log_2(page_size)
-TLB_LG_PGSZ    = 12
-# Number of real address bits that we store
-REAL_ADDR_BITS = 56
 # Non-zero to enable log data collection
 LOG_LENGTH     = 0
 
-ROW_SIZE_BITS  = ROW_SIZE * 8
-# ROW_PER_LINE is the number of row (wishbone) transactions in a line
-ROW_PER_LINE   = LINE_SIZE // ROW_SIZE
-# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
-BRAM_ROWS      = NUM_LINES * ROW_PER_LINE
-# INSN_PER_ROW is the number of 32bit instructions per BRAM row
-INSN_PER_ROW   = ROW_SIZE_BITS // 32
-
-# Bit fields counts in the address
-#
-# INSN_BITS is the number of bits to select an instruction in a row
-INSN_BITS      = log2_int(INSN_PER_ROW)
-# ROW_BITS is the number of bits to select a row
-ROW_BITS       = log2_int(BRAM_ROWS)
-# ROW_LINE_BITS is the number of bits to select a row within a line
-ROW_LINE_BITS  = log2_int(ROW_PER_LINE)
-# LINE_OFF_BITS is the number of bits for the offset in a cache line
-LINE_OFF_BITS  = log2_int(LINE_SIZE)
-# ROW_OFF_BITS is the number of bits for the offset in a row
-ROW_OFF_BITS   = log2_int(ROW_SIZE)
-# INDEX_BITS is the number of bits to select a cache line
-INDEX_BITS     = log2_int(NUM_LINES)
-# SET_SIZE_BITS is the log base 2 of the set size
-SET_SIZE_BITS  = LINE_OFF_BITS + INDEX_BITS
-# TAG_BITS is the number of bits of the tag part of the address
-TAG_BITS       = REAL_ADDR_BITS - SET_SIZE_BITS
-# TAG_WIDTH is the width in bits of each way of the tag RAM
-TAG_WIDTH      = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
-
-# WAY_BITS is the number of bits to select a way
-WAY_BITS       = log2_int(NUM_WAYS)
-TAG_RAM_WIDTH  = TAG_BITS * NUM_WAYS
-
-# L1 ITLB
-TLB_BITS        = log2_int(TLB_SIZE)
-TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
-TLB_PTE_BITS    = 64
-
-print("BRAM_ROWS       =", BRAM_ROWS)
-print("INDEX_BITS      =", INDEX_BITS)
-print("INSN_BITS       =", INSN_BITS)
-print("INSN_PER_ROW    =", INSN_PER_ROW)
-print("LINE_SIZE       =", LINE_SIZE)
-print("LINE_OFF_BITS   =", LINE_OFF_BITS)
-print("LOG_LENGTH      =", LOG_LENGTH)
-print("NUM_LINES       =", NUM_LINES)
-print("NUM_WAYS        =", NUM_WAYS)
-print("REAL_ADDR_BITS  =", REAL_ADDR_BITS)
-print("ROW_BITS        =", ROW_BITS)
-print("ROW_OFF_BITS    =", ROW_OFF_BITS)
-print("ROW_LINE_BITS   =", ROW_LINE_BITS)
-print("ROW_PER_LINE    =", ROW_PER_LINE)
-print("ROW_SIZE        =", ROW_SIZE)
-print("ROW_SIZE_BITS   =", ROW_SIZE_BITS)
-print("SET_SIZE_BITS   =", SET_SIZE_BITS)
-print("SIM             =", SIM)
-print("TAG_BITS        =", TAG_BITS)
-print("TAG_RAM_WIDTH   =", TAG_RAM_WIDTH)
-print("TAG_BITS        =", TAG_BITS)
-print("TLB_BITS        =", TLB_BITS)
-print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
-print("TLB_LG_PGSZ     =", TLB_LG_PGSZ)
-print("TLB_PTE_BITS    =", TLB_PTE_BITS)
-print("TLB_SIZE        =", TLB_SIZE)
-print("WAY_BITS        =", WAY_BITS)
-
-# from microwatt/utils.vhdl
-def ispow2(n):
-    return n != 0 and (n & (n - 1)) == 0
-
-assert LINE_SIZE % ROW_SIZE == 0
-assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
-assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
-assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
-assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
-assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
-    "geometry bits don't add up"
-assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
-   "geometry bits don't add up"
-assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
-    "geometry bits don't add up"
-assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
-    "geometry bits don't add up"
-
-# Example of layout for 32 lines of 64 bytes:
-#
-# ..  tag    |index|  line  |
-# ..         |   row   |    |
-# ..         |     |   | |00| zero          (2)
-# ..         |     |   |-|  | INSN_BITS     (1)
-# ..         |     |---|    | ROW_LINE_BITS  (3)
-# ..         |     |--- - --| LINE_OFF_BITS (6)
-# ..         |         |- --| ROW_OFF_BITS  (3)
-# ..         |----- ---|    | ROW_BITS      (8)
-# ..         |-----|        | INDEX_BITS    (5)
-# .. --------|              | TAG_BITS      (53)
-
-# The cache data BRAM organized as described above for each way
-#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
-#
-# The cache tags LUTRAM has a row per set. Vivado is a pain and will
-# not handle a clean (commented) definition of the cache tags as a 3d
-# memory. For now, work around it by putting all the tags
-def CacheTagArray():
-    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" %x) \
-                 for x in range(NUM_LINES))
-
-# The cache valid bits
-def CacheValidBitsArray():
-    return Array(Signal(NUM_WAYS, name="cachevalid_%d" %x) \
-                 for x in range(NUM_LINES))
-
-def RowPerLineValidArray():
-    return Array(Signal(name="rows_valid_%d" %x) \
-                 for x in range(ROW_PER_LINE))
-
-
-# TODO to be passed to nigmen as ram attributes
-# attribute ram_style : string;
-# attribute ram_style of cache_tags : signal is "distributed";
-
-
-def TLBValidBitsArray():
-    return Array(Signal(name="tlbvalid_%d" %x) \
-                 for x in range(TLB_SIZE))
-
-def TLBTagArray():
-    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" %x) \
-                 for x in range(TLB_SIZE))
-
-def TLBPtesArray():
-    return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" %x) \
-                 for x in range(TLB_SIZE))
-
-# Cache RAM interface
-def CacheRamOut():
-    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
-                 for x in range(NUM_WAYS))
-
-# PLRU output interface
-def PLRUOut():
-    return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
-                 for x in range(NUM_LINES))
-
-# Return the cache line index (tag index) for an address
-def get_index(addr):
-    return addr[LINE_OFF_BITS:SET_SIZE_BITS]
-
-# Return the cache row index (data memory) for an address
-def get_row(addr):
-    return addr[ROW_OFF_BITS:SET_SIZE_BITS]
-
-# Return the index of a row within a line
-def get_row_of_line(row):
-    return row[:ROW_LINE_BITS]
-
-# Returns whether this is the last row of a line
-def is_last_row_addr(addr, last):
-    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
-
-# Returns whether this is the last row of a line
-def is_last_row(row, last):
-    return get_row_of_line(row) == last
-
-# Return the next row in the current cache line. We use a dedicated
-# function in order to limit the size of the generated adder to be
-# only the bits within a cache line (3 bits with default settings)
-def next_row(row):
-    row_v = row[0:ROW_LINE_BITS] + 1
-    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
-
-# Read the instruction word for the given address
-# in the current cache row
-def read_insn_word(addr, data):
-    word = addr[2:INSN_BITS+2]
-    return data.word_select(word, 32)
-
-# Get the tag value from the address
-def get_tag(addr):
-    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
-
-# Read a tag from a tag memory row
-def read_tag(way, tagset):
-    return tagset.word_select(way, TAG_BITS)
-
-# Write a tag to tag memory row
-def write_tag(way, tagset, tag):
-    return read_tag(way, tagset).eq(tag)
-
-# Simple hash for direct-mapped TLB index
-def hash_ea(addr):
-    hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
-           TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
-          ] ^ addr[
-           TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
-          ]
-    return hsh
+class ICacheConfig:
+    def __init__(self, XLEN          = 64,
+                       LINE_SIZE     = 64,
+                       NUM_LINES     = 64,  # Number of lines in a set
+                       NUM_WAYS      = 2,  # Number of ways
+                       TLB_SIZE      = 64,  # L1 ITLB number of entries
+                       TLB_LG_PGSZ   = 12): # L1 ITLB log_2(page_size)
+        self.XLEN           = XLEN
+        self.LINE_SIZE      = LINE_SIZE
+        self.NUM_LINES      = NUM_LINES
+        self.NUM_WAYS       = NUM_WAYS
+        self.TLB_SIZE       = TLB_SIZE
+        self.TLB_LG_PGSZ    = TLB_LG_PGSZ
+
+        # BRAM organisation: We never access more than wishbone_data_bits
+        # at a time so to save resources we make the array only that wide,
+        # and use consecutive indices for to make a cache "line"
+        #
+        # self.ROW_SIZE is the width in bytes of the BRAM
+        # (based on WB, so 64-bits)
+        self.ROW_SIZE       = WB_DATA_BITS // 8
+        # Number of real address bits that we store
+        self.REAL_ADDR_BITS = XLEN-8 # 56 for XLEN=64
+
+        self.ROW_SIZE_BITS  = self.ROW_SIZE * 8
+        # ROW_PER_LINE is the number of row (wishbone) transactions in a line
+        self.ROW_PER_LINE   = self.LINE_SIZE // self.ROW_SIZE
+        # BRAM_ROWS is the number of rows in BRAM
+        # needed to represent the full icache
+        self.BRAM_ROWS      = self.NUM_LINES * self.ROW_PER_LINE
+        # INSN_PER_ROW is the number of 32bit instructions per BRAM row
+        self.INSN_PER_ROW   = self.ROW_SIZE_BITS // 32
+
+        # Bit fields counts in the address
+        #
+        # INSN_BITS is the number of bits to select an instruction in a row
+        self.INSN_BITS      = log2_int(self.INSN_PER_ROW)
+        # ROW_BITS is the number of bits to select a row
+        self.ROW_BITS       = log2_int(self.BRAM_ROWS)
+        # ROW_LINE_BITS is the number of bits to select a row within a line
+        self.ROW_LINE_BITS  = log2_int(self.ROW_PER_LINE)
+        # LINE_OFF_BITS is the number of bits for the offset in a cache line
+        self.LINE_OFF_BITS  = log2_int(self.LINE_SIZE)
+        # ROW_OFF_BITS is the number of bits for the offset in a row
+        self.ROW_OFF_BITS   = log2_int(self.ROW_SIZE)
+        # INDEX_BITS is the number of bits to select a cache line
+        self.INDEX_BITS     = log2_int(self.NUM_LINES)
+        # SET_SIZE_BITS is the log base 2 of the set size
+        self.SET_SIZE_BITS  = self.LINE_OFF_BITS + self.INDEX_BITS
+        # TAG_BITS is the number of bits of the tag part of the address
+        self.TAG_BITS       = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
+        # TAG_WIDTH is the width in bits of each way of the tag RAM
+        self.TAG_WIDTH      = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
+
+        # WAY_BITS is the number of bits to select a way
+        self.WAY_BITS       = log2_int(self.NUM_WAYS)
+        self.TAG_RAM_WIDTH  = self.TAG_BITS * self.NUM_WAYS
+
+        # L1 ITLB
+        self.TL_BITS        = log2_int(self.TLB_SIZE)
+        self.TLB_EA_TAG_BITS = XLEN - (self.TLB_LG_PGSZ + self.TL_BITS)
+        self.TLB_PTE_BITS    = XLEN
+
+        print("self.XLEN            =", self.XLEN)
+        print("self.BRAM_ROWS       =", self.BRAM_ROWS)
+        print("self.INDEX_BITS      =", self.INDEX_BITS)
+        print("self.INSN_BITS       =", self.INSN_BITS)
+        print("self.INSN_PER_ROW    =", self.INSN_PER_ROW)
+        print("self.LINE_SIZE       =", self.LINE_SIZE)
+        print("self.LINE_OFF_BITS   =", self.LINE_OFF_BITS)
+        print("LOG_LENGTH      =", LOG_LENGTH)
+        print("self.NUM_LINES       =", self.NUM_LINES)
+        print("self.NUM_WAYS        =", self.NUM_WAYS)
+        print("self.REAL_ADDR_BITS  =", self.REAL_ADDR_BITS)
+        print("self.ROW_BITS        =", self.ROW_BITS)
+        print("self.ROW_OFF_BITS    =", self.ROW_OFF_BITS)
+        print("self.ROW_LINE_BITS   =", self.ROW_LINE_BITS)
+        print("self.ROW_PER_LINE    =", self.ROW_PER_LINE)
+        print("self.ROW_SIZE        =", self.ROW_SIZE)
+        print("self.ROW_SIZE_BITS   =", self.ROW_SIZE_BITS)
+        print("self.SET_SIZE_BITS   =", self.SET_SIZE_BITS)
+        print("SIM             =", SIM)
+        print("self.TAG_BITS        =", self.TAG_BITS)
+        print("self.TAG_RAM_WIDTH   =", self.TAG_RAM_WIDTH)
+        print("self.TAG_BITS        =", self.TAG_BITS)
+        print("self.TL_BITS        =", self.TL_BITS)
+        print("self.TLB_EA_TAG_BITS =", self.TLB_EA_TAG_BITS)
+        print("self.TLB_LG_PGSZ     =", self.TLB_LG_PGSZ)
+        print("self.TLB_PTE_BITS    =", self.TLB_PTE_BITS)
+        print("self.TLB_SIZE        =", self.TLB_SIZE)
+        print("self.WAY_BITS        =", self.WAY_BITS)
+        print()
+
+        assert self.LINE_SIZE % self.ROW_SIZE == 0
+        assert ispow2(self.LINE_SIZE), "self.LINE_SIZE not power of 2"
+        assert ispow2(self.NUM_LINES), "self.NUM_LINES not power of 2"
+        assert ispow2(self.ROW_PER_LINE), "self.ROW_PER_LINE not power of 2"
+        assert ispow2(self.INSN_PER_ROW), "self.INSN_PER_ROW not power of 2"
+        assert (self.ROW_BITS == (self.INDEX_BITS + self.ROW_LINE_BITS)), \
+            "geometry bits don't add up"
+        assert (self.LINE_OFF_BITS ==
+            (self.ROW_OFF_BITS + self.ROW_LINE_BITS)), \
+           "geometry bits don't add up"
+        assert (self.REAL_ADDR_BITS ==
+            (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS)), \
+            "geometry bits don't add up"
+        assert (self.REAL_ADDR_BITS ==
+            (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS)), \
+            "geometry bits don't add up"
+
+        # Example of layout for 32 lines of 64 bytes:
+        #
+        # ..  tag    |index|  line  |
+        # ..         |   row   |    |
+        # ..         |     |   | |00| zero          (2)
+        # ..         |     |   |-|  | self.INSN_BITS     (1)
+        # ..         |     |---|    | self.ROW_LINE_BITS  (3)
+        # ..         |     |--- - --| self.LINE_OFF_BITS (6)
+        # ..         |         |- --| self.ROW_OFF_BITS  (3)
+        # ..         |----- ---|    | self.ROW_BITS      (8)
+        # ..         |-----|        | self.INDEX_BITS    (5)
+        # .. --------|              | self.TAG_BITS      (53)
+
+    # The cache data BRAM organized as described above for each way
+    #subtype cache_row_t is std_ulogic_vector(self.ROW_SIZE_BITS-1 downto 0);
+    #
+    def RowPerLineValidArray(self):
+        return Array(Signal(name="rows_valid_%d" %x) \
+                     for x in range(self.ROW_PER_LINE))
+
+
+    # TODO to be passed to nigmen as ram attributes
+    # attribute ram_style : string;
+    # attribute ram_style of cache_tags : signal is "distributed";
+
+    def TLBRecord(self, name):
+        tlb_layout = [ ('tag', self.TLB_EA_TAG_BITS),
+                      ('pte', self.TLB_PTE_BITS)
+                     ]
+        return Record(tlb_layout, name=name)
+
+    def TLBArray(self):
+        return Array(self.TLBRecord("tlb%d" % x) for x in range(self.TLB_SIZE))
+
+    # PLRU output interface
+    def PLRUOut(self):
+        return Array(Signal(self.WAY_BITS, name="plru_out_%d" %x) \
+                     for x in range(self.NUM_LINES))
+
+    # Return the cache line index (tag index) for an address
+    def get_index(self, addr):
+        return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
+
+    # Return the cache row index (data memory) for an address
+    def get_row(self, addr):
+        return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
+
+    # Return the index of a row within a line
+    def get_row_of_line(self, row):
+        return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
+
+    # Returns whether this is the last row of a line
+    def is_last_row_addr(self, addr, last):
+        return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
+
+    # Returns whether this is the last row of a line
+    def is_last_row(self, row, last):
+        return self.get_row_of_line(row) == last
+
+    # Return the next row in the current cache line. We use a dedicated
+    # function in order to limit the size of the generated adder to be
+    # only the bits within a cache line (3 bits with default settings)
+    def next_row(self, row):
+        row_v = row[0:self.ROW_LINE_BITS] + 1
+        return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
+
+    # Read the instruction word for the given address
+    # in the current cache row
+    def read_insn_word(self, addr, data):
+        word = addr[2:self.INSN_BITS+2]
+        return data.word_select(word, 32)
+
+    # Get the tag value from the address
+    def get_tag(self, addr):
+        return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
+
+    # Read a tag from a tag memory row
+    def read_tag(self, way, tagset):
+        return tagset.word_select(way, self.TAG_BITS)
+
+    # Write a tag to tag memory row
+    def write_tag(self, way, tagset, tag):
+        return self.read_tag(way, tagset).eq(tag)
+
+    # Simple hash for direct-mapped TLB index
+    def hash_ea(self, addr):
+        hsh = (addr[self.TLB_LG_PGSZ:self.TLB_LG_PGSZ + self.TL_BITS] ^
+               addr[self.TLB_LG_PGSZ + self.TL_BITS:
+                    self.TLB_LG_PGSZ + 2 * self.TL_BITS ] ^
+               addr[self.TLB_LG_PGSZ + 2 * self.TL_BITS:
+                    self.TLB_LG_PGSZ + 3 * self.TL_BITS])
+        return hsh
 
 
 # Cache reload state machine
@@ -280,10 +284,10 @@ class State(Enum):
 
 
 class RegInternal(RecordObject):
-    def __init__(self):
+    def __init__(self, cfg):
         super().__init__()
         # Cache hit state (Latches for 1 cycle BRAM access)
-        self.hit_way      = Signal(NUM_WAYS)
+        self.hit_way      = Signal(cfg.WAY_BITS)
         self.hit_nia      = Signal(64)
         self.hit_smark    = Signal()
         self.hit_valid    = Signal()
@@ -292,21 +296,22 @@ class RegInternal(RecordObject):
         self.state        = Signal(State, reset=State.IDLE)
         self.wb           = WBMasterOut("wb")
         self.req_adr      = Signal(64)
-        self.store_way    = Signal(NUM_WAYS)
-        self.store_index  = Signal(NUM_LINES)
-        self.store_row    = Signal(BRAM_ROWS)
-        self.store_tag    = Signal(TAG_BITS)
+        self.store_way    = Signal(cfg.WAY_BITS)
+        self.store_index  = Signal(cfg.INDEX_BITS)
+        self.store_row    = Signal(cfg.ROW_BITS)
+        self.store_tag    = Signal(cfg.TAG_BITS)
         self.store_valid  = Signal()
-        self.end_row_ix   = Signal(ROW_LINE_BITS)
-        self.rows_valid   = RowPerLineValidArray()
+        self.end_row_ix   = Signal(cfg.ROW_LINE_BITS)
+        self.rows_valid   = cfg.RowPerLineValidArray()
 
         # TLB miss state
         self.fetch_failed = Signal()
 
 
-class ICache(Elaboratable):
+class ICache(FetchUnitInterface, Elaboratable, ICacheConfig):
     """64 bit direct mapped icache. All instructions are 4B aligned."""
-    def __init__(self):
+    def __init__(self, pspec):
+        FetchUnitInterface.__init__(self, pspec)
         self.i_in           = Fetch1ToICacheType(name="i_in")
         self.i_out          = ICacheToDecode1Type(name="i_out")
 
@@ -317,11 +322,52 @@ class ICache(Elaboratable):
         self.flush_in       = Signal()
         self.inval_in       = Signal()
 
-        self.wb_out         = WBMasterOut(name="wb_out")
-        self.wb_in          = WBSlaveOut(name="wb_in")
+        # standard naming (wired to non-standard for compatibility)
+        self.bus = Interface(addr_width=32,
+                            data_width=64,
+                            granularity=8,
+                            features={'stall'},
+                            #alignment=0,
+                            name="icache_wb")
 
         self.log_out        = Signal(54)
 
+        # use FetchUnitInterface, helps keep some unit tests running
+        self.use_fetch_iface = False
+
+        # test if small cache to be enabled
+        self.small_cache = (hasattr(pspec, "small_cache") and
+                                 (pspec.small_cache == True))
+        # test if microwatt compatibility to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+        # test if fabric compatibility is to be enabled
+        self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+                                 (pspec.fabric_compat == True))
+
+        XLEN = pspec.XLEN
+        LINE_SIZE = 64
+        TLB_SIZE = 8
+        NUM_LINES = 8
+        NUM_WAYS = 2
+        if self.small_cache:
+            # reduce way sizes and num lines to ridiculously small
+            NUM_LINES = 2
+            NUM_WAYS = 1
+            TLB_SIZE = 2
+        if self.microwatt_compat or self.fabric_compat:
+            # reduce way sizes
+            NUM_WAYS = 1
+
+        ICacheConfig.__init__(self, LINE_SIZE=LINE_SIZE,
+                                    XLEN=XLEN,
+                                    NUM_LINES = NUM_LINES,
+                                    NUM_WAYS = NUM_WAYS,
+                                    TLB_SIZE=TLB_SIZE
+                             )
+
+    def use_fetch_interface(self):
+        self.use_fetch_iface = True
 
     # Generate a cache RAM for each way
     def rams(self, m, r, cache_out_row, use_previous,
@@ -330,93 +376,100 @@ class ICache(Elaboratable):
         comb = m.d.comb
         sync = m.d.sync
 
-        wb_in, stall_in = self.wb_in, self.stall_in
+        bus, stall_in = self.bus, self.stall_in
+
+        # read condition (for every cache ram)
+        do_read  = Signal()
+        comb += do_read.eq(~(stall_in | use_previous))
+
+        rd_addr  = Signal(self.ROW_BITS)
+        wr_addr  = Signal(self.ROW_BITS)
+        comb += rd_addr.eq(req_row)
+        comb += wr_addr.eq(r.store_row)
 
-        for i in range(NUM_WAYS):
-            do_read  = Signal(name="do_rd_%d" % i)
+        # binary-to-unary converters: replace-way enabled by bus.ack,
+        # hit-way left permanently enabled
+        m.submodules.replace_way_e = re = Decoder(self.NUM_WAYS)
+        m.submodules.hit_way_e = he = Decoder(self.NUM_WAYS)
+        comb += re.i.eq(replace_way)
+        comb += re.n.eq(~bus.ack)
+        comb += he.i.eq(r.hit_way)
+
+        for i in range(self.NUM_WAYS):
             do_write = Signal(name="do_wr_%d" % i)
-            rd_addr  = Signal(ROW_BITS)
-            wr_addr  = Signal(ROW_BITS)
-            d_out    = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
-            wr_sel   = Signal(ROW_SIZE)
+            d_out    = Signal(self.ROW_SIZE_BITS, name="d_out_%d" % i)
+            wr_sel   = Signal(self.ROW_SIZE, name="wr_sel_%d" % i)
 
-            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
-            setattr(m.submodules, "cacheram_%d" % i, way)
+            way = CacheRam(self.ROW_BITS, self.ROW_SIZE_BITS,
+                           TRACE=True, ram_num=i)
+            m.submodules["cacheram_%d" % i] =  way
 
             comb += way.rd_en.eq(do_read)
             comb += way.rd_addr.eq(rd_addr)
             comb += d_out.eq(way.rd_data_o)
             comb += way.wr_sel.eq(wr_sel)
             comb += way.wr_addr.eq(wr_addr)
-            comb += way.wr_data.eq(wb_in.dat)
+            comb += way.wr_data.eq(bus.dat_r)
 
-            comb += do_read.eq(~(stall_in | use_previous))
-            comb += do_write.eq(wb_in.ack & (replace_way == i))
+            comb += do_write.eq(re.o[i])
 
             with m.If(do_write):
                 sync += Display("cache write adr: %x data: %lx",
                                 wr_addr, way.wr_data)
 
-            with m.If(r.hit_way == i):
+            with m.If(he.o[i]):
                 comb += cache_out_row.eq(d_out)
                 with m.If(do_read):
                     sync += Display("cache read adr: %x data: %x",
                                      req_row, d_out)
 
-            comb += rd_addr.eq(req_row)
-            comb += wr_addr.eq(r.store_row)
-            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
+            comb += wr_sel.eq(Repl(do_write, self.ROW_SIZE))
 
     # Generate PLRUs
     def maybe_plrus(self, m, r, plru_victim):
         comb = m.d.comb
 
-        with m.If(NUM_WAYS > 1):
-            for i in range(NUM_LINES):
-                plru_acc_i  = Signal(WAY_BITS)
-                plru_acc_en = Signal()
-                plru        = PLRU(WAY_BITS)
-                setattr(m.submodules, "plru_%d" % i, plru)
-
-                comb += plru.acc_i.eq(plru_acc_i)
-                comb += plru.acc_en.eq(plru_acc_en)
+        if self.NUM_WAYS == 0:
+            return
 
-                # PLRU interface
-                with m.If(get_index(r.hit_nia) == i):
-                    comb += plru.acc_en.eq(r.hit_valid)
 
-                comb += plru.acc_i.eq(r.hit_way)
-                comb += plru_victim[i].eq(plru.lru_o)
+        m.submodules.plrus = plru = PLRUs("itag", self.NUM_LINES,
+                                                  self.WAY_BITS)
+        comb += plru.way.eq(r.hit_way)
+        comb += plru.valid.eq(r.hit_valid)
+        comb += plru.index.eq(self.get_index(r.hit_nia))
+        comb += plru.isel.eq(r.store_index) # select victim
+        comb += plru_victim.eq(plru.o_index) # selected victim
 
     # TLB hit detection and real address generation
-    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
-                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
+    def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
+                    real_addr, ra_valid, eaa_priv,
                     priv_fault, access_ok):
 
         comb = m.d.comb
 
         i_in = self.i_in
 
-        pte  = Signal(TLB_PTE_BITS)
-        ttag = Signal(TLB_EA_TAG_BITS)
+        # use an *asynchronous* Memory read port here (combinatorial)
+        m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
+        tlb = self.TLBRecord("tlb_rdport")
+        pte, ttag = tlb.pte, tlb.tag
 
-        comb += tlb_req_index.eq(hash_ea(i_in.nia))
-        comb += pte.eq(itlb_ptes[tlb_req_index])
-        comb += ttag.eq(itlb_tags[tlb_req_index])
+        comb += tlb_req_index.eq(self.hash_ea(i_in.nia))
+        comb += rd_tlb.addr.eq(tlb_req_index)
+        comb += tlb.eq(rd_tlb.data)
 
         with m.If(i_in.virt_mode):
-            comb += real_addr.eq(Cat(
-                     i_in.nia[:TLB_LG_PGSZ],
-                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
-                    ))
+            comb += real_addr.eq(Cat(i_in.nia[:self.TLB_LG_PGSZ],
+                                     pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))
 
-            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
-                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
+            with m.If(ttag == i_in.nia[self.TLB_LG_PGSZ + self.TL_BITS:64]):
+                comb += ra_valid.eq(itlb_valid.q.bit_select(tlb_req_index, 1))
 
             comb += eaa_priv.eq(pte[3])
 
         with m.Else():
-            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
+            comb += real_addr.eq(i_in.nia[:self.REAL_ADDR_BITS])
             comb += ra_valid.eq(1)
             comb += eaa_priv.eq(1)
 
@@ -425,85 +478,101 @@ class ICache(Elaboratable):
         comb += access_ok.eq(ra_valid & ~priv_fault)
 
     # iTLB update
-    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
+    def itlb_update(self, m, itlb, itlb_valid):
         comb = m.d.comb
         sync = m.d.sync
 
         m_in = self.m_in
 
-        wr_index = Signal(TLB_SIZE)
-        comb += wr_index.eq(hash_ea(m_in.addr))
+        wr_index = Signal(self.TL_BITS)
+        wr_unary = Signal(self.TLB_SIZE)
+        comb += wr_index.eq(self.hash_ea(m_in.addr))
+        comb += wr_unary.eq(1<<wr_index)
+
+        m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()
+        sync += itlb_valid.s.eq(0)
+        sync += itlb_valid.r.eq(0)
 
         with m.If(m_in.tlbie & m_in.doall):
             # Clear all valid bits
-            for i in range(TLB_SIZE):
-                sync += itlb_valid_bits[i].eq(0)
+            sync += itlb_valid.r.eq(-1)
 
         with m.Elif(m_in.tlbie):
             # Clear entry regardless of hit or miss
-            sync += itlb_valid_bits[wr_index].eq(0)
+            sync += itlb_valid.r.eq(wr_unary)
 
         with m.Elif(m_in.tlbld):
-            sync += itlb_tags[wr_index].eq(
-                     m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
-                    )
-            sync += itlb_ptes[wr_index].eq(m_in.pte)
-            sync += itlb_valid_bits[wr_index].eq(1)
+            tlb = self.TLBRecord("tlb_wrport")
+            comb += tlb.tag.eq(m_in.addr[self.TLB_LG_PGSZ + self.TL_BITS:64])
+            comb += tlb.pte.eq(m_in.pte)
+            comb += wr_tlb.en.eq(1)
+            comb += wr_tlb.addr.eq(wr_index)
+            comb += wr_tlb.data.eq(tlb)
+            sync += itlb_valid.s.eq(wr_unary)
 
     # Cache hit detection, output to fetch2 and other misc logic
     def icache_comb(self, m, use_previous, r, req_index, req_row,
                     req_hit_way, req_tag, real_addr, req_laddr,
-                    cache_valid_bits, cache_tags, access_ok,
+                    cache_valids, access_ok,
                     req_is_hit, req_is_miss, replace_way,
                     plru_victim, cache_out_row):
 
         comb = m.d.comb
+        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")
 
-        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
+        i_in, i_out, bus = self.i_in, self.i_out, self.bus
         flush_in, stall_out = self.flush_in, self.stall_out
 
         is_hit  = Signal()
-        hit_way = Signal(NUM_WAYS)
+        hit_way = Signal(self.WAY_BITS)
 
         # i_in.sequential means that i_in.nia this cycle is 4 more than
         # last cycle.  If we read more than 32 bits at a time, had a
         # cache hit last cycle, and we don't want the first 32-bit chunk
         # then we can keep the data we read last cycle and just use that.
-        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
+        with m.If(i_in.nia[2:self.INSN_BITS+2] != 0):
             comb += use_previous.eq(i_in.sequential & r.hit_valid)
 
         # Extract line, row and tag from request
-        comb += req_index.eq(get_index(i_in.nia))
-        comb += req_row.eq(get_row(i_in.nia))
-        comb += req_tag.eq(get_tag(real_addr))
+        comb += req_index.eq(self.get_index(i_in.nia))
+        comb += req_row.eq(self.get_row(i_in.nia))
+        comb += req_tag.eq(self.get_tag(real_addr))
 
         # Calculate address of beginning of cache row, will be
         # used for cache miss processing if needed
         comb += req_laddr.eq(Cat(
-                 Const(0, ROW_OFF_BITS),
-                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
+                 Const(0, self.ROW_OFF_BITS),
+                 real_addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS],
                 ))
 
         # Test if pending request is a hit on any way
         hitcond = Signal()
-        comb += hitcond.eq((r.state == State.WAIT_ACK)
-                 & (req_index == r.store_index)
-                 & r.rows_valid[req_row % ROW_PER_LINE]
+        rowvalid = Signal()
+        comb += rowvalid.eq(r.rows_valid[req_row % self.ROW_PER_LINE])
+        comb += hitcond.eq((r.state == State.WAIT_ACK) &
+                            (req_index == r.store_index) &
+                             rowvalid
                 )
-        with m.If(i_in.req):
-            cvb = Signal(NUM_WAYS)
-            ctag = Signal(TAG_RAM_WIDTH)
-            comb += ctag.eq(cache_tags[req_index])
-            comb += cvb.eq(cache_valid_bits[req_index])
-            for i in range(NUM_WAYS):
-                tagi = Signal(TAG_BITS, name="tag_i%d" % i)
-                comb += tagi.eq(read_tag(i, ctag))
-                hit_test = Signal(name="hit_test%d" % i)
-                comb += hit_test.eq(i == r.store_way)
-                with m.If((cvb[i] | (hitcond & hit_test))
-                          & (tagi == req_tag)):
-                    comb += hit_way.eq(i)
-                    comb += is_hit.eq(1)
+        # i_in.req asserts Decoder active
+        cvb = Signal(self.NUM_WAYS)
+        ctag = Signal(self.TAG_RAM_WIDTH)
+        comb += rd_tag.addr.eq(req_index)
+        comb += ctag.eq(rd_tag.data)
+        comb += cvb.eq(cache_valids.q.word_select(req_index, self.NUM_WAYS))
+        m.submodules.store_way_e = se = Decoder(self.NUM_WAYS)
+        comb += se.i.eq(r.store_way)
+        comb += se.n.eq(~i_in.req)
+        for i in range(self.NUM_WAYS):
+            tagi = Signal(self.TAG_BITS, name="tag_i%d" % i)
+            hit_test = Signal(name="hit_test%d" % i)
+            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
+            comb += tagi.eq(self.read_tag(i, ctag))
+            comb += hit_test.eq(se.o[i])
+            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
+                                  (tagi == req_tag))
+            with m.If(is_tag_hit):
+                comb += hit_way.eq(i)
+                comb += is_hit.eq(1)
 
         # Generate the "hit" and "miss" signals
         # for the synchronous blocks
@@ -511,15 +580,11 @@ class ICache(Elaboratable):
             comb += req_is_hit.eq(is_hit)
             comb += req_is_miss.eq(~is_hit)
 
-        with m.Else():
-            comb += req_is_hit.eq(0)
-            comb += req_is_miss.eq(0)
-
         comb += req_hit_way.eq(hit_way)
 
         # The way to replace on a miss
         with m.If(r.state == State.CLR_TAG):
-            comb += replace_way.eq(plru_victim[r.store_index])
+            comb += replace_way.eq(plru_victim)
         with m.Else():
             comb += replace_way.eq(r.store_way)
 
@@ -531,7 +596,7 @@ class ICache(Elaboratable):
         # be output an entire row which I prefer not to do just yet
         # as it would force fetch2 to know about some of the cache
         # geometry information.
-        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
+        comb += i_out.insn.eq(self.read_insn_word(r.hit_nia, cache_out_row))
         comb += i_out.valid.eq(r.hit_valid)
         comb += i_out.nia.eq(r.hit_nia)
         comb += i_out.stop_mark.eq(r.hit_smark)
@@ -542,7 +607,12 @@ class ICache(Elaboratable):
         comb += stall_out.eq(~(is_hit & access_ok))
 
         # Wishbone requests output (from the cache miss reload machine)
-        comb += wb_out.eq(r.wb)
+        comb += bus.we.eq(r.wb.we)
+        comb += bus.adr.eq(r.wb.adr)
+        comb += bus.sel.eq(r.wb.sel)
+        comb += bus.stb.eq(r.wb.stb)
+        comb += bus.dat_w.eq(r.wb.dat)
+        comb += bus.cyc.eq(r.wb.cyc)
 
     # Cache hit synchronous machine
     def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
@@ -567,14 +637,10 @@ class ICache(Elaboratable):
 
             with m.If(req_is_hit):
                 sync += r.hit_way.eq(req_hit_way)
-                sync += Display(
-                         "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x " \
-                         "way:%x RA:%x", i_in.nia, i_in.virt_mode, \
-                         i_in.stop_mark, req_index, req_tag, \
-                         req_hit_way, real_addr
-                        )
-
-
+                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
+                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
+                                 i_in.stop_mark, req_index, req_tag,
+                                 req_hit_way, real_addr)
 
         with m.If(~stall_in):
             # Send stop marks and NIA down regardless of validity
@@ -589,7 +655,7 @@ class ICache(Elaboratable):
         i_in = self.i_in
 
         # Reset per-row valid flags, only used in WAIT_ACK
-        for i in range(ROW_PER_LINE):
+        for i in range(self.ROW_PER_LINE):
             sync += r.rows_valid[i].eq(0)
 
         # We need to read a cache line
@@ -598,17 +664,16 @@ class ICache(Elaboratable):
                      "cache miss nia:%x IR:%x SM:%x idx:%x "
                      " way:%x tag:%x RA:%x", i_in.nia,
                      i_in.virt_mode, i_in.stop_mark, req_index,
-                     replace_way, req_tag, real_addr
-                    )
+                     replace_way, req_tag, real_addr)
 
             # Keep track of our index and way for subsequent stores
-            st_row = Signal(BRAM_ROWS)
-            comb += st_row.eq(get_row(req_laddr))
+            st_row = Signal(self.ROW_BITS)
+            comb += st_row.eq(self.get_row(req_laddr))
             sync += r.store_index.eq(req_index)
             sync += r.store_row.eq(st_row)
             sync += r.store_tag.eq(req_tag)
             sync += r.store_valid.eq(1)
-            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
+            sync += r.end_row_ix.eq(self.get_row_of_line(st_row) - 1)
 
             # Prep for first wishbone read.  We calculate the address
             # of the start of the cache line and start the WB cycle.
@@ -620,144 +685,113 @@ class ICache(Elaboratable):
             sync += r.state.eq(State.CLR_TAG)
 
     def icache_miss_clr_tag(self, m, r, replace_way,
-                            cache_valid_bits, req_index,
-                            tagset, cache_tags):
-
+                            req_index,
+                            cache_valids):
         comb = m.d.comb
         sync = m.d.sync
+        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
+                                                    granularity=self.TAG_BITS)
 
         # Get victim way from plru
         sync += r.store_way.eq(replace_way)
+
         # Force misses on that way while reloading that line
-        cv = Signal(INDEX_BITS)
-        comb += cv.eq(cache_valid_bits[req_index])
-        comb += cv.bit_select(replace_way, 1).eq(0)
-        sync += cache_valid_bits[req_index].eq(cv)
+        idx = req_index*self.NUM_WAYS + replace_way # 2D index, 1st dim: self.NUM_WAYS
+        comb += cache_valids.r.eq(1<<idx)
 
-        for i in range(NUM_WAYS):
-            with m.If(i == replace_way):
-                comb += tagset.eq(cache_tags[r.store_index])
-                comb += write_tag(i, tagset, r.store_tag)
-                sync += cache_tags[r.store_index].eq(tagset)
+        # use write-port "granularity" to select the tag to write to
+        # TODO: the Memory should be multipled-up (by NUM_TAGS)
+        tagset = Signal(self.TAG_RAM_WIDTH)
+        comb += tagset.eq(r.store_tag << (replace_way*self.TAG_BITS))
+        comb += wr_tag.en.eq(1<<replace_way)
+        comb += wr_tag.addr.eq(r.store_index)
+        comb += wr_tag.data.eq(tagset)
 
         sync += r.state.eq(State.WAIT_ACK)
 
     def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
-                             stbs_done, cache_valid_bits):
+                             cache_valids):
         comb = m.d.comb
         sync = m.d.sync
 
-        wb_in = self.wb_in
-
-        # Requests are all sent if stb is 0
-        stbs_zero = Signal()
-        comb += stbs_zero.eq(r.wb.stb == 0)
-        comb += stbs_done.eq(stbs_zero)
+        bus = self.bus
 
         # If we are still sending requests, was one accepted?
-        with m.If(~wb_in.stall & ~stbs_zero):
-            # That was the last word? We are done sending.
-            # Clear stb and set stbs_done so we can handle
-            # an eventual last ack on the same cycle.
-            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
-                sync += Display(
-                         "IS_LAST_ROW_ADDR r.wb.addr:%x " \
-                         "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x " \
-                         "stbs_done:%x", r.wb.adr, r.end_row_ix,
-                         r.wb.stb, stbs_zero, stbs_done
-                        )
+        with m.If(~bus.stall & r.wb.stb):
+            # That was the last word? We are done sending.  Clear stb
+            with m.If(self.is_last_row_addr(r.req_adr, r.end_row_ix)):
+                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
+                         "r.end_row_ix:%x r.wb.stb:%x",
+                         r.wb.adr, r.end_row_ix, r.wb.stb)
                 sync += r.wb.stb.eq(0)
-                comb += stbs_done.eq(1)
 
             # Calculate the next row address
-            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
-            comb += rarange.eq(
-                     r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
-                    )
-            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
-                     rarange
-                    )
+            rarange = Signal(self.LINE_OFF_BITS - self.ROW_OFF_BITS)
+            comb += rarange.eq(r.req_adr[self.ROW_OFF_BITS:
+                                         self.LINE_OFF_BITS] + 1)
+            sync += r.req_adr[self.ROW_OFF_BITS:self.LINE_OFF_BITS].eq(rarange)
             sync += Display("RARANGE r.req_adr:%x rarange:%x "
-                            "stbs_zero:%x stbs_done:%x",
-                            r.req_adr, rarange, stbs_zero, stbs_done)
+                            "r.wb.stb:%x",
+                            r.req_adr, rarange, r.wb.stb)
 
         # Incoming acks processing
-        with m.If(wb_in.ack):
-            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
-                            "stbs_done:%x",
-                            wb_in.dat, stbs_zero, stbs_done)
+        with m.If(bus.ack):
+            sync += Display("WB_IN_ACK data:%x", bus.dat_r)
 
-            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
+            sync += r.rows_valid[r.store_row % self.ROW_PER_LINE].eq(1)
 
             # Check for completion
-            with m.If(stbs_done &
-                      is_last_row(r.store_row, r.end_row_ix)):
+            with m.If(self.is_last_row(r.store_row, r.end_row_ix)):
                 # Complete wishbone cycle
                 sync += r.wb.cyc.eq(0)
                 # be nice, clear addr
                 sync += r.req_adr.eq(0)
 
                 # Cache line is now valid
-                cv = Signal(INDEX_BITS)
-                comb += cv.eq(cache_valid_bits[r.store_index])
-                comb += cv.bit_select(replace_way, 1).eq(
-                         r.store_valid & ~inval_in
-                        )
-                sync += cache_valid_bits[r.store_index].eq(cv)
-
+                idx = r.store_index*self.NUM_WAYS + replace_way # 2D index again
+                valid = r.store_valid & ~inval_in
+                comb += cache_valids.s.eq(1<<idx)
                 sync += r.state.eq(State.IDLE)
 
-            # not completed, move on to next request in row
-            with m.Else():
-                # Increment store row counter
-                sync += r.store_row.eq(next_row(r.store_row))
-
+            # move on to next request in row
+            # Increment store row counter
+            sync += r.store_row.eq(self.next_row(r.store_row))
 
     # Cache miss/reload synchronous machine
-    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
+    def icache_miss(self, m, r, req_is_miss,
                     req_index, req_laddr, req_tag, replace_way,
-                    cache_tags, access_ok, real_addr):
+                    cache_valids, access_ok, real_addr):
         comb = m.d.comb
         sync = m.d.sync
 
-        i_in, wb_in, m_in  = self.i_in, self.wb_in, self.m_in
+        i_in, bus, m_in  = self.i_in, self.bus, self.m_in
         stall_in, flush_in = self.stall_in, self.flush_in
         inval_in           = self.inval_in
 
-        tagset    = Signal(TAG_RAM_WIDTH)
-        stbs_done = Signal()
-
         comb += r.wb.sel.eq(-1)
         comb += r.wb.adr.eq(r.req_adr[3:])
 
         # Process cache invalidations
         with m.If(inval_in):
-            for i in range(NUM_LINES):
-                sync += cache_valid_bits[i].eq(0)
+            comb += cache_valids.r.eq(-1)
             sync += r.store_valid.eq(0)
 
         # Main state machine
         with m.Switch(r.state):
 
             with m.Case(State.IDLE):
-                self.icache_miss_idle(
-                    m, r, req_is_miss, req_laddr,
-                    req_index, req_tag, replace_way,
-                    real_addr
-                )
+                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
+                                      req_index, req_tag, replace_way,
+                                      real_addr)
 
             with m.Case(State.CLR_TAG, State.WAIT_ACK):
                 with m.If(r.state == State.CLR_TAG):
-                    self.icache_miss_clr_tag(
-                        m, r, replace_way,
-                        cache_valid_bits, req_index,
-                        tagset, cache_tags
-                    )
-
-                self.icache_miss_wait_ack(
-                    m, r, replace_way, inval_in,
-                    stbs_done, cache_valid_bits
-                )
+                    self.icache_miss_clr_tag(m, r, replace_way,
+                                             req_index,
+                                             cache_valids)
+
+                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
+                                          cache_valids)
 
         # TLB miss and protection fault processing
         with m.If(flush_in | m_in.tlbld):
@@ -771,13 +805,13 @@ class ICache(Elaboratable):
         comb = m.d.comb
         sync = m.d.sync
 
-        wb_in, i_out       = self.wb_in, self.i_out
+        bus, i_out       = self.bus, self.i_out
         log_out, stall_out = self.log_out, self.stall_out
 
         # Output data to logger
         for i in range(LOG_LENGTH):
             log_data = Signal(54)
-            lway     = Signal(NUM_WAYS)
+            lway     = Signal(self.WAY_BITS)
             wstate   = Signal()
 
             sync += lway.eq(req_hit_way)
@@ -789,8 +823,8 @@ class ICache(Elaboratable):
             sync += log_data.eq(Cat(
                      ra_valid, access_ok, req_is_miss, req_is_hit,
                      lway, wstate, r.hit_nia[2:6], r.fetch_failed,
-                     stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
-                     r.real_addr[3:6], wb_in.ack, i_out.insn, i_out.valid
+                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
+                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                     ))
             comb += log_out.eq(log_data)
 
@@ -799,13 +833,17 @@ class ICache(Elaboratable):
         m                = Module()
         comb             = m.d.comb
 
-        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
-        cache_tags       = CacheTagArray()
-        cache_valid_bits = CacheValidBitsArray()
+        # Cache-Ways "valid" indicators.  this is a 2D Signal, by the
+        # number of ways and the number of lines.
+        vec = SRLatch(sync=True, llen=self.NUM_WAYS*self.NUM_LINES,
+                      name="cachevalids")
+        m.submodules.cache_valids = cache_valids = vec
+
+        # TLB Array
+        itlb            = self.TLBArray()
+        vec = SRLatch(sync=False, llen=self.TLB_SIZE, name="tlbvalids")
+        m.submodules.itlb_valids = itlb_valid = vec
 
-        itlb_valid_bits  = TLBValidBitsArray()
-        itlb_tags        = TLBTagArray()
-        itlb_ptes        = TLBPtesArray()
         # TODO to be passed to nmigen as ram attributes
         # attribute ram_style of itlb_tags : signal is "distributed";
         # attribute ram_style of itlb_ptes : signal is "distributed";
@@ -813,62 +851,106 @@ class ICache(Elaboratable):
         # Privilege bit from PTE EAA field
         eaa_priv         = Signal()
 
-        r                = RegInternal()
+        r                = RegInternal(self)
 
         # Async signal on incoming request
-        req_index        = Signal(NUM_LINES)
-        req_row          = Signal(BRAM_ROWS)
-        req_hit_way      = Signal(NUM_WAYS)
-        req_tag          = Signal(TAG_BITS)
+        req_index        = Signal(self.INDEX_BITS)
+        req_row          = Signal(self.ROW_BITS)
+        req_hit_way      = Signal(self.WAY_BITS)
+        req_tag          = Signal(self.TAG_BITS)
         req_is_hit       = Signal()
         req_is_miss      = Signal()
         req_laddr        = Signal(64)
 
-        tlb_req_index    = Signal(TLB_SIZE)
-        real_addr        = Signal(REAL_ADDR_BITS)
+        tlb_req_index    = Signal(self.TL_BITS)
+        real_addr        = Signal(self.REAL_ADDR_BITS)
         ra_valid         = Signal()
         priv_fault       = Signal()
         access_ok        = Signal()
         use_previous     = Signal()
 
-        cache_out_row    = Signal(ROW_SIZE_BITS)
+        cache_out_row    = Signal(self.ROW_SIZE_BITS)
+
+        plru_victim      = Signal(self.WAY_BITS)
+        replace_way      = Signal(self.WAY_BITS)
 
-        plru_victim      = PLRUOut()
-        replace_way      = Signal(NUM_WAYS)
+        self.tlbmem = Memory(depth=self.TLB_SIZE,
+                             width=self.TLB_EA_TAG_BITS+self.TLB_PTE_BITS,
+                             #attrs={'syn_ramstyle': "block_ram"}
+                            )
+        self.tagmem = Memory(depth=self.NUM_LINES,
+                             width=self.TAG_RAM_WIDTH,
+                             #attrs={'syn_ramstyle': "block_ram"}
+                            )
 
         # call sub-functions putting everything together,
         # using shared signals established above
         self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
         self.maybe_plrus(m, r, plru_victim)
-        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
-                         itlb_valid_bits, ra_valid, eaa_priv, priv_fault,
+        self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
+                         ra_valid, eaa_priv, priv_fault,
                          access_ok)
-        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
+        self.itlb_update(m, itlb, itlb_valid)
         self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
-                         req_tag, real_addr, req_laddr, cache_valid_bits,
-                         cache_tags, access_ok, req_is_hit, req_is_miss,
+                         req_tag, real_addr, req_laddr,
+                         cache_valids,
+                         access_ok, req_is_hit, req_is_miss,
                          replace_way, plru_victim, cache_out_row)
         self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                         req_index, req_tag, real_addr)
-        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
-                         req_laddr, req_tag, replace_way, cache_tags,
+        self.icache_miss(m, r, req_is_miss, req_index,
+                         req_laddr, req_tag, replace_way,
+                         cache_valids,
                          access_ok, real_addr)
         #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
         #                req_is_miss, req_is_hit, lway, wstate, r)
 
+        # don't connect up to FetchUnitInterface so that some unit tests
+        # can continue to operate
+        if not self.use_fetch_iface:
+            return m
+
+        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
+        # so needs checking and iterative revising
+        i_in, bus, i_out = self.i_in, self.bus, self.i_out
+        comb += i_in.req.eq(self.a_i_valid)
+        comb += i_in.nia.eq(self.a_pc_i)
+        comb += self.stall_in.eq(self.a_stall_i)
+        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
+        comb += self.f_badaddr_o.eq(i_out.nia)
+        comb += self.f_instr_o.eq(i_out.insn)
+        comb += self.f_busy_o.eq(~i_out.valid) # probably
+
+        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
+        ibus = self.ibus
+        comb += ibus.adr.eq(self.bus.adr)
+        comb += ibus.dat_w.eq(self.bus.dat_w)
+        comb += ibus.sel.eq(self.bus.sel)
+        comb += ibus.cyc.eq(self.bus.cyc)
+        comb += ibus.stb.eq(self.bus.stb)
+        comb += ibus.we.eq(self.bus.we)
+
+        comb += self.bus.dat_r.eq(ibus.dat_r)
+        comb += self.bus.ack.eq(ibus.ack)
+        if hasattr(ibus, "stall"):
+            comb += self.bus.stall.eq(ibus.stall)
+        else:
+            # fake-up the wishbone stall signal to comply with pipeline mode
+            # same thing is done in dcache.py
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
         return m
 
 
 def icache_sim(dut):
-    i_out = dut.i_in
-    i_in  = dut.i_out
+    i_in = dut.i_in
+    i_out  = dut.i_out
     m_out = dut.m_in
 
-    yield i_in.valid.eq(0)
-    yield i_out.priv_mode.eq(1)
-    yield i_out.req.eq(0)
-    yield i_out.nia.eq(0)
-    yield i_out.stop_mark.eq(0)
+    yield i_in.priv_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(0)
+    yield i_in.stop_mark.eq(0)
     yield m_out.tlbld.eq(0)
     yield m_out.tlbie.eq(0)
     yield m_out.addr.eq(0)
@@ -877,107 +959,126 @@ def icache_sim(dut):
     yield
     yield
     yield
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000004, 64))
-    for i in range(30):
-        yield
+
+    # miss, stalls for a bit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000004, 64))
     yield
-    valid = yield i_in.valid
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    insn  = yield i_out.insn
     nia   = yield i_out.nia
-    insn  = yield i_in.insn
-    print(f"valid? {valid}")
-    assert valid
     assert insn == 0x00000001, \
         "insn @%x=%x expected 00000001" % (nia, insn)
-    yield i_out.req.eq(0)
+    yield i_in.req.eq(0)
     yield
 
     # hit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000008, 64))
     yield
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
     yield
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000008, 64))
-    yield
-    yield
-    valid = yield i_in.valid
-    nia   = yield i_in.nia
-    insn  = yield i_in.insn
-    assert valid
     assert insn == 0x00000002, \
         "insn @%x=%x expected 00000002" % (nia, insn)
-    yield
 
     # another miss
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000040, 64))
-    for i in range(30):
-        yield
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000040, 64))
     yield
-    valid = yield i_in.valid
-    nia   = yield i_out.nia
-    insn  = yield i_in.insn
-    assert valid
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    nia   = yield i_in.nia
+    insn  = yield i_out.insn
     assert insn == 0x00000010, \
         "insn @%x=%x expected 00000010" % (nia, insn)
 
-    # test something that aliases
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000100, 64))
+    # test something that aliases (this only works because
+    # the unit test SRAM is a depth of 512)
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000100, 64))
     yield
     yield
-    valid = yield i_in.valid
+    valid = yield i_out.valid
     assert ~valid
     for i in range(30):
         yield
     yield
-    insn  = yield i_in.insn
-    valid = yield i_in.valid
-    insn  = yield i_in.insn
+    insn  = yield i_out.insn
+    valid = yield i_out.valid
+    insn  = yield i_out.insn
     assert valid
     assert insn == 0x00000040, \
          "insn @%x=%x expected 00000040" % (nia, insn)
-    yield i_out.req.eq(0)
-
+    yield i_in.req.eq(0)
 
 
 def test_icache(mem):
-     dut    = ICache()
-
-     memory = Memory(width=64, depth=512, init=mem)
-     sram   = SRAM(memory=memory, granularity=8)
-
-     m      = Module()
-
-     m.submodules.icache = dut
-     m.submodules.sram   = sram
-
-     m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
-     m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
-     m.d.comb += sram.bus.we.eq(dut.wb_out.we)
-     m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
-     m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
-     m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
-
-     m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
-     m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
-
-     # nmigen Simulation
-     sim = Simulator(m)
-     sim.add_clock(1e-6)
-
-     sim.add_sync_process(wrap(icache_sim(dut)))
-     with sim.write_vcd('test_icache.vcd'):
+    from soc.config.test.test_loadstore import TestMemPspec
+    pspec = TestMemPspec(addr_wid=32,
+                         mask_wid=8,
+                         reg_wid=64,
+                         XLEN=32,
+                         )
+    dut    = ICache(pspec)
+
+    memory = Memory(width=64, depth=512, init=mem)
+    sram   = SRAM(memory=memory, granularity=8)
+
+    m      = Module()
+
+    m.submodules.icache = dut
+    m.submodules.sram   = sram
+
+    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+    m.d.comb += sram.bus.we.eq(dut.bus.we)
+    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
+
+    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(icache_sim(dut)))
+    with sim.write_vcd('test_icache.vcd'):
          sim.run()
 
+
 if __name__ == '__main__':
-    dut = ICache()
+    from soc.config.test.test_loadstore import TestMemPspec
+    pspec = TestMemPspec(addr_wid=64,
+                         mask_wid=8,
+                         XLEN=32,
+                         reg_wid=64,
+                         )
+    dut = ICache(pspec)
     vl = rtlil.convert(dut, ports=[])
     with open("test_icache.il", "w") as f:
         f.write(vl)
 
+    # set up memory every 32-bits with incrementing values 0 1 2 ...
     mem = []
     for i in range(512):
         mem.append((i*2) | ((i*2+1)<<32))
 
     test_icache(mem)
-
index f1c895d0ff47d62110cd5d16cb716382a212fca9..42ef061072d6b6b1511fa9e16061286744b27153 100644 (file)
@@ -43,7 +43,7 @@ import unittest
 
 class L0CacheBuffer2(Elaboratable):
     """L0CacheBuffer2"""
-    def __init__(self, n_units=8, regwid=64, addrwid=48):
+    def __init__(self, n_units=8, regwid=64, addrwid=64):
         self.n_units = n_units
         self.regwid = regwid
         self.addrwid = addrwid
@@ -59,7 +59,7 @@ class L0CacheBuffer2(Elaboratable):
         # connect the ports as modules
 
         for i in range(self.n_units):
-            d = LDSTSplitter(64, 48, 4, self.dports[i])
+            d = LDSTSplitter(64, 64, 4, self.dports[i])
             setattr(m.submodules, "ldst_splitter%d" % i, d)
 
         # state-machine latches TODO
@@ -228,7 +228,7 @@ class L0CacheBuffer(Elaboratable):
     by this class.  That task is taken care of by LDSTCompUnit.
     """
 
-    def __init__(self, n_units, pimem, regwid=64, addrwid=48):
+    def __init__(self, n_units, pimem, regwid=64, addrwid=64):
         self.n_units = n_units
         self.pimem = pimem
         self.regwid = regwid
@@ -414,7 +414,7 @@ class TestL0Cache(unittest.TestCase):
     def test_l0_cache_test_bare_wb(self):
 
         pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
-                             addr_wid=48,
+                             addr_wid=64,
                              mask_wid=8,
                              reg_wid=64)
         dut = TstL0CacheBuffer(pspec)
@@ -428,7 +428,7 @@ class TestL0Cache(unittest.TestCase):
     def test_l0_cache_testpi(self):
 
         pspec = TestMemPspec(ldst_ifacetype='testpi',
-                             addr_wid=48,
+                             addr_wid=64,
                              mask_wid=8,
                              reg_wid=64)
         dut = TstL0CacheBuffer(pspec)
index aadffaaa22ddc0882e8a49eb86b3a12471ba148d..2176855d0efa2b4cf21beb0a709e34559158e893 100644 (file)
@@ -32,6 +32,42 @@ from soc.experiment.mem_types import (LoadStore1ToMMUType,
                                  DCacheToMMUType,
                                  MMUToICacheType)
 
+# Radix Tree Page Directory Entry Record, TODO put this somewhere sensible
+# v3.0C Book III p1015-1016 section 6.7.10.1
+class RTPDE(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.nls   = Signal(5)  # Nextded Access Auth bits 59:63 LSB0 0:4
+        self.rs1   = Signal(3)  # Reserved            bits 56:58 LSB0 5:7
+        self.nlb   = Signal(52) # Next Level Base     bit  4:55  LSB0 8:59
+        self.rs2   = Signal(2)  # Reserved            bit  2:3   LSB0 60:61
+        self.leaf  = Signal(1)  # leaf                bit  1     LSB0 62
+        self.valid = Signal(1)  # valid               bit  0     LSB0 63
+
+
+# Radix Tree Page Table Entry Record, TODO put this somewhere sensible
+# v3.0C Book III p1016 section 6.7.10.2
+class RTPTE(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.eaa   = Signal(4)  # Encoded Access Auth bits 60:63 LSB0 0:3
+        self.att   = Signal(2)  # Attributes          bits 58:59 LSB0 4:5
+        self.rs1   = Signal(1)  # Reserved            bit  57    LSB0 6
+        self.c     = Signal(1)  # Change              bit  56    LSB0 7
+        self.r     = Signal(1)  # Reference           bit  55    LSB0 8
+        self.sw    = Signal(3)  # SW bits 1:3         bits 52:54 LSB0 9:11
+        self.rpn   = Signal(45) # Real Page Number    bits 7:51  LSB0 12:56
+        self.rs2   = Signal(4)  # Reserved            bit  3:6   LSB0 57-60
+        self.sw0   = Signal(1)  # SW bit 0            bit  2     LSB0 61
+        self.leaf  = Signal(1)  # leaf                bit  1     LSB0 62
+        self.valid = Signal(1)  # valid               bit  0     LSB0 63
+
+# and these... which of course are turned round to LSB0 order.
+# TODO: sigh. use botchify and put them in openpower.consts
+EAA_PRIV = 3 # bit 0 (in MSB0) set ==> problem-state banned (priv=1 only)
+EAA_RD   = 2 # bit 1 (in MSB0) set ==> loads are permitted
+EAA_WR   = 1 # bit 2 (in MSB0) set ==> load and stores permitted
+EAA_EXE  = 0 # bit 3 (in MSB0) set ==> execute permitted
 
 # for debugging
 display_invalid = True
@@ -50,6 +86,19 @@ class State(Enum):
     RADIX_FINISH = 9
 
 
+# Process Table Record - near-identical to Page Table Record (same format)
+# v3.0C Book III Section 6.7.6.2 p1004
+class PRTBL(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.rpds  = Signal(5)  # Root Page Directory Size  59:63 LSB0 0:4
+        self.rts2  = Signal(3)  # Radix Tree Size part 2    56:58 LSB0 5:7
+        self.rpdb  = Signal(52) # Root Page Directory Base  4:55  LSB0 8:59
+        self.rsv2  = Signal(1)  # reserved                  3     LSB0 60
+        self.rts1  = Signal(2)  # Radix Tree Size part 1    1:2   LSB0 61:62
+        self.rsv1  = Signal(1)  # reserved                  0     LSB0 63
+
+
 class RegStage(RecordObject):
     def __init__(self, name=None):
         super().__init__(name=name)
@@ -60,17 +109,26 @@ class RegStage(RecordObject):
         self.priv = Signal()
         self.addr = Signal(64)
         self.inval_all = Signal()
+
         # config SPRs
         self.prtbl = Signal(64)
         self.pid = Signal(32)
+
         # internal state
         self.state = Signal(State) # resets to IDLE
         self.done = Signal()
         self.err = Signal()
+
+        # there are 4 quadrants (0-3): here we only support 2 (pt0 and pt3)
+        # these are bits 62-63 of any given address.
+        # except in segment_check, bit 62 is ignored
+        # Quadrant Select can be seen in v3.0C 6.7.10 p1015 book III figure 36
+        # and is further described in 6.7.11.3 p1019
         self.pgtbl0 = Signal(64)
         self.pt0_valid = Signal()
         self.pgtbl3 = Signal(64)
         self.pt3_valid = Signal()
+
         self.shift = Signal(6)
         self.mask_size = Signal(5)
         self.pgbase = Signal(56)
@@ -82,6 +140,20 @@ class RegStage(RecordObject):
         self.rc_error = Signal()
 
 
+# Page Table Record - note that HR bit is treated as part of rts below
+# (near-identical to Process Table Record - same format)
+# v3.0C Book III Section 6.7.6.1 p1003
+class PGTBL(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.rpds  = Signal(5)  # Root Page Directory Size  59:63 LSB0 0:4
+        self.rts2  = Signal(3)  # Radix Tree Size part 2    56:58 LSB0 5:7
+        self.rpdb  = Signal(52) # Root Page Directory Base  4:55  LSB0 8:59
+        self.s     = Signal(1)  # Host Secure               3     LSB0 60
+        self.rts1  = Signal(2)  # Radix Tree Size part 1    1:2   LSB0 61:62
+        self.hr    = Signal(1)  # Host Radix                0     LSB0 63
+
+
 class MMU(Elaboratable):
     """Radix MMU
 
@@ -90,41 +162,52 @@ class MMU(Elaboratable):
     (i.e. there is no gRA -> hRA translation).
     """
     def __init__(self):
-        self.l_in  = LoadStore1ToMMUType()
-        self.l_out = MMUToLoadStore1Type()
-        self.d_out = MMUToDCacheType()
-        self.d_in  = DCacheToMMUType()
-        self.i_out = MMUToICacheType()
+        self.l_in  = LoadStore1ToMMUType("l_in")
+        self.l_out = MMUToLoadStore1Type("l_out")
+        self.d_out = MMUToDCacheType("d_out")
+        self.d_in  = DCacheToMMUType("d_in")
+        self.i_out = MMUToICacheType("i_out")
 
     def radix_tree_idle(self, m, l_in, r, v):
+        """radix_tree_idle - the main decision-point.  valid actions include:
+        * LDST incoming TLBIE request (invalidate TLB entry)
+        * LDST incoming RADIX walk request
+        * set either PRTBL or PID SPRs (which then fires a TLB invalidate)
+        """
         comb = m.d.comb
         sync = m.d.sync
 
         pt_valid = Signal()
-        pgtbl = Signal(64)
+        pgtbl = PGTBL("pgtbl")
         rts = Signal(6)
-        mbits = Signal(6)
+        mbits = Signal(6, name="mbits_idle")
 
-        with m.If(~l_in.addr[63]):
-            comb += pgtbl.eq(r.pgtbl0)
-            comb += pt_valid.eq(r.pt0_valid)
-        with m.Else():
+        with m.If(l_in.addr[63]): # quadrant 3
             comb += pgtbl.eq(r.pgtbl3)
             comb += pt_valid.eq(r.pt3_valid)
+        with m.Else():
+            comb += pgtbl.eq(r.pgtbl0)
+            comb += pt_valid.eq(r.pt0_valid)
 
         # rts == radix tree size, number of address bits
-        # being translated
-        comb += rts.eq(Cat(pgtbl[5:8], pgtbl[61:63]))
+        # being translated.  takes bits 5:7 and 61:62
+        comb += rts.eq(Cat(pgtbl.rts2, pgtbl.rts1, C(0)))
 
         # mbits == number of address bits to index top
-        # level of tree
-        comb += mbits.eq(pgtbl[0:5])
+        # level of tree.  takes bits 0:4
+        comb += mbits.eq(pgtbl.rpds)
 
         # set v.shift to rts so that we can use finalmask
-        # for the segment check
+        # for the segment check.
+        # note: rpdb (52 bits long) is truncated to 48 bits
         comb += v.shift.eq(rts)
         comb += v.mask_size.eq(mbits[0:5])
-        comb += v.pgbase.eq(Cat(C(0, 8), pgtbl[8:56]))
+
+        # create the page base from root page directory base (48 bits with 8 0s)
+        comb += v.pgbase.eq(Cat(C(0, 8), pgtbl.rpdb[:48])) # bits 8:55
+
+        # request either TLB invalidate
+        # or start a RADIX walk
 
         with m.If(l_in.valid):
             comb += v.addr.eq(l_in.addr)
@@ -132,10 +215,10 @@ class MMU(Elaboratable):
             comb += v.store.eq(~(l_in.load | l_in.iside))
             comb += v.priv.eq(l_in.priv)
 
-            comb += Display("state %d l_in.valid addr %x iside %d store %d "
-                            "rts %x mbits %x pt_valid %d",
+            sync += Display("state %d l_in.valid addr %x iside %d store %d "
+                            "rpdb %x rts %d mbits %d pt_valid %d",
                             v.state, v.addr, v.iside, v.store,
-                            rts, mbits, pt_valid)
+                            pgtbl.rpdb, rts, mbits, pt_valid)
 
             with m.If(l_in.tlbie):
                 # Invalidate all iTLB/dTLB entries for
@@ -162,7 +245,9 @@ class MMU(Elaboratable):
                     # set v.shift so we can use finalmask
                     # for generating the process table
                     # entry address
-                    comb += v.shift.eq(r.prtbl[0:5])
+                    prtbl = PRTBL("prtbl")
+                    comb += prtbl.eq(r.prtbl)
+                    comb += v.shift.eq(prtbl.rpds)
                     comb += v.state.eq(State.PROC_TBL_READ)
 
                 with m.Elif(mbits == 0):
@@ -175,10 +260,13 @@ class MMU(Elaboratable):
                 with m.Else():
                     comb += v.state.eq(State.SEGMENT_CHECK)
 
+        # set either PID or PRTBL SPRs
+        # (then invalidate TLBs)
+
         with m.If(l_in.mtspr):
             # Move to PID needs to invalidate L1 TLBs
-            # and cached pgtbl0 value.  Move to PRTBL
-            # does that plus invalidating the cached
+            # and cached pgtbl0 value.
+            # Move to PRTBL does that plus invalidating the cached
             # pgtbl3 value as well.
             with m.If(~l_in.sprn[9]):
                 comb += v.pid.eq(l_in.rs[0:32])
@@ -192,83 +280,105 @@ class MMU(Elaboratable):
 
     def proc_tbl_wait(self, m, v, r, data):
         comb = m.d.comb
-        with m.If(r.addr[63]):
-            comb += v.pgtbl3.eq(data)
+        sync = m.d.sync
+        rts = Signal(6)
+        mbits = Signal(6, name="mbits_tbl_wait")
+        prtbl = PRTBL("prtblw")
+        comb += prtbl.eq(data)
+
+        with m.If(r.addr[63]): # top bit of quadrant selects pt3
+            comb += v.pgtbl3.eq(prtbl)
             comb += v.pt3_valid.eq(1)
         with m.Else():
-            comb += v.pgtbl0.eq(data)
+            comb += v.pgtbl0.eq(prtbl)
             comb += v.pt0_valid.eq(1)
 
-        rts = Signal(6)
-        mbits = Signal(6)
-
         # rts == radix tree size, # address bits being translated
-        comb += rts.eq(Cat(data[5:8], data[61:63]))
+        comb += rts.eq(Cat(prtbl.rts2, prtbl.rts1, C(0)))
 
         # mbits == # address bits to index top level of tree
-        comb += mbits.eq(data[0:5])
+        comb += mbits.eq(prtbl.rpds[0:5])
 
         # set v.shift to rts so that we can use finalmask for the segment check
         comb += v.shift.eq(rts)
         comb += v.mask_size.eq(mbits[0:5])
-        comb += v.pgbase.eq(Cat(C(0, 8), data[8:56]))
+
+        # create the page base from root page directory base (48 bits with 8 0s)
+        comb += v.pgbase.eq(Cat(C(0, 8), prtbl.rpdb[:48])) # bits 8:55
 
         with m.If(mbits):
             comb += v.state.eq(State.SEGMENT_CHECK)
+            sync += Display("PROC TBL %d data %x rts1 %x rts2 %x rts %d "
+                            "rpdb %x mbits %d pgbase %x "
+                            " pt0_valid %d, pt3_valid %d",
+                            v.state, data, prtbl.rts1, prtbl.rts2, rts,
+                            prtbl.rpdb, mbits, v.pgbase,
+                            v.pt0_valid, v.pt3_valid)
         with m.Else():
             comb += v.state.eq(State.RADIX_FINISH)
             comb += v.invalid.eq(1)
-            if(display_invalid): m.d.sync += Display("MMUBUG: mbits is invalid")
+            if (display_invalid): m.d.sync += Display("MMU: mbits is invalid")
 
     def radix_read_wait(self, m, v, r, d_in, data):
         comb = m.d.comb
         sync = m.d.sync
 
+        rpte = RTPTE(name="radix_rpte") # page-table (leaf) entry
+        rpde = RTPDE(name="radix_rpde") # page-directory (non-leaf) entry
+
         perm_ok = Signal()
         rc_ok = Signal()
-        mbits = Signal(6)
-        valid = Signal()
-        leaf = Signal()
+        mbits = Signal(6, name="mbits_read_wait")
+        valid = rpte.valid
+        eaa = rpte.eaa
+        leaf = rpte.leaf
         badtree = Signal()
 
-        comb += Display("RDW %016x done %d "
+        sync += Display("RDW %016x done %d "
                         "perm %d rc %d mbits %d shf %d "
                         "valid %d leaf %d bad %d",
                         data, d_in.done, perm_ok, rc_ok,
                         mbits, r.shift, valid, leaf, badtree)
 
-        # set pde
+        # set pde and interpret as Radix Tree Page Table Entry (leaf=1 case)
         comb += v.pde.eq(data)
+        comb += rpte.eq(data)
+        comb += rpde.eq(data)
 
-        # test valid bit
-        comb += valid.eq(data[63]) # valid=data[63]
-        comb += leaf.eq(data[62]) # valid=data[63]
-
-        comb += v.pde.eq(data)
-        # valid & leaf
         with m.If(valid):
+            # valid & leaf: RADIX Page-Table Entry
             with m.If(leaf):
                 # check permissions and RC bits
-                with m.If(r.priv | ~data[3]):
-                    with m.If(~r.iside):
-                        comb += perm_ok.eq(data[1] | (data[2] & ~r.store))
-                    with m.Else():
+                with m.If(r.priv | ~eaa[EAA_PRIV]):
+                    with m.If(r.iside): # instruction-side request
                         # no IAMR, so no KUEP support for now
                         # deny execute permission if cache inhibited
-                        comb += perm_ok.eq(data[0] & ~data[5])
+                        comb += perm_ok.eq(eaa[EAA_EXE] & ~rpte.att[1])
+                    with m.Else():
+                        # Load/Store (read/write)
+                        comb += perm_ok.eq(eaa[EAA_WR] |
+                                          (eaa[EAA_RD] & ~r.store))
+                comb += rc_ok.eq(rpte.r & (rpte.c | ~r.store))
 
-                comb += rc_ok.eq(data[8] & (data[7] | ~r.store))
+                # permissions / rc ok, load TLB, otherwise report error
                 with m.If(perm_ok & rc_ok):
                     comb += v.state.eq(State.RADIX_LOAD_TLB)
+                    sync += Display("RADIX LEAF data %x att %x eaa %x "
+                                    "R %d C %d "
+                                    "shift %d pgbase %x ",
+                                    data, rpte.att, eaa,
+                                    rpte.r, rpte.c,
+                                    v.shift, v.pgbase
+                                    )
                 with m.Else():
                     comb += v.state.eq(State.RADIX_FINISH)
                     comb += v.perm_err.eq(~perm_ok)
                     # permission error takes precedence over RC error
                     comb += v.rc_error.eq(perm_ok)
 
-            # valid & !leaf
+            # valid & !leaf: RADIX Page-Directory Entry
             with m.Else():
-                comb += mbits.eq(data[0:5])
+                comb += mbits.eq(rpde.nls) # 5 bits NLS into 6-bit-long mbits
                 comb += badtree.eq((mbits < 5) |
                                    (mbits > 16) |
                                    (mbits > r.shift))
@@ -277,26 +387,31 @@ class MMU(Elaboratable):
                     comb += v.badtree.eq(1)
                 with m.Else():
                     comb += v.shift.eq(r.shift - mbits)
-                    comb += v.mask_size.eq(mbits[0:5])
-                    comb += v.pgbase.eq(Cat(C(0, 8), data[8:56]))
+                    comb += v.mask_size.eq(mbits)
+                    # pagebase is first 48 bits of NLB, shifted up 1 byte
+                    comb += v.pgbase.eq(Cat(C(0, 8), rpde.nlb[:48]))
                     comb += v.state.eq(State.RADIX_LOOKUP)
 
         with m.Else():
             # non-present PTE, generate a DSI
             comb += v.state.eq(State.RADIX_FINISH)
             comb += v.invalid.eq(1)
-            if(display_invalid):
-                sync += Display("MMUBUG: non-present PTE, generate a DSI")
+            if (display_invalid):
+                sync += Display("MMU: non-present PTE, generate a DSI")
 
     def segment_check(self, m, v, r, data, finalmask):
+        """segment_check: checks validity of the request before doing a
+        RADIX lookup. reports either segment error or bad tree if not ok
+        """
         comb = m.d.comb
 
-        mbits = Signal(6)
+        mbits = Signal(6, name="mbits_check")
         nonzero = Signal()
         comb += mbits.eq(r.mask_size)
         comb += v.shift.eq(r.shift + (31 - 12) - mbits)
         comb += nonzero.eq((r.addr[31:62] & ~finalmask[0:31]).bool())
-        with m.If((r.addr[63] ^ r.addr[62]) | nonzero):
+        with m.If((r.addr[63] != r.addr[62]) # pt3 == 0b11 and pt1 == 0b00
+                  | nonzero):
             comb += v.state.eq(State.RADIX_FINISH)
             comb += v.segerror.eq(1)
         with m.Elif((mbits < 5) | (mbits > 16) |
@@ -328,12 +443,14 @@ class MMU(Elaboratable):
                             "%d badtree=%d", l_out.invalid, l_out.badtree)
 
         with m.If(rin.state == State.RADIX_LOOKUP):
-            sync += Display ("radix lookup shift=%d msize=%d",
-                            rin.shift, rin.mask_size)
+            sync += Display ("radix lookup shift=%x msize=%x",
+                            rin.shift, mask)
 
         with m.If(r.state == State.RADIX_LOOKUP):
-            sync += Display(f"send load addr=%x addrsh=%d mask=%x",
+            sync += Display(f"send load addr=%x addrsh=%x mask=%x",
                             d_out.addr, addrsh, mask)
+
+        # update the internal register
         sync += r.eq(rin)
 
     def elaborate(self, platform):
@@ -349,6 +466,11 @@ class MMU(Elaboratable):
         self.rin = rin = RegStage("r_in")
         r = RegStage("r")
 
+        # get access to prtbl and pid for debug / testing purposes ONLY
+        # (actually, not needed, because setup_regs() triggers mmu direct)
+        # self._prtbl = r.prtbl
+        # self._pid = r.pid
+
         l_in  = self.l_in
         l_out = self.l_out
         d_out = self.d_out
@@ -357,7 +479,7 @@ class MMU(Elaboratable):
 
         self.mmu_0(m, r, rin, l_in, l_out, d_out, addrsh, mask)
 
-        v = RegStage()
+        v = RegStage("v")
         dcreq = Signal()
         tlb_load = Signal()
         itlb_load = Signal()
@@ -372,7 +494,6 @@ class MMU(Elaboratable):
 
         comb += v.eq(r)
         comb += v.valid.eq(0)
-        comb += dcreq.eq(0)
         comb += v.done.eq(0)
         comb += v.err.eq(0)
         comb += v.invalid.eq(0)
@@ -380,11 +501,7 @@ class MMU(Elaboratable):
         comb += v.segerror.eq(0)
         comb += v.perm_err.eq(0)
         comb += v.rc_error.eq(0)
-        comb += tlb_load.eq(0)
-        comb += itlb_load.eq(0)
-        comb += tlbie_req.eq(0)
         comb += v.inval_all.eq(0)
-        comb += prtbl_rd.eq(0)
 
         # Radix tree data structures in memory are
         # big-endian, so we need to byte-swap them
@@ -392,17 +509,29 @@ class MMU(Elaboratable):
 
         # generate mask for extracting address fields for PTE addr generation
         m.submodules.pte_mask = pte_mask = Mask(16-5)
+        pte_mask.mask.name = "pte_mask"
         comb += pte_mask.shift.eq(r.mask_size - 5)
         comb += mask.eq(Cat(C(0x1f, 5), pte_mask.mask))
 
         # generate mask for extracting address bits to go in
         # TLB entry in order to support pages > 4kB
         m.submodules.tlb_mask = tlb_mask = Mask(44)
+        tlb_mask.mask.name = "tlb_mask"
         comb += tlb_mask.shift.eq(r.shift)
         comb += finalmask.eq(tlb_mask.mask)
 
+        # Shift address bits 61--12 right by 0--47 bits and
+        # supply the least significant 16 bits of the result.
+        comb += addrsh.eq(r.addr[12:62] >> r.shift)
+
         with m.If(r.state != State.IDLE):
             sync += Display("MMU state %d %016x", r.state, data)
+            sync += Display("addrsh %x r.shift %d r.addr[12:62] %x",
+                        addrsh, r.shift, r.addr[12:62])
+
+        ##########
+        # Main FSM
+        ##########
 
         with m.Switch(r.state):
             with m.Case(State.IDLE):
@@ -460,25 +589,35 @@ class MMU(Elaboratable):
                 sync += Display("   RADIX_FINISH")
                 comb += v.state.eq(State.IDLE)
 
+        # check and report either error or done.
         with m.If((v.state == State.RADIX_FINISH) |
                  ((v.state == State.RADIX_LOAD_TLB) & r.iside)):
             comb += v.err.eq(v.invalid | v.badtree | v.segerror
                              | v.perm_err | v.rc_error)
             comb += v.done.eq(~v.err)
 
-        with m.If(~r.addr[63]):
+        # PID is only valid if MSB of address is zero, top 2 bits are Quadrant
+        with m.If(~r.addr[63]): # quadrant 0 (pt0)
             comb += effpid.eq(r.pid)
 
+        # calculate Process Table Address
         pr24 = Signal(24, reset_less=True)
-        comb += pr24.eq(masked(r.prtbl[12:36], effpid[8:32], finalmask))
-        comb += prtb_adr.eq(Cat(C(0, 4), effpid[0:8], pr24, r.prtbl[36:56]))
+        prtbla = PRTBL("prtbla")
+        comb += prtbla.eq(r.prtbl)
+        rpdb = prtbla.rpdb
+        comb += pr24.eq(masked(rpdb[4:28], effpid[8:32], finalmask))
+        comb += prtb_adr.eq(Cat(C(0, 4), effpid[0:8], pr24, rpdb[28:48]))
 
+        # calculate Page Table Address
         pg16 = Signal(16, reset_less=True)
         comb += pg16.eq(masked(r.pgbase[3:19], addrsh, mask))
         comb += pgtb_adr.eq(Cat(C(0, 3), pg16, r.pgbase[19:56]))
 
+        # calculate Page Table Entry from Real Page Number (leaf=1, RTPTE)
+        rpte = RTPTE(name="rpte")
+        comb += rpte.eq(r.pde)
         pd44 = Signal(44, reset_less=True)
-        comb += pd44.eq(masked(r.pde[12:56], r.addr[12:56], finalmask))
+        comb += pd44.eq(masked(rpte.rpn, r.addr[12:56], finalmask))
         comb += pte.eq(Cat(r.pde[0:12], pd44))
 
         # update registers
@@ -494,7 +633,11 @@ class MMU(Elaboratable):
             comb += addr.eq(prtb_adr)
         with m.Else():
             comb += addr.eq(pgtb_adr)
+            sync += Display(f"pagetable pg16=%x addrsh %x mask %x pgbase=%x "
+                            "pgbase[19:56]=%x",
+                            pg16, addrsh, mask, r.pgbase, r.pgbase[19:56])
 
+        # connect to other interfaces: LDST, D-Cache, I-Cache
         comb += l_out.done.eq(r.done)
         comb += l_out.err.eq(r.err)
         comb += l_out.invalid.eq(r.invalid)
@@ -533,8 +676,8 @@ def dcache_get(dut):
     mem = {0x0: 0x000000, # to get mtspr prtbl working
 
            0x10000:    # PARTITION_TABLE_2
-                       # PATB_GR=1 PRTB=0x1000 PRTS=0xb
-           b(0x800000000100000b),
+                       # HR=1 RTS1=0x2 PRTB=0x300 RTS2=0x5 PRTS=0xb
+           b(0xc0000000000030ad),
 
            0x30000:     # RADIX_ROOT_PTE
                         # V = 1 L = 0 NLB = 0x400 NLS = 9
@@ -545,20 +688,77 @@ def dcache_get(dut):
                            # R = 1 C = 1 ATT = 0 EAA 0x7
            b(0xc000000000000187),
 
-          0x1000000:   # PROCESS_TABLE_3
+#
+#   slightly different from radix_walk_example.txt: address in microwatt
+#   has the top bit set to indicate hypervisor.  here, Quadrant 3's
+#   process table entry is put instead into Quadrant 0.  the entry
+#   PROCESS_TABLE_3 should, strictly speaking, be at 0x1000010
+
+#          0x1000000:   # PROCESS_TABLE_3 (pt0_valid)
+#                       # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 12
+#           b(0x40000000000300ac),
+
+          0x1000000:   # PROCESS_TABLE_3 (pt3_valid)
                        # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
            b(0x40000000000300ad),
           }
 
+    # microwatt mmu.bin first part of test 2.
+    # PRTBL must be set to 0x12000, PID to 1
+    mem = {
+             0x0: 0x000000, # to get mtspr prtbl working
+             0x13920: 0x86810000000000c0, # leaf, supposed to be at 0x13920
+             0x10000: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+             0x124000: 0x0000000badc0ffee,  # memory to be looked up
+            }
+
+    # microwatt mmu.bin first part of test 4.
+    # PRTBL must be set to 0x12000, PID to 1
+    mem = {
+             0x0: 0x000000, # to get mtspr prtbl working
+             0x13858: 0x86a10000000000c0, # leaf node
+             0x10000: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+    }
+
+    # microwatt mmu.bin test 5.
+    # PRTBL must be set to 0x12000, PID to 1
+    mem = {
+             0x0: 0x000000, # to get mtspr prtbl working
+             0x13cf8: 0x86b10000000000c0, # leaf node
+             0x13d00: 0x0000000000000000, # invalid leaf node
+             0x10008: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+    }
+
+    # microwatt mmu.bin test 12, instruction-side
+    # PRTBL must be set to 0x12000, PID to 1, iside to 1
+    mem = {
+             0x0: 0x000000, # to get mtspr prtbl working
+             0x13920: 0x01110000000000c0, # leaf node
+             0x10008: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+    }
+
     while not stop:
         while True: # wait for dc_valid
             if stop:
                 return
             dc_valid = yield (dut.d_out.valid)
+            tlbld = yield (dut.d_out.tlbld)
             if dc_valid:
                 break
             yield
         addr = yield dut.d_out.addr
+        if tlbld:
+            pte = yield dut.d_out.pte
+            print ("    DCACHE PTE %x -> %x" % (pte, addr))
+            yield dut.d_in.done.eq(1)
+            yield
+            yield dut.d_in.done.eq(0)
+            continue
+
         if addr not in mem:
             print ("    DCACHE LOOKUP FAIL %x" % (addr))
             stop = True
@@ -572,9 +772,15 @@ def dcache_get(dut):
         yield
         yield dut.d_in.done.eq(0)
 
+
 def mmu_wait(dut):
     global stop
     while not stop: # wait for dc_valid / err
+        d_valid = yield (dut.d_out.valid)
+        if d_valid:
+            tlbld = yield (dut.d_out.tlbld)
+            addr = yield (dut.d_out.addr)
+            print ("addr %x tlbld %d" % (addr, tlbld))
         l_done = yield (dut.l_out.done)
         l_err = yield (dut.l_out.err)
         l_badtree = yield (dut.l_out.badtree)
@@ -590,13 +796,20 @@ def mmu_wait(dut):
         yield dut.l_in.mtspr.eq(0) # captured by RegStage(s)
         yield dut.l_in.load.eq(0)  # can reset everything safely
 
+
 def mmu_sim(dut):
     global stop
 
+    # microwatt PRTBL = 0x12000, other test is 0x1000000
+    #prtbl = 0x100000
+    #pidr = 0x0
+    prtbl = 0x12000
+    pidr = 0x1
+
     # MMU MTSPR set prtbl
     yield dut.l_in.mtspr.eq(1)
     yield dut.l_in.sprn[9].eq(1) # totally fake way to set SPR=prtbl
-    yield dut.l_in.rs.eq(0x1000000) # set process table
+    yield dut.l_in.rs.eq(prtbl) # set process table
     yield dut.l_in.valid.eq(1)
     yield from mmu_wait(dut)
     yield
@@ -606,26 +819,55 @@ def mmu_sim(dut):
 
     prtbl = yield (dut.rin.prtbl)
     print ("prtbl after MTSPR %x" % prtbl)
-    assert prtbl == 0x1000000
+    assert prtbl == prtbl
+
+    if True: # microwatt test set PIDR
+        # MMU MTSPR set PIDR = 1
+        yield dut.l_in.mtspr.eq(1)
+        yield dut.l_in.sprn[9].eq(0) # totally fake way to set SPR=pidr
+        yield dut.l_in.rs.eq(pidr) # set process table
+        yield dut.l_in.valid.eq(1)
+        yield from mmu_wait(dut)
+        yield
+        yield dut.l_in.sprn.eq(0)
+        yield dut.l_in.rs.eq(0)
+        yield
 
     #yield dut.rin.prtbl.eq(0x1000000) # manually set process table
     #yield
 
+    #addr = 0x10000  # original test
+    #addr = 0x124108  # microwatt mmu.bin test 2
+    #addr = 0x10b0d8  # microwatt mmu.bin test 4
+    # these are a misalignment test. one load results in two actual
+    # lookups, one of which has a valid page table entry, the other
+    # does not.  we currently do not support misaligned in Loadstore1
+    # therefore these tests fail with an align_intr (0x600) at 0x39fffd
+    addr = 0x39fffd # microwatt mmu.bin test 5
+    addr = 0x3a0000 # microwatt mmu.bin test 5
+
+    # microwatt mmu.bin test 12 is instruction-side
+    addr = 0x324000 # microwatt mmu.bin test 12
+    iside = 1
 
     # MMU PTE request
-    yield dut.l_in.load.eq(1)
+    yield dut.l_in.iside.eq(iside)
+    yield dut.l_in.load.eq(0)
     yield dut.l_in.priv.eq(1)
-    yield dut.l_in.addr.eq(0x10000)
+    yield dut.l_in.addr.eq(addr)
     yield dut.l_in.valid.eq(1)
     yield from mmu_wait(dut)
 
     addr = yield dut.d_out.addr
     pte = yield dut.d_out.pte
+    tlb_ld = yield dut.d_out.tlbld
     l_done = yield (dut.l_out.done)
     l_err = yield (dut.l_out.err)
     l_badtree = yield (dut.l_out.badtree)
-    print ("translated done %d err %d badtree %d addr %x pte %x" % \
-               (l_done, l_err, l_badtree, addr, pte))
+    print ("translated done %d err %d badtree %d "
+           "addr %x pte %x tlb_ld %d" % \
+               (l_done, l_err, l_badtree, addr, pte, tlb_ld))
+
     yield
     yield dut.l_in.priv.eq(0)
     yield dut.l_in.addr.eq(0)
index 2e8643da33072c26363a248b85c0819338fa60ff..023f47589eaf983e5731cfd7c6970b6072db47f2 100644 (file)
@@ -10,7 +10,7 @@
 
     busy_o/1        most likely to be x_busy_o
     go_die_i/1      rst?
-    addr.data/48    x_addr_i (x_addr_i[:4] goes into LenExpand)
+    addr.data/64    x_addr_i (x_addr_i[:4] goes into LenExpand)
     addr.ok/1       probably x_i_valid & ~x_stall_i
 
     addr_ok_o/1     no equivalent.  *might* work using x_stall_i
@@ -37,7 +37,7 @@ from nmutil.util import rising_edge
 class Pi2LSUI(PortInterfaceBase):
 
     def __init__(self, name, lsui=None,
-                 data_wid=64, mask_wid=8, addr_wid=48):
+                 data_wid=64, mask_wid=8, addr_wid=64):
         print("pi2lsui reg mask addr", data_wid, mask_wid, addr_wid)
         super().__init__(data_wid, addr_wid)
         if lsui is None:
@@ -46,13 +46,13 @@ class Pi2LSUI(PortInterfaceBase):
         self.lsui_busy = Signal()
         self.valid_l = SRLatch(False, name="valid")
 
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz):
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
         print("pi2lsui TODO, implement is_dcbz")
         m.d.comb += self.valid_l.s.eq(1)
         m.d.comb += self.lsui.x_mask_i.eq(mask)
         m.d.comb += self.lsui.x_addr_i.eq(addr)
 
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
         m.d.comb += self.valid_l.s.eq(1)
         m.d.comb += self.lsui.x_mask_i.eq(mask)
         m.d.comb += self.lsui.x_addr_i.eq(addr)
@@ -115,7 +115,7 @@ class Pi2LSUI(PortInterfaceBase):
 class Pi2LSUI1(Elaboratable):
 
     def __init__(self, name, pi=None, lsui=None,
-                 data_wid=64, mask_wid=8, addr_wid=48):
+                 data_wid=64, mask_wid=8, addr_wid=64):
         print("pi2lsui reg mask addr", data_wid, mask_wid, addr_wid)
         self.addrbits = mask_wid
         if pi is None:
index d4e7b60c9b7356db71da53679db0a6e346aea360..93db9d6e9bdda5eceae23a680328ded88034e7ef 100644 (file)
@@ -25,6 +25,7 @@ from nmigen.utils import log2_int
 from nmutil.latch import SRLatch, latchregister
 from nmutil.util import rising_edge
 from openpower.decoder.power_decoder2 import Data
+from openpower.decoder.power_enums import MSRSpec
 from soc.scoreboard.addr_match import LenExpand
 from soc.experiment.mem_types import LDSTException
 
@@ -89,20 +90,25 @@ class PortInterface(RecordObject):
       busy_o is deasserted on the cycle AFTER st.ok is asserted.
     """
 
-    def __init__(self, name=None, regwid=64, addrwid=48):
+    def __init__(self, name=None, regwid=64, addrwid=64):
 
         self._regwid = regwid
         self._addrwid = addrwid
 
         RecordObject.__init__(self, name=name)
 
-        # distinguish op type (ld/st)
+        # distinguish op type (ld/st/dcbz/nc)
         self.is_ld_i    = Signal(reset_less=True)
         self.is_st_i    = Signal(reset_less=True)
+        self.is_dcbz_i     = Signal(reset_less=True) # cache-line zeroing
+        self.is_nc         = Signal()  # no cacheing
 
         # LD/ST data length (TODO: other things may be needed)
         self.data_len = Signal(4, reset_less=True)
 
+        # atomic reservation (LR/SC - ldarx / stdcx etc.)
+        self.reserve = Signal(reset_less=True)
+
         # common signals
         self.busy_o = Signal(reset_less=True)     # do not use if busy
         self.go_die_i = Signal(reset_less=True)   # back to reset
@@ -110,19 +116,17 @@ class PortInterface(RecordObject):
         # addr is valid (TLB, L1 etc.)
         self.addr_ok_o = Signal(reset_less=True)
         self.exc_o = LDSTException("exc")
-        self.dar_o = Signal(64, reset_less=True)
 
         # LD/ST
         self.ld = Data(regwid, "ld_data_o")  # ok to be set by L0 Cache/Buf
         self.st = Data(regwid, "st_data_i")  # ok to be set by CompUnit
+        self.store_done = Data(1, "store_done_o") # store has been actioned
 
-        # additional "modes"
-        self.is_nc         = Signal()  # no cacheing
-        self.msr_pr        = Signal()  # 1==virtual, 0==privileged
-        self.is_dcbz_i     = Signal(reset_less=True)
-
-        # mmu
-        self.mmu_done          = Signal() # keep for now
+        #only priv_mode = not msr_pr is used currently
+        # TODO: connect signals
+        self.virt_mode  = Signal() # ctrl.msr(MSR_DR);
+        self.priv_mode  = Signal() # not ctrl.msr(MSR_PR);
+        self.mode_32bit = Signal() # not ctrl.msr(MSR_SF);
 
         # dcache
         self.ldst_error        = Signal()
@@ -136,17 +140,19 @@ class PortInterface(RecordObject):
                 self.is_nc.eq(inport.is_nc),
                 self.is_dcbz_i.eq(inport.is_dcbz_i),
                 self.data_len.eq(inport.data_len),
+                self.reserve.eq(inport.reserve),
                 self.go_die_i.eq(inport.go_die_i),
                 self.addr.data.eq(inport.addr.data),
                 self.addr.ok.eq(inport.addr.ok),
                 self.st.eq(inport.st),
-                self.msr_pr.eq(inport.msr_pr),
+                self.virt_mode.eq(inport.virt_mode),
+                self.priv_mode.eq(inport.priv_mode),
+                self.mode_32bit.eq(inport.mode_32bit),
                 inport.ld.eq(self.ld),
                 inport.busy_o.eq(self.busy_o),
                 inport.addr_ok_o.eq(self.addr_ok_o),
                 inport.exc_o.eq(self.exc_o),
-                inport.dar_o.eq(self.dar_o),
-                inport.mmu_done.eq(self.mmu_done),
+                inport.store_done.eq(self.store_done),
                 inport.ldst_error.eq(self.ldst_error),
                 inport.cache_paradox.eq(self.cache_paradox)
                 ]
@@ -175,8 +181,8 @@ class PortInterfaceBase(Elaboratable):
     def connect_port(self, inport):
         return self.pi.connect_port(inport)
 
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz): pass
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr): pass
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc): pass
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc): pass
     def set_wr_data(self, m, data, wen): pass
     def get_rd_data(self, m): pass
 
@@ -214,7 +220,13 @@ class PortInterfaceBase(Elaboratable):
         pi = self.pi
         comb += lds.eq(pi.is_ld_i)  # ld-req signals
         comb += sts.eq(pi.is_st_i)  # st-req signals
-        pr = pi.msr_pr # MSR problem state: PR=1 ==> virt, PR==0 ==> priv
+
+        # TODO: construct an MSRspec here and pass it over in
+        # self.set_rd_addr and set_wr_addr below rather than just pr
+        pr = ~pi.priv_mode
+        dr = pi.virt_mode
+        sf = ~pi.mode_32bit
+        msr = MSRSpec(pr=pr, dr=dr, sf=sf)
 
         # detect busy "edge"
         busy_delay = Signal()
@@ -228,7 +240,6 @@ class PortInterfaceBase(Elaboratable):
         misalign = Signal()
         comb += misalign.eq(lenexp.lexp_o[8:].bool())
 
-
         # activate mode: only on "edge"
         comb += ld_active.s.eq(rising_edge(m, lds))  # activate LD mode
         comb += st_active.s.eq(rising_edge(m, sts))  # activate ST mode
@@ -247,7 +258,8 @@ class PortInterfaceBase(Elaboratable):
             comb += lenexp.len_i.eq(pi.data_len)
             comb += lenexp.addr_i.eq(lsbaddr)
             with m.If(pi.addr.ok & adrok_l.qn):
-                self.set_rd_addr(m, pi.addr.data, lenexp.lexp_o, misalign, pr)
+                self.set_rd_addr(m, pi.addr.data, lenexp.lexp_o, misalign,
+                                    msr, pi.is_nc)
                 comb += pi.addr_ok_o.eq(1)  # acknowledge addr ok
                 sync += adrok_l.s.eq(1)       # and pull "ack" latch
 
@@ -259,8 +271,8 @@ class PortInterfaceBase(Elaboratable):
             comb += lenexp.len_i.eq(pi.data_len)
             comb += lenexp.addr_i.eq(lsbaddr)
             with m.If(pi.addr.ok):
-                self.set_wr_addr(m, pi.addr.data, lenexp.lexp_o, misalign, pr,
-                                 pi.is_dcbz_i)
+                self.set_wr_addr(m, pi.addr.data, lenexp.lexp_o, misalign, msr,
+                                 pi.is_dcbz_i, pi.is_nc)
                 with m.If(adrok_l.qn & self.pi.exc_o.happened==0):
                     comb += pi.addr_ok_o.eq(1)  # acknowledge addr ok
                     sync += adrok_l.s.eq(1)       # and pull "ack" latch
@@ -285,7 +297,7 @@ class PortInterfaceBase(Elaboratable):
         with m.If(st_active.q & pi.st.ok):
             # shift data up before storing.  lenexp *bit* version of mask is
             # passed straight through as byte-level "write-enable" lines.
-            stdata = Signal(self.regwid, reset_less=True)
+            stdata = Signal(self.regwid*2, reset_less=True)
             comb += stdata.eq(pi.st.data << (lenexp.addr_i*8))
             # TODO: replace with link to LoadStoreUnitInterface.x_store_data
             # and also handle the ready/stall/busy protocol
@@ -323,7 +335,14 @@ class PortInterfaceBase(Elaboratable):
             comb += busy_l.r.eq(1)
 
         # busy latch outputs to interface
-        comb += pi.busy_o.eq(busy_l.q)
+        if hasattr(self, "external_busy"):
+            # when there is an extra (external) busy, include that here.
+            # this is used e.g. in LoadStore1 when an instruction fault
+            # is being processed (instr_fault) and stops Load/Store requests
+            # from being made until it's done
+            comb += pi.busy_o.eq(busy_l.q | self.external_busy(m))
+        else:
+            comb += pi.busy_o.eq(busy_l.q)
 
         return m
 
@@ -349,11 +368,11 @@ class TestMemoryPortInterface(PortInterfaceBase):
         # hard-code memory addressing width to 6 bits
         self.mem = TestMemory(regwid, 5, granularity=regwid//8, init=False)
 
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz):
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
         lsbaddr, msbaddr = self.splitaddr(addr)
         m.d.comb += self.mem.wrport.addr.eq(msbaddr)
 
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
         lsbaddr, msbaddr = self.splitaddr(addr)
         m.d.comb += self.mem.rdport.addr.eq(msbaddr)
 
index 31f84c2033153ff710ca13aafa73445e181eb46f..661b784d71f6a091757d21e8de7ebebc50b4e4d8 100644 (file)
@@ -1,7 +1,8 @@
 # based on microwatt plru.vhdl
 
-from nmigen import Elaboratable, Signal, Array, Module, Mux, Const
+from nmigen import Elaboratable, Signal, Array, Module, Mux, Const, Cat
 from nmigen.cli import rtlil
+from nmigen.lib.coding import Decoder
 
 
 class PLRU(Elaboratable):
@@ -52,6 +53,53 @@ class PLRU(Elaboratable):
     def ports(self):
         return [self.acc_en, self.lru_o, self.acc_i]
 
+
+class PLRUs(Elaboratable):
+    def __init__(self, cachetype, n_plrus, n_bits):
+        self.cachetype = cachetype
+        self.n_plrus = n_plrus
+        self.n_bits = n_bits
+        self.valid = Signal()
+        self.way = Signal(n_bits)
+        self.index = Signal(n_plrus.bit_length())
+        self.isel = Signal(n_plrus.bit_length())
+        self.o_index = Signal(n_bits)
+
+    def elaborate(self, platform):
+        """Generate TLB PLRUs
+        """
+        m = Module()
+        comb = m.d.comb
+
+        if self.n_plrus == 0:
+            return m
+
+        # Binary-to-Unary one-hot, enabled by valid
+        m.submodules.te = te = Decoder(self.n_plrus)
+        comb += te.n.eq(~self.valid)
+        comb += te.i.eq(self.index)
+
+        out = Array(Signal(self.n_bits, name="plru_out%d" % x) \
+                             for x in range(self.n_plrus))
+
+        for i in range(self.n_plrus):
+            # PLRU interface
+            name = "%s_plru_%d" % (self.cachetype, i)
+            m.submodules[name] = plru = PLRU(self.n_bits)
+
+            comb += plru.acc_en.eq(te.o[i])
+            comb += plru.acc_i.eq(self.way)
+            comb += out[i].eq(plru.lru_o)
+
+        # select output based on index
+        comb += self.o_index.eq(out[self.isel])
+
+        return m
+
+    def ports(self):
+        return [self.valid, self.way, self.index, self.isel, self.o_index]
+
+
 if __name__ == '__main__':
     dut = PLRU(2)
     vl = rtlil.convert(dut, ports=dut.ports())
@@ -59,3 +107,9 @@ if __name__ == '__main__':
         f.write(vl)
 
 
+    dut = PLRUs("testing", 4, 2)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_plrus.il", "w") as f:
+        f.write(vl)
+
+
index 2e6c734f0ebbc6137e46b001a5474ce54de7d210..d30a99dc65a28278c9c04eff1dcfb935da0e199d 100644 (file)
@@ -53,7 +53,7 @@ PROCESS_TABLE:
            RTS2 = 0x5
            RPDS = 12
 
-           PROCESS_TABLE_3       |     PROCESS_TABLE_3 //Hypervisor Userspace 
+0x1000010 :    PROCESS_TABLE_3       |     PROCESS_TABLE_3 //Hypervisor Userspace 
            0x40000000000300ad    |     0x0
             RTS1 = 0x2
            RPDB = 0x300
index 53bc03912470c98935b4800ac93c6186cc5b763f..e481dd4ad5b3075d4217b5f708fb0889bbedaf6e 100644 (file)
@@ -13,7 +13,7 @@ test1 = {
 
            0x40000:     # RADIX_SECOND_LEVEL
                         # V = 1 L = 1 SW = 0 RPN = 0
-                        # R = 1 C = 1 ATT = 0 EAA 0x7
+                        # R = 1 C = 1 ATT = 0 EAA 0x3
            b(0xc000000000000183),
 
            0x1000000:   # PROCESS_TABLE_3
@@ -23,3 +23,144 @@ test1 = {
            #0x10004: 0
 
 }
+
+
+# executable permission is barred here (EAA=0x2)
+test2 = {
+           0x10000:    # PARTITION_TABLE_2
+                       # PATB_GR=1 PRTB=0x1000 PRTS=0xb
+           b(0x800000000100000b),
+
+           0x30000:     # RADIX_ROOT_PTE
+                        # V = 1 L = 0 NLB = 0x400 NLS = 9
+           b(0x8000000000040009),
+
+           0x40000:     # RADIX_SECOND_LEVEL
+                        # V = 1 L = 1 SW = 0 RPN = 0
+                        # R = 1 C = 1 ATT = 0 EAA 0x2
+           b(0xc000000000000182),
+
+           0x1000000:   # PROCESS_TABLE_3
+                        # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
+           b(0x40000000000300ad),
+
+           #0x10004: 0
+
+}
+
+
+# microwatt mmu.bin first part of test 2. PRTBL must be set to 0x12000, PID to 1
+microwatt_test2 = {
+             0x13920: 0x86810000000000c0, # leaf node
+             0x10000: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+             0x8108: 0x0000000badc0ffee,  # memory to be looked up
+            }
+
+microwatt_test4 = {
+             0x13858: 0x86a10000000000c0, # leaf node
+             0x10000: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+}
+
+# microwatt mmu.bin test 5: a misaligned read which crosses over to a TLB that
+# is not valid.  must attempt a 64-bit read at address 0x39fffd to trigger
+
+microwatt_test5 = {
+             0x13cf8: 0x86b10000000000c0, # leaf, covers up to 0x39ffff
+             0x10008: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+             0x39fff8: 0x0123456badc0ffee,  # to be looked up (should fail)
+             0x400000: 0x0123456badc0ffee,  # not page-mapped
+}
+
+# linux kernel 5.7 first MMU enable
+"""
+                          rd @ 000bf803 di b000000000001033 sel ff 3.......
+                          rd @ 000bf804 di                0 sel ff ........
+                          rd @ 000bf805 di                0 sel ff ........
+                          rd @ 000bf806 di            10000 sel ff ........
+                          rd @ 000bf807 di c0000000005fc380 sel ff ........
+                          rd @ 000bf800 di         80000000 sel ff ........
+                          rd @ 000bf801 di c00000000059d400 sel ff ..Y.....
+                          rd @ 000bf802 di c000000000000000 sel ff ........
+pc     a588 insn 7c7a03a6 msr a000000000000003
+pc     a58c insn 7c9b03a6 msr a000000000000003
+pc     a590 insn 4c000024 msr a000000000000003
+pc     a598 insn f82d0190 msr b000000000000033
+                          rd @ 01c00000 di ad005c0000000040 sel ff ........
+                          rd @ 01c00001 di                0 sel ff ........
+                          rd @ 01c00002 di                0 sel ff ........
+                          rd @ 01c00003 di                0 sel ff ........
+                          rd @ 01c00004 di                0 sel ff ........
+                          rd @ 01c00005 di                0 sel ff ........
+                          rd @ 01c00006 di                0 sel ff ........
+                          rd @ 01c00007 di                0 sel ff ........
+                          rd @ 000b8000 di  9e0ff0f00000080 sel ff ........
+                          rd @ 000b8001 di                0 sel ff ........
+                          rd @ 000b8002 di                0 sel ff ........
+                          rd @ 000b8003 di                0 sel ff ........
+                          rd @ 000b8004 di                0 sel ff ........
+                          rd @ 000b8005 di                0 sel ff ........
+                          rd @ 000b8006 di                0 sel ff ........
+                          rd @ 000b8007 di                0 sel ff ........
+                          rd @ 01fffc00 di  9d0ff0f00000080 sel ff ........
+                          rd @ 01fffc01 di                0 sel ff ........
+                          rd @ 01fffc02 di                0 sel ff ........
+                          rd @ 01fffc03 di                0 sel ff ........
+                          rd @ 01fffc04 di                0 sel ff ........
+                          rd @ 01fffc05 di                0 sel ff ........
+                          rd @ 01fffc06 di                0 sel ff ........
+                          rd @ 01fffc07 di                0 sel ff ........
+                          rd @ 01fffa00 di 8f010000000000c0 sel ff ........
+                          rd @ 01fffa01 di 8f012000000000c0 sel ff ........
+                          rd @ 01fffa02 di 8f014000000000c0 sel ff ........
+                          rd @ 01fffa03 di 8e016000000000c0 sel ff ........
+                          rd @ 01fffa04 di 8e018000000000c0 sel ff ........
+                          rd @ 01fffa05 di 8e01a000000000c0 sel ff ........
+                          rd @ 01fffa06 di 8e01c000000000c0 sel ff ........
+                          rd @ 01fffa07 di 8e01e000000000c0 sel ff ........
+"""
+
+microwatt_linux_5_7_boot = {
+                  0x000bf803<<3: 0xb000000000001033,
+                  0x000bf804<<3: 0x0,
+                  0x000bf805<<3: 0x0,
+                  0x000bf806<<3: 0x10000,
+                  0x000bf807<<3: 0xc0000000005fc380,
+                  0x000bf800<<3: 0x80000000,
+                  0x000bf801<<3: 0xc00000000059d400,
+                  0x000bf802<<3: 0xc000000000000000,
+                  0x01c00000<<3: 0xad005c0000000040,
+                  0x01c00001<<3: 0x0,
+                  0x01c00002<<3: 0x0,
+                  0x01c00003<<3: 0x0,
+                  0x01c00004<<3: 0x0,
+                  0x01c00005<<3: 0x0,
+                  0x01c00006<<3: 0x0,
+                  0x01c00007<<3: 0x0,
+                  0x000b8000<<3: 0x09e0ff0f00000080,
+                  0x000b8001<<3: 0x0,
+                  0x000b8002<<3: 0x0,
+                  0x000b8003<<3: 0x0,
+                  0x000b8004<<3: 0x0,
+                  0x000b8005<<3: 0x0,
+                  0x000b8006<<3: 0x0,
+                  0x000b8007<<3: 0x0,
+                  0x01fffc00<<3: 0x09d0ff0f00000080,
+                  0x01fffc01<<3: 0x0,
+                  0x01fffc02<<3: 0x0,
+                  0x01fffc03<<3: 0x0,
+                  0x01fffc04<<3: 0x0,
+                  0x01fffc05<<3: 0x0,
+                  0x01fffc06<<3: 0x0,
+                  0x01fffc07<<3: 0x0,
+                  0x01fffa00<<3: 0x8f010000000000c0,
+                  0x01fffa01<<3: 0x8f012000000000c0,
+                  0x01fffa02<<3: 0x8f014000000000c0,
+                  0x01fffa03<<3: 0x8e016000000000c0,
+                  0x01fffa04<<3: 0x8e018000000000c0,
+                  0x01fffa05<<3: 0x8e01a000000000c0,
+                  0x01fffa06<<3: 0x8e01c000000000c0,
+                  0x01fffa07<<3: 0x8e01e000000000c0,
+}
index 4c2e1347adc29a4d2b05d018746bf6fff1a458a0..2f2c51d1c18888187c4d540e54fb5604d9b8e236 100644 (file)
@@ -464,13 +464,6 @@ def scoreboard_sim(op):
                         wrmask=[0, 1],
                         src_delays=[2, 0], dest_delays=[1, 0])
 
-    # test combinatorial zero-delay operation
-    # In the test ALU, any operation other than ADD, MUL, EXTS or SHR
-    # is zero-delay, and do a subtraction.
-    # 5 - 2 = 3
-    yield from op.issue([5, 2], MicrOp.OP_CMP, [3, 0],
-                        wrmask=[0, 1],
-                        src_delays=[0, 1], dest_delays=[2, 0])
     # test all combinations of masked input ports
     # NOP does not make any request nor response
     yield from op.issue([5, 2], MicrOp.OP_NOP, [0, 0],
@@ -484,6 +477,15 @@ def scoreboard_sim(op):
     yield from op.issue([2, 0x80], MicrOp.OP_EXTSWSLI, [0xFF80, 0],
                         rdmaskn=[1, 0], wrmask=[0, 1],
                         src_delays=[1, 2], dest_delays=[1, 0])
+
+    # test combinatorial zero-delay operation
+    # In the test ALU, any operation other than ADD, MUL, EXTS or SHR
+    # is zero-delay, and do a subtraction.
+    # 5 - 2 = 3
+    yield from op.issue([5, 2], MicrOp.OP_CMP, [3, 0],
+                        wrmask=[0, 1],
+                        src_delays=[0, 1], dest_delays=[2, 0])
+
     # test with rc=1, so expect results on the CR output port
     # 5 + 2 = 7
     # 7 > 0 => CR = 0b100
@@ -532,14 +534,14 @@ def test_compunit_fsm():
                 'n_data_o[7:0]',
                 ({'submodule': 'n'},
                     ['n_o_valid', 'n_i_ready'])])]),
-        ('debug', {'module': 'top'},
+        ('debug', {'module': 'bench'},
             ['src1_count[7:0]', 'src2_count[7:0]', 'dest1_count[7:0]'])]
 
     write_gtkw(
         "test_compunit_fsm1.gtkw",
         "test_compunit_fsm1.vcd",
         traces, style,
-        module='top.cu'
+        module='bench.top.cu'
     )
     m = Module()
     alu = Shifter(8)
@@ -665,7 +667,7 @@ def test_compunit_regspec3():
                "test_compunit_regspec3.vcd",
                traces, style,
                clk_period=1e-6,
-               module='top.cu')
+               module='bench.top.cu')
 
     inspec = [('INT', 'a', '0:15'),
               ('INT', 'b', '0:15'),
@@ -736,14 +738,14 @@ def test_compunit_regspec1():
             ('next port', 'out', [
                 'alu_o[15:0]', 'o_valid', 'i_ready',
                 'alu_o_ok', 'alu_cr_ok'])]),
-        ('debug', {'module': 'top'},
+        ('debug', {'module': 'bench'},
             ['src1_count[7:0]', 'src2_count[7:0]', 'dest1_count[7:0]'])]
 
     write_gtkw("test_compunit_regspec1.gtkw",
                "test_compunit_regspec1.vcd",
                traces, style,
                clk_period=1e-6,
-               module='top.cu')
+               module='bench.top.cu')
 
     inspec = [('INT', 'a', '0:15'),
               ('INT', 'b', '0:15')]
index ba3c62a6bb8d8fe2e5798a38f2dbda901ef4a7a8..a0d2372a30dc5cd81adbc62e48339d0708a8c737 100644 (file)
@@ -72,6 +72,8 @@ class OpSim:
             yield
 
 
+# FIXME: AttributeError: type object 'LDSTPipeSpec' has no attribute 'regspec'
+@unittest.skip('broken')
 class TestLDSTCompUnit(unittest.TestCase):
 
     def test_ldst_compunit(self):
index cb8bff00ea12f57b53bb39221724986be646a6e9..f3a3421bcbe76a1a018fe313612b3a9a5f04932f 100644 (file)
@@ -27,7 +27,9 @@ from nmutil.util import Display
 
 from soc.config.loadstore import ConfigMemoryPortInterface
 from soc.experiment.test import pagetables
-from soc.experiment.test.test_wishbone import wb_get
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+
 
 ########################################
 
@@ -143,7 +145,7 @@ def ldst_sim(dut):
     assert(ld_data==data)
     print("dzbz test passed")
 
-    dut.stop = True # stop simulation
+    wbget.stop = True # stop simulation
 
 ########################################
 class TestLDSTCompUnitMMU(LDSTCompUnit):
@@ -194,10 +196,10 @@ def test_scoreboard_mmu():
     sim.add_clock(1e-6)
 
     dut.mem = pagetables.test1
-    dut.stop = False
+    wbget.stop = False
 
     sim.add_sync_process(wrap(ldst_sim(dut)))
-    sim.add_sync_process(wrap(wb_get(dut)))
+    sim.add_sync_process(wrap(wb_get(dut.cmpi.wb_bus(), dut.mem)))
     with sim.write_vcd('test_scoreboard_mmu.vcd'):
         sim.run()
 
@@ -252,10 +254,10 @@ def test_scoreboard_regspec_mmu():
     sim.add_clock(1e-6)
 
     dut.mem = pagetables.test1
-    dut.stop = False
+    wbget.stop = False
 
     sim.add_sync_process(wrap(ldst_sim(dut)))
-    sim.add_sync_process(wrap(wb_get(dut)))
+    sim.add_sync_process(wrap(wb_get(dut.cmpi.wb_bus(), dut.mem)))
     with sim.write_vcd('test_scoreboard_regspec_mmu.vcd'):
         sim.run()
 
index 3e4180913f01928c9dbda83367f34ab4ac6ba1b3..81d21c180962966a147b8d38b957cb5804d99eac 100644 (file)
@@ -29,17 +29,18 @@ from soc.config.loadstore import ConfigMemoryPortInterface
 from soc.experiment.test import pagetables
 from soc.experiment.test.test_wishbone import wb_get
 
-#new unit added to this test case
+# new unit added to this test case
 from soc.fu.mmu.pipe_data import MMUPipeSpec
 from soc.fu.mmu.fsm import FSMMMUStage
 
-#for sending instructions to the FSM
+# for sending instructions to the FSM
 from openpower.consts import MSR
 from openpower.decoder.power_fields import DecodeFields
 from openpower.decoder.power_fieldsn import SignalBitRange
 from openpower.decoder.power_decoder2 import decode_spr_num
 from openpower.decoder.power_enums import MicrOp
 
+
 def test_TLBIE(dut):
     yield dut.fsm.p.i_data.ctx.op.eq(MicrOp.OP_TLBIE)
     yield dut.fsm.p.valid_i.eq(1)
@@ -51,21 +52,21 @@ def test_TLBIE(dut):
     yield
     yield Display("OP_TLBIE test done")
 
+
 def ldst_sim(dut):
-    yield dut.mmu.rin.prtbl.eq(0x1000000) # set process table
+    yield dut.mmu.rin.prtbl.eq(0x1000000)  # set process table
     addr = 0x100e0
-    data = 0xFF #just a single byte for this test
+    data = 0xFF  # just a single byte for this test
     #data = 0xf553b658ba7e1f51
 
     yield from store(dut, addr, 0, data, 0)
     yield
     ld_data, data_ok, ld_addr = yield from load(dut, addr, 0, 0)
-    print(data,data_ok,ld_addr)
-    assert(ld_data==data)
+    print(data, data_ok, ld_addr)
+    assert(ld_data == data)
     yield
     yield from test_TLBIE(dut)
 
-
     """
     -- not testing dzbz here --
     data = 0
@@ -81,7 +82,7 @@ def ldst_sim(dut):
     print("dzbz test passed")
     """
 
-    dut.stop = True # stop simulation
+    dut.stop = True  # stop simulation
 
 ########################################
 
@@ -112,7 +113,7 @@ def test_scoreboard_mmu():
                          reg_wid=64,
                          units=units)
 
-    dut = TestLDSTCompUnit(16,pspec)
+    dut = TestLDSTCompUnit(16, pspec)
     vl = rtlil.convertMMUFSM(dut, ports=dut.ports())
     with open("test_ldst_comp_mmu1.il", "w") as f:
         f.write(vl)
@@ -120,6 +121,8 @@ def test_scoreboard_mmu():
     run_simulation(dut, ldst_sim(dut), vcd_name='test_ldst_comp.vcd')
 
 ########################################
+
+
 class TestLDSTCompUnitRegSpecMMUFSM(LDSTCompUnit):
 
     def __init__(self, pspec):
@@ -136,7 +139,7 @@ class TestLDSTCompUnitRegSpecMMUFSM(LDSTCompUnit):
 
         self.mmu = MMU()
 
-        pipe_spec = MMUPipeSpec(id_wid=2)
+        pipe_spec = MMUPipeSpec(id_wid=2, parent_pspec=None)
         self.fsm = FSMMMUStage(pipe_spec)
 
         self.fsm.set_ldst_interface(ldst)
@@ -154,11 +157,12 @@ class TestLDSTCompUnitRegSpecMMUFSM(LDSTCompUnit):
         # link mmu and dcache together
         dcache = self.l0.dcache
         mmu = self.mmu
-        m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
-        m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+        m.d.comb += dcache.m_in.eq(mmu.d_out)  # MMUToDCacheType
+        m.d.comb += mmu.d_in.eq(dcache.m_out)  # DCacheToMMUType
 
         return m
 
+
 def test_scoreboard_regspec_mmufsm():
 
     m = Module()
@@ -181,7 +185,7 @@ def test_scoreboard_regspec_mmufsm():
     dut.mem = pagetables.test1
     dut.stop = False
 
-    sim.add_sync_process(wrap(ldst_sim(dut))) # rename ?
+    sim.add_sync_process(wrap(ldst_sim(dut)))  # rename ?
     sim.add_sync_process(wrap(wb_get(dut)))
     with sim.write_vcd('test_scoreboard_regspec_mmufsm.vcd'):
         sim.run()
@@ -189,4 +193,4 @@ def test_scoreboard_regspec_mmufsm():
 
 if __name__ == '__main__':
     test_scoreboard_regspec_mmufsm()
-    #only one test for now -- test_scoreboard_mmu()
+    # only one test for now -- test_scoreboard_mmu()
index 3212bad649ecaac4560e3b7cfa11461bdfae6d53..3b795ef7c463e96b4f4ad85d51e97ded47fa344c 100644 (file)
@@ -255,15 +255,15 @@ def tst_dcache(mem, test_fn, test_name):
     m.submodules.dcache = dut
     m.submodules.sram = sram
 
-    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
-    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
-    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
-    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
-    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
-    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
+    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+    m.d.comb += sram.bus.we.eq(dut.bus.we)
+    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
 
-    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
-    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
 
     dcache_write_gtkw(test_name)
 
@@ -286,6 +286,7 @@ def dcache_write_gtkw(test_name):
         ('d_out', [
             'd_out_valid', 'd_out_data[63:0]'
         ]),
+        # XXX TODO, update to standard wishbone Signals (single "bus" Interface)
         ('wb_out', [
             'wb_out_cyc', 'wb_out_stb', 'wb_out_we',
             'wb_out_adr[31:0]', 'wb_out_sel[7:0]', 'wb_out_dat[63:0]'
index 835f4b270443fc6b7a2dbafd5fe59e61a59b81b7..5fa10c0ffc4c56783c9ef996ef830274e3105311 100644 (file)
@@ -286,15 +286,15 @@ def tst_dcache(mem, test_fn, test_name):
     m.submodules.dcache = dut
     m.submodules.sram = sram
 
-    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
-    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
-    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
-    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
-    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
-    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
-
-    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
-    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+    m.d.comb += sram.bus.we.eq(dut.bus.we)
+    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
+
+    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
 
     dcache_write_gtkw(test_name)
 
index b3e531d98133749f9462baff963855f38cfe4ede..f4717fda6dcbb2b6880759ab4b1dac100ef44602 100644 (file)
@@ -20,61 +20,16 @@ from soc.experiment.mmu import MMU
 from soc.experiment.test import pagetables
 
 from nmigen.compat.sim import run_simulation
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+from openpower.decoder.power_enums import MSRSpec
 
+wbget.stop = False
 
 
-stop = False
-
-def wb_get(wb, mem):
-    """simulator process for getting memory load requests
-    """
-
-    global stop
-    assert(stop==False)
-
-    while not stop:
-        while True: # wait for dc_valid
-            if stop:
-                return
-            cyc = yield (wb.cyc)
-            stb = yield (wb.stb)
-            if cyc and stb:
-                break
-            yield
-        addr = (yield wb.adr) << 3
-        if addr not in mem:
-            print ("    WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
-        # read or write?
-        we = (yield wb.we)
-        if we:
-            store = (yield wb.dat_w)
-            sel = (yield wb.sel)
-            data = mem.get(addr, 0)
-            # note we assume 8-bit sel, here
-            res = 0
-            for i in range(8):
-                mask = 0xff << (i*8)
-                if sel & (1<<i):
-                    res |= store & mask
-                else:
-                    res |= data & mask
-            mem[addr] = res
-            print ("    DCACHE set %x mask %x data %x" % (addr, sel, res))
-        else:
-            data = mem.get(addr, 0)
-            yield wb.dat_r.eq(data)
-            print ("    DCACHE get %x data %x" % (addr, data))
-
-        yield wb.ack.eq(1)
-        yield
-        yield wb.ack.eq(0)
-        yield
-
 def setup_mmu():
 
-    global stop
-    stop = False
+    wbget.stop = False
 
     pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
                          imem_ifacetype='',
@@ -92,7 +47,6 @@ def setup_mmu():
 
     l_in, l_out = mmu.l_in, mmu.l_out
     d_in, d_out = dcache.d_in, dcache.d_out
-    wb_out, wb_in = dcache.wb_out, dcache.wb_in
 
     # link mmu and dcache together
     m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
@@ -109,8 +63,7 @@ def setup_mmu():
 def _test_dcbz_addr_100e0(dut, mem):
     mmu = dut.submodules.mmu
     pi = dut.submodules.ldst.pi
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
@@ -118,26 +71,28 @@ def _test_dcbz_addr_100e0(dut, mem):
     addr = 0x100e0
     data = 0xf553b658ba7e1f51
 
-    yield from pi_st(pi, addr, data, 8, msr_pr=0)
+    msr = MSRSpec(pr=1, dr=0, sf=1) # 64 bit by default
+
+    yield from pi_st(pi, addr, data, 8, msr)
     yield
 
-    ld_data = yield from pi_ld(pi, addr, 8, msr_pr=0)
+    ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr)
     assert ld_data == 0xf553b658ba7e1f51
-    ld_data = yield from pi_ld(pi, addr, 8, msr_pr=0)
+    ld_data, _, _  = yield from pi_ld(pi, addr, 8, msr)
     assert ld_data == 0xf553b658ba7e1f51
 
     print("do_dcbz ===============")
-    yield from pi_st(pi, addr, data, 8, msr_pr=0, is_dcbz=1)
+    yield from pi_st(pi, addr, data, 8, msr, is_dcbz=1)
     print("done_dcbz ===============")
     yield
 
-    ld_data = yield from pi_ld(pi, addr, 8, msr_pr=0)
+    ld_data, _, _  = yield from pi_ld(pi, addr, 8, msr)
     print("ld_data after dcbz")
     print(ld_data)
     assert ld_data == 0
 
     yield
-    stop = True
+    wbget.stop = True
 
 def test_dcbz_addr_100e0():
 
index 5ba926847c771f4a59801c010c6d498e16e845d2..c331a7b5e5958238a78d5c28f5fdcab873aa351d 100644 (file)
@@ -25,10 +25,10 @@ class TestCachedMemoryPortInterface(PortInterfaceBase):
         super().__init__(regwid, addrwid)
         self.ldst = LDSTSplitter(32, 48, 4)
 
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz):
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
         m.d.comb += self.ldst.addr_i.eq(addr)
 
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
         m.d.comb += self.ldst.addr_i.eq(addr)
 
     def set_wr_data(self, m, data, wen):
index 7a098b6e244593a0734b31ad550b2ee52acd1a7f..003edf1264566ac27528a55df97b88030f976d38 100644 (file)
@@ -10,6 +10,8 @@ from nmigen.cli import rtlil
 from nmutil.mask import Mask, masked
 from nmutil.util import Display
 from random import randint, seed
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
 
 if True:
     from nmigen.back.pysim import Simulator, Delay, Settle
@@ -25,9 +27,13 @@ from soc.fu.ldst.loadstore import LoadStore1
 from soc.experiment.mmu import MMU
 
 from nmigen.compat.sim import run_simulation
+from openpower.decoder.power_enums import MSRSpec
 
 
-stop = False
+msr_default = MSRSpec(pr=1, dr=0, sf=1) # 64 bit by default
+
+
+wbget.stop = False
 
 def b(x): # byte-reverse function
     return int.from_bytes(x.to_bytes(8, byteorder='little'),
@@ -38,63 +44,16 @@ def b(x): # byte-reverse function
 #    for cell in mem:
 #        f.write(str(hex(cell))+"="+str(hex(mem[cell]))+"\n")
 
-def wb_get(wb, mem):
-    """simulator process for getting memory load requests
-    """
-
-    global stop
-    assert(stop==False)
-
-    while not stop:
-        while True: # wait for dc_valid
-            if stop:
-                return
-            cyc = yield (wb.cyc)
-            stb = yield (wb.stb)
-            if cyc and stb:
-                break
-            yield
-        addr = (yield wb.adr) << 3
-        if addr not in mem:
-            print ("    WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
-        # read or write?
-        we = (yield wb.we)
-        if we:
-            store = (yield wb.dat_w)
-            sel = (yield wb.sel)
-            data = mem.get(addr, 0)
-            # note we assume 8-bit sel, here
-            res = 0
-            for i in range(8):
-                mask = 0xff << (i*8)
-                if sel & (1<<i):
-                    res |= store & mask
-                else:
-                    res |= data & mask
-            mem[addr] = res
-            print ("    DCACHE set %x mask %x data %x" % (addr, sel, res))
-        else:
-            data = mem.get(addr, 0)
-            yield wb.dat_r.eq(data)
-            print ("    DCACHE get %x data %x" % (addr, data))
-
-        yield wb.ack.eq(1)
-        yield
-        yield wb.ack.eq(0)
-        yield
-
 
 def mmu_lookup(dut, addr):
     mmu = dut.submodules.mmu
-    global stop
 
     print("pi_ld", hex(addr))
-    data = yield from pi_ld(dut.submodules.ldst.pi, addr, 4, msr_pr=1)
+    data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, addr, 4, msr=msr_default)
     print("pi_ld done, data", hex(data))
     """
     # original test code kept for reference
-    while not stop: # wait for dc_valid / err
+    while not wbget.stop: # wait for dc_valid / err
         print("waiting for mmu")
         l_done = yield (mmu.l_out.done)
         l_err = yield (mmu.l_out.err)
@@ -123,7 +82,6 @@ def mmu_lookup(dut, addr):
 
 def ldst_sim(dut):
     mmu = dut.submodules.mmu
-    global stop
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
@@ -149,7 +107,7 @@ def ldst_sim(dut):
     data = yield from mmu_lookup(dut, addr+8)
     assert data == 0xf001a5a5
 
-    yield from pi_st(dut.submodules.ldst.pi, addr+4, 0x10015a5a, 4, msr_pr=1)
+    yield from pi_st(dut.submodules.ldst.pi, addr+4, 0x10015a5a, 4, msr=msr_default)
 
     data = yield from mmu_lookup(dut, addr+4)
     assert data == 0x10015a5a
@@ -157,12 +115,11 @@ def ldst_sim(dut):
     yield
     yield
 
-    stop = True
+    wbget.stop = True
 
 def setup_mmu():
 
-    global stop
-    stop = False
+    wbget.stop = False
 
     pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
                          imem_ifacetype='',
@@ -180,7 +137,6 @@ def setup_mmu():
 
     l_in, l_out = mmu.l_in, mmu.l_out
     d_in, d_out = dcache.d_in, dcache.d_out
-    wb_out, wb_in = dcache.wb_out, dcache.wb_in
 
     # link mmu and dcache together
     m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
@@ -234,17 +190,16 @@ def test_mmu():
 
 def ldst_sim_misalign(dut):
     mmu = dut.submodules.mmu
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
-    data = yield from pi_ld(dut.submodules.ldst.pi, 0x1007, 8, msr_pr=1)
-    print ("misalign ld data", hex(data))
+    data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, 0x1007, 8, msr_default)
+    print ("misalign ld data", data)
 
     yield
-    stop = True
+    wbget.stop = True
 
 
 def test_misalign_mmu():
@@ -288,39 +243,37 @@ def test_misalign_mmu():
 
 def ldst_sim_radixmiss(dut):
     mmu = dut.submodules.mmu
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(1<<40) # set process table
     yield
 
-    data = yield from pi_ld(dut.submodules.ldst.pi, 0x10000000, 8, msr_pr=1)
-    print ("radixmiss ld data", hex(data))
+    data, _, _ = yield from pi_ld(dut.submodules.ldst.pi,
+                                  0x10000000, 8, msr=msr_default)
+    print ("radixmiss ld data", data)
 
     yield
-    stop = True
+    wbget.stop = True
 
 def ldst_sim_dcache_regression(dut):
     mmu = dut.submodules.mmu
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
     addr = 0x10000
-    data = yield from pi_ld(dut.submodules.ldst.pi, addr, 8, msr_pr=1)
-    print ("=== dcache_regression ld data", hex(data))
+    data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, addr, 8, msr=msr_default)
+    print ("=== dcache_regression ld data", data)
     assert(data == 0xdeadbeef01234567)
 
     yield
-    stop = True
+    wbget.stop = True
 
 def ldst_sim_dcache_random(dut):
     mmu = dut.submodules.mmu
     pi = dut.submodules.ldst.pi
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
@@ -333,23 +286,22 @@ def ldst_sim_dcache_random(dut):
         addr *= 8
         addr += 0x10000
 
-        yield from pi_st(pi, addr, data, 8, msr_pr=1)
+        yield from pi_st(pi, addr, data, 8, msr=msr_default)
         yield
 
-        ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
+        ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
 
         eq = (data==ld_data)
         print ("dcache_random values", hex(addr), hex(data), hex(ld_data), eq)
         assert(data==ld_data)   ## investigate why this fails -- really seldom
 
     yield
-    stop = True
+    wbget.stop = True
 
 def ldst_sim_dcache_first(dut): # this test is likely to fail
     mmu = dut.submodules.mmu
     pi = dut.submodules.ldst.pi
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
@@ -359,10 +311,10 @@ def ldst_sim_dcache_first(dut): # this test is likely to fail
     data = 0x8c5a3e460d71f0b4
 
     # known to fail without bugfix in src/soc/fu/ldst/loadstore.py
-    yield from pi_st(pi, addr, data, 8, msr_pr=1)
+    yield from pi_st(pi, addr, data, 8, msr=msr_default)
     yield
 
-    ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
+    ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
 
     print ("addr",addr)
     print ("dcache_first ld data", hex(data), hex(ld_data))
@@ -370,7 +322,7 @@ def ldst_sim_dcache_first(dut): # this test is likely to fail
     assert(data==ld_data)
 
     yield
-    stop = True
+    wbget.stop = True
 
 def test_radixmiss_mmu():
 
@@ -483,8 +435,7 @@ def test_dcache_random():
 def ldst_sim_dcache_random2(dut, mem):
     mmu = dut.submodules.mmu
     pi = dut.submodules.ldst.pi
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
@@ -518,7 +469,7 @@ def ldst_sim_dcache_random2(dut, mem):
             print("before_pi_st")
             yield
 
-        yield from pi_st(pi, addr, data, 8, msr_pr=1)
+        yield from pi_st(pi, addr, data, 8, msr=msr_default)
         yield
 
         for i in range(0,c2):
@@ -526,7 +477,7 @@ def ldst_sim_dcache_random2(dut, mem):
             yield
 
         print("== read: wb_get")
-        ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
+        ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
 
         #dumpmem(mem,"/tmp/dumpmem"+str(c)+".txt")
         #c += 1
@@ -536,7 +487,7 @@ def ldst_sim_dcache_random2(dut, mem):
         assert(data==ld_data)   ## investigate why this fails -- really seldom
 
     yield
-    stop = True
+    wbget.stop = True
 
 def test_dcache_random2():
 
index df679977dd54728fbc68e7e26840e707535b4318..6090710da470a60893f2075d37540f56fc18cc86 100644 (file)
@@ -24,59 +24,19 @@ from soc.fu.ldst.loadstore import LoadStore1
 from soc.experiment.mmu import MMU
 
 from nmigen.compat.sim import run_simulation
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+from openpower.decoder.power_enums import MSRSpec
 
+msr_default = MSRSpec(pr=0, dr=0, sf=1) # 64 bit by default
 
-stop = False
+
+wbget.stop = False
 
 def b(x): # byte-reverse function
     return int.from_bytes(x.to_bytes(8, byteorder='little'),
                           byteorder='big', signed=False)
 
-def wb_get(wb, mem):
-    """simulator process for getting memory load requests
-    """
-
-    global stop
-
-    while not stop:
-        while True: # wait for dc_valid
-            if stop:
-                return
-            cyc = yield (wb.cyc)
-            stb = yield (wb.stb)
-            if cyc and stb:
-                break
-            yield
-        addr = (yield wb.adr) << 3
-        if addr not in mem:
-            print ("    WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
-        # read or write?
-        we = (yield wb.we)
-        if we:
-            store = (yield wb.dat_w)
-            sel = (yield wb.sel)
-            data = mem.get(addr, 0)
-            # note we assume 8-bit sel, here
-            res = 0
-            for i in range(8):
-                mask = 0xff << (i*8)
-                if sel & (1<<i):
-                    res |= store & mask
-                else:
-                    res |= data & mask
-            mem[addr] = res
-            print ("    DCACHE set %x mask %x data %x" % (addr, sel, res))
-        else:
-            data = mem.get(addr, 0)
-            yield wb.dat_r.eq(data)
-            print ("    DCACHE get %x data %x" % (addr, data))
-
-        yield wb.ack.eq(1)
-        yield
-        yield wb.ack.eq(0)
-        yield
-
 
 def setup_mmu():
 
@@ -96,7 +56,6 @@ def setup_mmu():
 
     l_in, l_out = mmu.l_in, mmu.l_out
     d_in, d_out = dcache.d_in, dcache.d_out
-    wb_out, wb_in = dcache.wb_out, dcache.wb_in
 
     # link mmu and dcache together
     m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
@@ -112,13 +71,66 @@ def setup_mmu():
 
 def ldst_sim_misalign(dut):
     mmu = dut.submodules.mmu
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
-    data = yield from pi_ld(dut.submodules.ldst.pi, 0x1000, 4, msr_pr=1)
+    # load 8 bytes at aligned address
+    align_addr = 0x1000
+    data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+                                          align_addr, 8, msr=msr_default)
+    print ("ldst_sim_misalign (aligned)", hex(data), exctype, exc)
+    assert data == 0xdeadbeef01234567
+
+    # load 4 bytes at aligned address
+    align_addr = 0x1004
+    data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+                                          align_addr, 4, msr=msr_default)
+    print ("ldst_sim_misalign (aligned)", hex(data), exctype, exc)
+    assert data == 0xdeadbeef
+
+    # load 8 bytes at *mis*-aligned address which is still within
+    # the page
+    misalign_addr = 0x1004
+    data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+                                          misalign_addr, 8, msr=msr_default)
+
+    print ("ldst_sim_misalign", hex(data), exctype, exc)
+    assert data == 0xf001a5a5deadbeef
+
+    # load 8 bytes at *mis*-aligned address which is still within
+    # the page
+    misalign_addr = 0x1006
+    data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+                                          misalign_addr, 8, msr=msr_default)
+
+    print ("ldst_sim_misalign", hex(data), exctype, exc)
+    assert data == 0xf00ff001a5a5dead
+    wbget.stop = True
+    return
+
+    # load 8 bytes at *mis*-aligned address which is NOT within
+    # the page - TODO - work this out
+    misalign_addr = 0x10000004
+    data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+                                          misalign_addr, 8, msr=msr_default)
+    print ("ldst_sim_misalign", data, exctype, exc)
+    yield
+    dar = yield dut.submodules.ldst.dar
+    print ("DAR", hex(dar))
+    assert dar == misalign_addr
+    # check exception bits
+    assert exc.happened
+    assert exc.alignment
+    assert not exc.segment_fault
+    assert not exc.instr_fault
+    assert not exc.invalid
+    assert not exc.perm_error
+    assert not exc.rc_error
+    assert not exc.badtree
+
+    wbget.stop = True
 
 
 def test_misalign_mmu():
index 00bec7f55b490727bbe82ffc1e7b8eff231fb2f2..e79e0c127c22ef55947363498624f70b67d66806 100644 (file)
@@ -1,4 +1,5 @@
-from nmigen import (C, Module, Signal, Elaboratable, Mux, Cat, Repl, Signal)
+from nmigen import (C, Module, Signal, Elaboratable, Mux, Cat, Repl, Signal,
+                    Const)
 from nmigen.cli import main
 from nmigen.cli import rtlil
 from nmutil.mask import Mask, masked
@@ -7,7 +8,8 @@ from random import randint, seed
 from nmigen.sim import Simulator, Delay, Settle
 from nmutil.util import wrap
 
-from soc.config.test.test_pi2ls import pi_ld, pi_st, pi_ldst, wait_busy
+from soc.config.test.test_pi2ls import (pi_ld, pi_st, pi_ldst, wait_busy,
+                                        get_exception_info)
 #from soc.config.test.test_pi2ls import pi_st_debug
 from soc.config.test.test_loadstore import TestMemPspec
 from soc.config.loadstore import ConfigMemoryPortInterface
@@ -18,59 +20,17 @@ from soc.experiment.test import pagetables
 
 from nmigen.compat.sim import run_simulation
 from random import random
+from openpower.test.wb_get import wb_get_classic
+from openpower.test import wb_get as wbget
+from openpower.exceptions import LDSTExceptionTuple
 
-stop = False
+from soc.config.test.test_fetch import read_from_addr
+from openpower.decoder.power_enums import MSRSpec
 
-def wb_get(wb, mem):
-    """simulator process for getting memory load requests
-    """
-
-    global stop
-    assert (stop==False)
-
-    while not stop:
-        while True: # wait for dc_valid
-            if stop:
-                return
-            cyc = yield (wb.cyc)
-            stb = yield (wb.stb)
-            if cyc and stb:
-                break
-            yield
-        addr = (yield wb.adr) << 3
-        if addr not in mem:
-            print ("    WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
-        # read or write?
-        we = (yield wb.we)
-        if we:
-            store = (yield wb.dat_w)
-            sel = (yield wb.sel)
-            data = mem.get(addr, 0)
-            # note we assume 8-bit sel, here
-            res = 0
-            for i in range(8):
-                mask = 0xff << (i*8)
-                if sel & (1<<i):
-                    res |= store & mask
-                else:
-                    res |= data & mask
-            mem[addr] = res
-            print ("    DCACHE set %x mask %x data %x" % (addr, sel, res))
-        else:
-            data = mem.get(addr, 0)
-            yield wb.dat_r.eq(data)
-            print ("    DCACHE get %x data %x" % (addr, data))
-
-        yield wb.ack.eq(1)
-        yield
-        yield wb.ack.eq(0)
-        yield
 
 def setup_mmu():
 
-    global stop
-    stop = False
+    wbget.stop = False
 
     pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
                          imem_ifacetype='',
@@ -85,35 +45,444 @@ def setup_mmu():
     m.submodules.ldst = ldst = cmpi.pi
     m.submodules.mmu = mmu = MMU()
     dcache = ldst.dcache
+    icache = ldst.icache
 
     l_in, l_out = mmu.l_in, mmu.l_out
     d_in, d_out = dcache.d_in, dcache.d_out
-    wb_out, wb_in = dcache.wb_out, dcache.wb_in
+    i_in, i_out = icache.i_in, icache.i_out # FetchToICache, ICacheToDecode
 
-    # link mmu and dcache together
+    # link mmu, dcache and icache together
     m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+    m.d.comb += icache.m_in.eq(mmu.i_out) # MMUToICacheType
     m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
 
     # link ldst and MMU together
     comb += l_in.eq(ldst.m_out)
     comb += ldst.m_in.eq(l_out)
 
+    # add a debug status Signal: use "msg.str = "blah"
+    # then toggle with yield msg.eq(0); yield msg.eq(1)
+    debug_status = Signal(8, decoder=lambda _ : debug_status.str)
+    m.debug_status = debug_status
+    debug_status.str = ''
+
     return m, cmpi
 
+
+def icache_read(dut,addr,priv,virt):
+
+    icache = dut.submodules.ldst.icache
+    i_in = icache.i_in
+    i_out  = icache.i_out
+
+    yield i_in.priv_mode.eq(priv)
+    yield i_in.virt_mode.eq(virt)
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(addr)
+    yield i_in.stop_mark.eq(0)
+
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(addr)
+    yield
+    valid = yield i_out.valid
+    failed = yield i_out.fetch_failed
+    while not valid and not failed:
+        yield
+        valid = yield i_out.valid
+        failed = yield i_out.fetch_failed
+    yield i_in.req.eq(0)
+
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
+    yield
+    yield
+
+    return nia, insn, valid, failed
+
+
 test_exceptions = True
 test_dcbz = True
 test_random = True
 
+
+def debug(dut, msg):
+    print ("set debug message", msg)
+    dut.debug_status.str = msg # set the message
+    yield dut.debug_status.eq(0) # trigger an update
+    yield dut.debug_status.eq(1)
+
+
+def _test_loadstore1_ifetch_iface(dut, mem):
+    """test_loadstore1_ifetch_iface
+
+    read in priv mode, non-virtual.  tests the FetchUnitInterface
+
+    """
+
+    mmu = dut.submodules.mmu
+    ldst = dut.submodules.ldst
+    pi = ldst.pi
+    icache = dut.submodules.ldst.icache
+    wbget.stop = False
+
+    print("=== test loadstore instruction (real) ===")
+
+    i_in = icache.i_in
+    i_out  = icache.i_out
+    i_m_in = icache.m_in
+
+    yield from debug(dut, "real mem instruction")
+    # set address to 0x8, update mem[0x8] to 01234 | 0x5678<<32
+    # (have to do 64-bit writes into the dictionary-memory-emulated-thing)
+    addr = 8
+    addr2 = 12
+    expected_insn2 = 0x5678
+    expected_insn = 0x1234
+    mem[addr] = expected_insn | expected_insn2<<32
+
+    yield i_in.priv_mode.eq(1)
+    insn = yield from read_from_addr(icache, addr, stall=False)
+
+    nia   = yield i_out.nia  # NO, must use FetchUnitInterface
+    print ("fetched %x from addr %x" % (insn, nia))
+    assert insn == expected_insn
+
+    print("=== test loadstore instruction (2nd, real) ===")
+    yield from debug(dut, "real mem 2nd (addr 0xc)")
+
+    insn2 = yield from read_from_addr(icache, addr2, stall=False)
+
+    nia   = yield i_out.nia  # NO, must use FetchUnitInterface
+    print ("fetched %x from addr2 %x" % (insn2, nia))
+    assert insn2 == expected_insn2
+
+    print("=== test loadstore instruction (done) ===")
+
+    yield from debug(dut, "test done")
+    yield
+    yield
+
+    print ("fetched %x from addr %x" % (insn, nia))
+    assert insn == expected_insn
+
+    wbget.stop = True
+
+
+def write_mem2(mem, addr, i1, i2):
+    mem[addr] = i1 | i2<<32
+
+
+#TODO: use fetch interface here
+def lookup_virt(dut,addr):
+    icache = dut.submodules.ldst.icache
+    i_in = icache.i_in
+    i_out  = icache.i_out
+    yield i_in.priv_mode.eq(0)
+    yield i_in.virt_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.stop_mark.eq(0)
+
+    yield icache.a_i_valid.eq(1)
+    yield icache.a_pc_i.eq(addr)
+    yield
+    valid = yield i_out.valid
+    failed = yield i_out.fetch_failed
+    while not valid and not failed:
+        yield
+        valid = yield i_out.valid
+        failed = yield i_out.fetch_failed
+    yield icache.a_i_valid.eq(0)
+
+    return valid,failed
+
+
+def mmu_lookup(dut,addr):
+    ldst = dut.submodules.ldst
+    pi = ldst.pi
+    yield from debug(dut, "instr fault "+hex(addr))
+    yield ldst.priv_mode.eq(0)
+    yield ldst.instr_fault.eq(1)
+    yield ldst.maddr.eq(addr)
+    yield
+    yield ldst.instr_fault.eq(0)
+    while True:
+        done = yield (ldst.done)
+        exc_info = yield from get_exception_info(pi.exc_o)
+        if done or exc_info.happened:
+            break
+        yield
+    yield
+    assert exc_info.happened == 0 # assert just before doing the fault set zero
+    yield ldst.instr_fault.eq(0)
+    yield from debug(dut, "instr fault done "+hex(addr))
+    yield
+    yield
+    yield
+
+
+def _test_loadstore1_ifetch_multi(dut, mem):
+    mmu = dut.submodules.mmu
+    ldst = dut.submodules.ldst
+    pi = ldst.pi
+    icache = dut.submodules.ldst.icache
+    assert wbget.stop == False
+
+    print ("set process table")
+    yield from debug(dut, "set prtble")
+    yield mmu.rin.prtbl.eq(0x1000000) # set process table
+    yield
+
+    i_in = icache.i_in
+    i_out  = icache.i_out
+    i_m_in = icache.m_in
+
+    # fetch instructions from multiple addresses
+    # should cope with some addresses being invalid
+    real_addrs = [0,4,8,0,8,4,0,0,12]
+    write_mem2(mem,0,0xF0,0xF4)
+    write_mem2(mem,8,0xF8,0xFC)
+
+    yield i_in.priv_mode.eq(1)
+    for addr in real_addrs:
+        yield from debug(dut, "real_addr "+hex(addr))
+        insn = yield from read_from_addr(icache, addr, stall=False)
+        nia   = yield i_out.nia  # NO, must use FetchUnitInterface
+        print ("TEST_MULTI: fetched %x from addr %x == %x" % (insn, nia,addr))
+        assert insn==0xF0+addr
+
+    # now with virtual memory enabled
+    yield i_in.virt_mode.eq(1)
+
+    virt_addrs = [0x10200,0x10204,0x10208,0x10200,
+                  0x102008,0x10204,0x10200,0x10200,0x10200C]
+
+    write_mem2(mem,0x10200,0xF8,0xFC)
+
+    for addr in virt_addrs:
+        yield from debug(dut, "virt_addr "+hex(addr))
+
+        valid, failed = yield from lookup_virt(dut,addr)
+        yield
+        print("TEST_MULTI: failed=",failed) # this is reported wrong
+        if failed==1: # test one first
+            yield from mmu_lookup(dut,addr)
+            valid, failed = yield from lookup_virt(dut,addr)
+            assert(valid==1)
+
+    wbget.stop = True
+
+
+def _test_loadstore1_ifetch(dut, mem):
+    """test_loadstore1_ifetch
+
+    this is quite a complex multi-step test.
+
+    * first (just because, as a demo) read in priv mode, non-virtual.
+      just like in experiment/icache.py itself.
+
+    * second, using the (usual) PTE for these things (which came originally
+      from gem5-experimental experiment/radix_walk_example.txt) do a
+      virtual-memory read through the *instruction* cache.
+      this is expected to FAIL
+
+    * third: mess about with the MMU, setting "iside" (instruction-side),
+      requesting an MMU RADIX LOOKUP.  this triggers an itlb_load
+      (instruction-cache TLB entry-insertion)
+
+    * fourth and finally: retry the read of the instruction through i-cache.
+      this is now expected to SUCCEED
+
+    a lot going on.
+    """
+
+    mmu = dut.submodules.mmu
+    ldst = dut.submodules.ldst
+    pi = ldst.pi
+    icache = dut.submodules.ldst.icache
+    wbget.stop = False
+
+    print("=== test loadstore instruction (real) ===")
+
+    i_in = icache.i_in
+    i_out  = icache.i_out
+    i_m_in = icache.m_in
+
+    # first virtual memory test
+
+    print ("set process table")
+    yield from debug(dut, "set prtble")
+    yield mmu.rin.prtbl.eq(0x1000000) # set process table
+    yield
+
+    yield from debug(dut, "real mem instruction")
+    # set address to zero, update mem[0] to 01234
+    addr = 8
+    expected_insn = 0x1234
+    mem[addr] = expected_insn
+
+    yield i_in.priv_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(addr)
+    yield i_in.stop_mark.eq(0)
+    yield i_m_in.tlbld.eq(0)
+    yield i_m_in.tlbie.eq(0)
+    yield i_m_in.addr.eq(0)
+    yield i_m_in.pte.eq(0)
+    yield
+    yield
+    yield
+
+    # miss, stalls for a bit -- this one is different here
+    ##nia, insn, valid, failed = yield from icache_read(dut,addr,0,0)
+    ##assert(valid==0)
+    ##assert(failed==1)
+
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(addr)
+    yield
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
+    yield
+    yield
+
+    print ("fetched %x from addr %x" % (insn, nia))
+    assert insn == expected_insn
+
+    print("=== test loadstore instruction (virtual) ===")
+
+    # look up i-cache expecting it to fail
+
+    yield from debug(dut, "virtual instr req")
+    # set address to 0x10200, update mem[] to 5678
+    virt_addr = 0x10200
+    real_addr = virt_addr
+    expected_insn = 0x5678
+    mem[real_addr] = expected_insn
+
+    yield i_in.priv_mode.eq(0)
+    yield i_in.virt_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(virt_addr)
+    yield i_in.stop_mark.eq(0)
+    yield i_m_in.tlbld.eq(0)
+    yield i_m_in.tlbie.eq(0)
+    yield i_m_in.addr.eq(0)
+    yield i_m_in.pte.eq(0)
+    yield
+    yield
+    yield
+
+    # miss, stalls for a bit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(virt_addr)
+    yield
+    valid = yield i_out.valid
+    failed = yield i_out.fetch_failed
+    while not valid and not failed:
+        yield
+        valid = yield i_out.valid
+        failed = yield i_out.fetch_failed
+    yield i_in.req.eq(0)
+
+    print ("failed?", "yes" if failed else "no")
+    assert failed == 1
+    yield
+    yield
+
+    print("=== test loadstore instruction (instruction fault) ===")
+
+    yield from debug(dut, "instr fault")
+
+    virt_addr = 0x10200
+
+    yield ldst.priv_mode.eq(0)
+    yield ldst.instr_fault.eq(1)
+    yield ldst.maddr.eq(virt_addr)
+    # still broken -- investigate
+    # msr = MSRSpec(pr=?, dr=?, sf=0)
+    # ld_data, exctype, exc = yield from pi_ld(pi, virt_addr, 8, msr=msr)
+    yield
+    yield ldst.instr_fault.eq(0)
+    while True:
+        done = yield (ldst.done)
+        exc_info = yield from get_exception_info(pi.exc_o)
+        if done or exc_info.happened:
+            break
+        yield
+    assert exc_info.happened == 0 # assert just before doing the fault set zero
+    yield ldst.instr_fault.eq(0)
+    yield
+    yield
+    yield
+
+    print("=== test loadstore instruction (try instruction again) ===")
+    yield from debug(dut, "instr virt retry")
+    # set address to 0x10200, update mem[] to 5678
+    virt_addr = 0x10200
+    real_addr = virt_addr
+    expected_insn = 0x5678
+
+    yield i_in.priv_mode.eq(0)
+    yield i_in.virt_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(virt_addr)
+    yield i_in.stop_mark.eq(0)
+    yield i_m_in.tlbld.eq(0)
+    yield i_m_in.tlbie.eq(0)
+    yield i_m_in.addr.eq(0)
+    yield i_m_in.pte.eq(0)
+    yield
+    yield
+    yield
+
+    # miss, stalls for a bit
+    """
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(virt_addr)
+    yield
+    valid = yield i_out.valid
+    failed = yield i_out.fetch_failed
+    while not valid and not failed:
+        yield
+        valid = yield i_out.valid
+        failed = yield i_out.fetch_failed
+    yield i_in.req.eq(0)
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
+    """
+
+    ## part 4
+    nia, insn, valid, failed = yield from icache_read(dut,virt_addr,0,1)
+
+    yield from debug(dut, "test done")
+    yield
+    yield
+
+    print ("failed?", "yes" if failed else "no")
+    assert failed == 0
+
+    print ("fetched %x from addr %x" % (insn, nia))
+    assert insn == expected_insn
+
+    wbget.stop = True
+
+
 def _test_loadstore1_invalid(dut, mem):
     mmu = dut.submodules.mmu
     pi = dut.submodules.ldst.pi
-    global stop
-    stop = False
+    wbget.stop = False
 
     print("=== test invalid ===")
 
     addr = 0
-    ld_data, exctype, exc, dar_o = yield from pi_ld(pi, addr, 8, msr_pr=1)
+    msr = MSRSpec(pr=1, dr=0, sf=0) # set problem-state
+    ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
     print("ld_data", ld_data, exctype, exc)
     assert (exctype == "slow")
     invalid = exc.invalid
@@ -121,39 +490,128 @@ def _test_loadstore1_invalid(dut, mem):
 
     print("=== test invalid done ===")
 
-    stop = True
+    wbget.stop = True
+
+
+def _test_loadstore1_microwatt_mmu_bin_test2(dut, mem):
+    mmu = dut.submodules.mmu
+    pi = dut.submodules.ldst.pi
+    ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+    wbget.stop = False
+
+    yield mmu.rin.prtbl.eq(0x12000) # set process table
+    yield mmu.rin.pid.eq(0x1)       # set PID=1
+    yield
+
+    addr = 0x124108
+    msr = MSRSpec(pr=1, dr=1, sf=1)
+
+    print("=== alignment error (ld) ===")
+
+    ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+    print("ld_data after mmu.bin test2")
+    print(ld_data)
+    assert ld_data == 0x0000000badc0ffee
+    assert exctype is None
+
+    wbget.stop = True
+
+
+def _test_loadstore1_microwatt_mmu_bin_test5(dut, mem):
+    mmu = dut.submodules.mmu
+    pi = dut.submodules.ldst.pi
+    ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+    wbget.stop = False
+
+    yield mmu.rin.prtbl.eq(0x12000) # set process table
+    yield mmu.rin.pid.eq(0x1)       # set PID=1
+    yield
+
+    addr = 0x39fffd
+    msr = MSRSpec(pr=1, dr=1, sf=1)
+
+    print("=== page-fault alignment error (ld) ===")
+
+    ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+    print("ld_data after mmu.bin test5")
+    print(ld_data)
+    print (exctype, exc)
+
+    wbget.stop = True
+
+
+def test_pi_ld_misalign(pi, addr, data_len, msr):
+    for i in range(0,data_len):
+        ld_data, exctype, exc = yield from pi_ld(pi, addr+i, data_len, msr=msr)
+        yield
+        assert exc is None # use "is None" not "== None"
+        print("MISALIGN: test_pi_ld_misalign returned",hex(ld_data))
+
+
+def test_pi_st_ld_misalign(pi, addr, data_len, msr):
+    data = 0x0102030405060708
+    for i in range(0, data_len):
+        exctype, exc = yield from pi_st(pi, addr+i, data, data_len, msr=msr)
+        print (exctype, exc)
+        assert exc is None # use "is None" not "== None"
+        ld_data, exctype, exc = yield from pi_ld(pi, addr+i, data_len, msr=msr)
+        yield
+        assert exc is None # use "is None" not "== None"
+        print("MISALIGN: test_pi_ld_misalign returned",hex(ld_data))
+        assert ld_data == data
+
+
+def _test_loadstore1_misalign(dut, mem):
+    mmu = dut.submodules.mmu
+    pi = dut.submodules.ldst.pi
+    ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+    wbget.stop = False
+
+    yield mmu.rin.prtbl.eq(0x12000) # set process table
+    yield mmu.rin.pid.eq(0x1)       # set PID=1
+    #yield
+
+    addr = 1
+    msr = MSRSpec(pr=0, dr=0, sf=1)
+
+    yield from test_pi_ld_misalign(pi,0,8,msr)
+
+    yield from test_pi_st_ld_misalign(pi,0,8,msr)
+
+    wbget.stop = True
 
 
 def _test_loadstore1(dut, mem):
     mmu = dut.submodules.mmu
     pi = dut.submodules.ldst.pi
-    global stop
-    stop = False
+    ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
     addr = 0x100e0
     data = 0xf553b658ba7e1f51
+    msr = MSRSpec(pr=0, dr=0, sf=0)
 
     if test_dcbz:
-        yield from pi_st(pi, addr, data, 8, msr_pr=1)
+        yield from pi_st(pi, addr, data, 8, msr=msr)
         yield
 
-        ld_data, exctype, exc, dar_o = yield from pi_ld(pi, addr, 8, msr_pr=1)
+        ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
         assert ld_data == 0xf553b658ba7e1f51
         assert exctype is None
 
-        ld_data, exctype, exc, dar_o  = yield from pi_ld(pi, addr, 8, msr_pr=1)
+        ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
         assert ld_data == 0xf553b658ba7e1f51
         assert exctype is None
 
         print("do_dcbz ===============")
-        yield from pi_st(pi, addr, data, 8, msr_pr=1, is_dcbz=1)
+        yield from pi_st(pi, addr, data, 8, msr=msr, is_dcbz=1)
         print("done_dcbz ===============")
         yield
 
-        ld_data, exctype, exc, dar_o  = yield from pi_ld(pi, addr, 8, msr_pr=1)
+        ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
         print("ld_data after dcbz")
         print(ld_data)
         assert ld_data == 0
@@ -162,13 +620,16 @@ def _test_loadstore1(dut, mem):
     if test_exceptions:
         print("=== alignment error (ld) ===")
         addr = 0xFF100e0FF
-        ld_data, exctype, exc, dar = yield from pi_ld(pi, addr, 8, msr_pr=1)
+        ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
         if exc:
             alignment = exc.alignment
             happened = exc.happened
+            yield # wait for dsr to update
+            dar = yield ldst.dar
         else:
             alignment = 0
             happened = 0
+            dar = 0
         assert (happened == 1)
         assert (alignment == 1)
         assert (dar == addr)
@@ -186,7 +647,7 @@ def _test_loadstore1(dut, mem):
 
         print("=== alignment error (st) ===")
         addr = 0xFF100e0FF
-        exctype, exc, dar_o = yield from pi_st(pi, addr,0, 8, msr_pr=1)
+        exctype, exc = yield from pi_st(pi, addr,0, 8, msr=msr)
         if exc:
             alignment = exc.alignment
             happened = exc.happened
@@ -205,7 +666,7 @@ def _test_loadstore1(dut, mem):
     if True:
         print("=== no alignment error (ld) ===")
         addr = 0x100e0
-        ld_data, exctype, exc, dar_o = yield from pi_ld(pi, addr, 8, msr_pr=1)
+        ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
         print("ld_data", ld_data, exctype, exc)
         if exc:
             alignment = exc.alignment
@@ -222,28 +683,219 @@ def _test_loadstore1(dut, mem):
 
         for addr in addrs:
             print("== RANDOM addr ==",hex(addr))
-            ld_data, exctype, exc, dar_o  = \
-                                yield from pi_ld(pi, addr, 8, msr_pr=1)
+            ld_data, exctype, exc  = yield from pi_ld(pi, addr, 8, msr=msr)
             print("ld_data[RANDOM]",ld_data,exc,addr)
             assert (exctype == None)
 
         for addr in addrs:
             print("== RANDOM addr ==",hex(addr))
-            exc = yield from pi_st(pi, addr,0xFF*addr, 8, msr_pr=1)
+            exc = yield from pi_st(pi, addr,0xFF*addr, 8, msr=msr)
             assert (exctype == None)
 
         # readback written data and compare
         for addr in addrs:
             print("== RANDOM addr ==",hex(addr))
-            ld_data, exctype, exc, dar_o = \
-                                yield from pi_ld(pi, addr, 8, msr_pr=1)
+            ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
             print("ld_data[RANDOM_READBACK]",ld_data,exc,addr)
             assert (exctype == None)
             assert (ld_data == 0xFF*addr)
 
         print("== RANDOM addr done ==")
 
-    stop = True
+    wbget.stop = True
+
+
+def _test_loadstore1_ifetch_invalid(dut, mem):
+    mmu = dut.submodules.mmu
+    ldst = dut.submodules.ldst
+    pi = ldst.pi
+    icache = dut.submodules.ldst.icache
+    wbget.stop = False
+
+    print("=== test loadstore instruction (invalid) ===")
+
+    i_in = icache.i_in
+    i_out  = icache.i_out
+    i_m_in = icache.m_in
+
+    # first virtual memory test
+
+    print ("set process table")
+    yield from debug(dut, "set prtbl")
+    yield mmu.rin.prtbl.eq(0x1000000) # set process table
+    yield
+
+    yield from debug(dut, "real mem instruction")
+    # set address to zero, update mem[0] to 01234
+    addr = 8
+    expected_insn = 0x1234
+    mem[addr] = expected_insn
+
+    yield i_in.priv_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(addr)
+    yield i_in.stop_mark.eq(0)
+    yield i_m_in.tlbld.eq(0)
+    yield i_m_in.tlbie.eq(0)
+    yield i_m_in.addr.eq(0)
+    yield i_m_in.pte.eq(0)
+    yield
+    yield
+    yield
+
+    # miss, stalls for a bit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(addr)
+    yield
+    valid = yield i_out.valid
+    nia   = yield i_out.nia
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
+
+    yield
+    yield
+
+    print ("fetched %x from addr %x" % (insn, nia))
+    assert insn == expected_insn
+
+    print("=== test loadstore instruction (virtual) ===")
+    yield from debug(dut, "virtual instr req")
+
+    # look up i-cache expecting it to fail
+
+    # set address to 0x10200, update mem[] to 5678
+    virt_addr = 0x10200
+    real_addr = virt_addr
+    expected_insn = 0x5678
+    mem[real_addr] = expected_insn
+
+    yield i_in.priv_mode.eq(1)
+    yield i_in.virt_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(virt_addr)
+    yield i_in.stop_mark.eq(0)
+    yield i_m_in.tlbld.eq(0)
+    yield i_m_in.tlbie.eq(0)
+    yield i_m_in.addr.eq(0)
+    yield i_m_in.pte.eq(0)
+    yield
+    yield
+    yield
+
+    # miss, stalls for a bit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(virt_addr)
+    yield
+    valid = yield i_out.valid
+    failed = yield i_out.fetch_failed
+    while not valid and not failed:
+        yield
+        valid = yield i_out.valid
+        failed = yield i_out.fetch_failed
+    yield i_in.req.eq(0)
+
+    print ("failed?", "yes" if failed else "no")
+    assert failed == 1
+    yield
+    yield
+
+    print("=== test invalid loadstore instruction (instruction fault) ===")
+
+    yield from debug(dut, "instr fault (perm err expected)")
+    virt_addr = 0x10200
+
+    yield ldst.priv_mode.eq(0)
+    yield ldst.instr_fault.eq(1)
+    yield ldst.maddr.eq(virt_addr)
+    #ld_data, exctype, exc = yield from pi_ld(pi, virt_addr, 8, msr=msr)
+    yield
+    yield ldst.instr_fault.eq(0)
+    while True:
+        done = yield (ldst.done)
+        exc_info = yield from get_exception_info(pi.exc_o)
+        if done or exc_info.happened:
+            break
+        yield
+    assert exc_info.happened == 1 # different here as expected
+
+    # TODO: work out what kind of exception occurred and check it's
+    # the right one.  we *expect* it to be a permissions error because
+    # the RPTE leaf node in pagetables.test2 is marked as "non-executable"
+    # but we also expect instr_fault to be set because it is an instruction
+    # (iside) lookup
+    print ("   MMU lookup exception type?")
+    for fname in LDSTExceptionTuple._fields:
+        print ("   fname %20s %d" % (fname, getattr(exc_info, fname)))
+
+    # ok now printed them out and visually inspected: check them with asserts
+    assert exc_info.instr_fault == 1 # instruction fault (yes!)
+    assert exc_info.perm_error == 1 # permissions (yes!)
+    assert exc_info.rc_error == 0
+    assert exc_info.alignment == 0
+    assert exc_info.invalid == 0
+    assert exc_info.segment_fault == 0
+    assert exc_info.rc_error == 0
+
+    yield from debug(dut, "test done")
+    yield ldst.instr_fault.eq(0)
+    yield
+    yield
+    yield
+
+    wbget.stop = True
+
+
+def test_loadstore1_ifetch_unit_iface():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.test1
+
+    # set this up before passing to Simulator (which calls elaborate)
+    icache = m.submodules.ldst.icache
+    icache.use_fetch_interface() # this is the function which converts
+                                 # to FetchUnitInterface. *including*
+                                 # rewiring the Wishbone Bus to ibus
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(_test_loadstore1_ifetch_iface(m, mem)))
+    # add two wb_get_classic processes onto the *same* memory dictionary.
+    # this shouuuld work.... cross-fingers...
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.ibus, mem))) # ibus not bus
+    with sim.write_vcd('test_loadstore1_ifetch_iface.vcd',
+                      traces=[m.debug_status]): # include extra debug
+        sim.run()
+
+
+def test_loadstore1_ifetch():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.test1
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    icache = m.submodules.ldst.icache
+    sim.add_sync_process(wrap(_test_loadstore1_ifetch(m, mem)))
+    # add two wb_get_classic processes onto the *same* memory dictionary.
+    # this shouuuld work.... cross-fingers...
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.bus, mem)))
+    with sim.write_vcd('test_loadstore1_ifetch.vcd',
+                      traces=[m.debug_status]): # include extra debug
+        sim.run()
+
 
 def test_loadstore1():
 
@@ -256,10 +908,64 @@ def test_loadstore1():
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(_test_loadstore1(m, mem)))
-    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
     with sim.write_vcd('test_loadstore1.vcd'):
         sim.run()
 
+
+def test_loadstore1_microwatt_mmu_bin_test2():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.microwatt_test2
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(_test_loadstore1_microwatt_mmu_bin_test2(m, mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    with sim.write_vcd('test_microwatt_mmu_test2.vcd'):
+        sim.run()
+
+
+def test_loadstore1_microwatt_mmu_bin_test5():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.microwatt_test5
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(_test_loadstore1_microwatt_mmu_bin_test5(m, mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    with sim.write_vcd('test_microwatt_mmu_test5.vcd'):
+        sim.run()
+
+
+def test_loadstore1_misalign():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.microwatt_test2
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    ###########1122334455667788
+    mem[0] = 0x0102030405060708
+    mem[8] = 0xffffffffffffffff
+
+    sim.add_sync_process(wrap(_test_loadstore1_misalign(m, mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    with sim.write_vcd('test_loadstore1_misalign.vcd'):
+        sim.run()
+    print ("mem", mem)
+
+
 def test_loadstore1_invalid():
 
     m, cmpi = setup_mmu()
@@ -271,10 +977,67 @@ def test_loadstore1_invalid():
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(_test_loadstore1_invalid(m, mem)))
-    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
     with sim.write_vcd('test_loadstore1_invalid.vcd'):
         sim.run()
 
+
+def test_loadstore1_ifetch_invalid():
+    m, cmpi = setup_mmu()
+
+    # this is a specially-arranged page table which has the permissions
+    # barred for execute on the leaf node (EAA=0x2 instead of EAA=0x3)
+    mem = pagetables.test2
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    icache = m.submodules.ldst.icache
+    sim.add_sync_process(wrap(_test_loadstore1_ifetch_invalid(m, mem)))
+    # add two wb_get_classic processes onto the *same* memory dictionary.
+    # this shouuuld work.... cross-fingers...
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.bus, mem)))
+    with sim.write_vcd('test_loadstore1_ifetch_invalid.vcd',
+                      traces=[m.debug_status]): # include extra debug
+        sim.run()
+
+
+def test_loadstore1_ifetch_multi():
+    m, cmpi = setup_mmu()
+    wbget.stop = False
+
+    # this is a specially-arranged page table which has the permissions
+    # barred for execute on the leaf node (EAA=0x2 instead of EAA=0x3)
+    mem = pagetables.test1
+
+    # set this up before passing to Simulator (which calls elaborate)
+    icache = m.submodules.ldst.icache
+    icache.use_fetch_interface() # this is the function which converts
+                                 # to FetchUnitInterface. *including*
+                                 # rewiring the Wishbone Bus to ibus
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(_test_loadstore1_ifetch_multi(m, mem)))
+    # add two wb_get_classic processes onto the *same* memory dictionary.
+    # this shouuuld work.... cross-fingers...
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.ibus, mem))) # ibus not bus
+    with sim.write_vcd('test_loadstore1_ifetch_multi.vcd',
+                      traces=[m.debug_status]): # include extra debug
+        sim.run()
+
 if __name__ == '__main__':
-    test_loadstore1()
-    test_loadstore1_invalid()
+    #test_loadstore1()
+    #test_loadstore1_microwatt_mmu_bin_test2()
+    #test_loadstore1_microwatt_mmu_bin_test5()
+    #test_loadstore1_invalid()
+    #test_loadstore1_ifetch() #FIXME
+    #test_loadstore1_ifetch_invalid()
+    #test_loadstore1_ifetch_unit_iface() # guess: should be working
+    #test_loadstore1_ifetch_multi()
+    test_loadstore1_misalign()
index 1528d7d40db31bbaed8f821a7a90663ef087bb26..e31225f6e369cdc48ef8c7fa0cc2c9867438b989 100644 (file)
@@ -21,15 +21,12 @@ from soc.experiment.mem_types import (LoadStore1ToMMUType,
 from soc.experiment.mmu import MMU
 from soc.experiment.dcache import DCache
 from soc.experiment.icache import ICache
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
 
 import random
 
-stop = False
-
-def set_stop(newval):
-    global stop
-    stop = newval
-
+wbget.stop = False
 
 def b(x):
     return int.from_bytes(x.to_bytes(8, byteorder='little'),
@@ -55,48 +52,13 @@ default_mem = { 0x10000:    # PARTITION_TABLE_2
             }
 
 
-def wb_get(c, mem, name):
-    """simulator process for getting memory load requests
-    """
-
-    logfile = open("/tmp/wb_get.log","w")
-
-    def log(msg):
-        logfile.write(msg+"\n")
-        print(msg)
-
-    global stop
-    while not stop:
-        while True: # wait for dc_valid
-            if stop:
-                log("stop")
-                return
-            cyc = yield (c.wb_out.cyc)
-            stb = yield (c.wb_out.stb)
-            if cyc and stb:
-                break
-            yield
-        addr = (yield c.wb_out.adr) << 3
-        if addr not in mem:
-            log("%s LOOKUP FAIL %x" % (name, addr))
-            stop = True
-            return
-
-        yield
-        data = mem[addr]
-        yield c.wb_in.dat.eq(data)
-        log("%s get %x data %x" % (name, addr, data))
-        yield c.wb_in.ack.eq(1)
-        yield
-        yield c.wb_in.ack.eq(0)
-        yield
-
-
 def icache_sim(dut, mem):
     i_out = dut.i_in
     i_in  = dut.i_out
     m_out = dut.m_in
 
+    wbget.stop = False
+
     for k,v in mem.items():
         yield i_in.valid.eq(0)
         yield i_out.priv_mode.eq(1)
@@ -126,6 +88,7 @@ def icache_sim(dut, mem):
         yield i_out.req.eq(0)
         yield
 
+    wbget.stop = True
 
 def test_icache_il():
     dut = ICache()
@@ -155,19 +118,21 @@ def test_icache():
 
     # read from "memory" process and corresponding wishbone "read" process
     sim.add_sync_process(wrap(icache_sim(icache, mem)))
-    sim.add_sync_process(wrap(wb_get(icache, mem, "ICACHE")))
+    sim.add_sync_process(wrap(wb_get(icache.bus, mem, "ICACHE")))
     with sim.write_vcd('test_icache.vcd'):
         sim.run()
 
 
 def mmu_lookup(mmu, addr):
-    global stop
 
     yield mmu.l_in.load.eq(1)
     yield mmu.l_in.priv.eq(1)
     yield mmu.l_in.addr.eq(addr)
     yield mmu.l_in.valid.eq(1)
-    while not stop: # wait for dc_valid / err
+
+    print ("mmu lookup %x stopped" % addr, wbget.stop)
+    while not wbget.stop: # wait for dc_valid / err
+        print ("stopped", wbget.stop)
         l_done = yield (mmu.l_out.done)
         l_err = yield (mmu.l_out.err)
         l_badtree = yield (mmu.l_out.badtree)
@@ -190,7 +155,7 @@ def mmu_lookup(mmu, addr):
 
 
 def mmu_sim(mmu):
-    global stop
+    wbget.stop = False
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
@@ -199,8 +164,9 @@ def mmu_sim(mmu):
 
     phys_addr = yield from mmu_lookup(mmu, 0x10000)
     assert phys_addr == 0x40000
+    yield
 
-    stop = True
+    wbget.stop = True
 
 
 def test_mmu():
@@ -219,7 +185,8 @@ def test_mmu():
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(mmu_sim(mmu)))
-    sim.add_sync_process(wrap(wb_get(dcache, default_mem, "DCACHE")))
+    sim.add_sync_process(wrap(wb_get(dcache.bus,
+                              default_mem, "DCACHE")))
     with sim.write_vcd('test_mmu.vcd'):
         sim.run()
 
index d93bd594e84f52614c8bd1992d941250fdd3444d..338480d848d0ae5c03c100666361c784046176f2 100644 (file)
@@ -28,6 +28,8 @@ from soc.experiment.mem_types import (LoadStore1ToMMUType,
 
 from soc.experiment.mmu import MMU
 from soc.experiment.dcache import DCache
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
 
 #more imports 
 
@@ -49,6 +51,28 @@ from nmigen.compat.sim import run_simulation, Settle
 # will take at least one week (10.10.2020)
 # many unconnected signals
 
+def b(x):
+    return int.from_bytes(x.to_bytes(8, byteorder='little'),
+                          byteorder='big', signed=False)
+
+mem = {0x10000:    # PARTITION_TABLE_2
+                   # PATB_GR=1 PRTB=0x1000 PRTS=0xb
+       b(0x800000000100000b),
+
+       0x30000:     # RADIX_ROOT_PTE
+                    # V = 1 L = 0 NLB = 0x400 NLS = 9
+       b(0x8000000000040009),
+
+       0x40000:     # RADIX_SECOND_LEVEL
+                    #     V = 1 L = 1 SW = 0 RPN = 0
+                       # R = 1 C = 1 ATT = 0 EAA 0x7
+       b(0xc000000000000187),
+
+      0x1000000:   # PROCESS_TABLE_3
+                   # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
+       b(0x40000000000300ad),
+      }
+
 
 class TestMicrowattMemoryPortInterface(PortInterfaceBase):
     """TestMicrowattMemoryPortInterface
@@ -61,18 +85,18 @@ class TestMicrowattMemoryPortInterface(PortInterfaceBase):
         self.mmu = mmu
         self.dcache = dcache
 
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz):
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
         m.d.comb += self.dcache.d_in.addr.eq(addr)
         m.d.comb += self.mmu.l_in.addr.eq(addr)
         m.d.comb += self.mmu.l_in.load.eq(0)
-        m.d.comb += self.mmu.l_in.priv.eq(1) # TODO put msr_pr here
+        m.d.comb += self.mmu.l_in.priv.eq(~msr.pr) # TODO verify
         m.d.comb += self.mmu.l_in.valid.eq(1)
 
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
         m.d.comb += self.dcache.d_in.addr.eq(addr)
         m.d.comb += self.mmu.l_in.addr.eq(addr)
         m.d.comb += self.mmu.l_in.load.eq(1)
-        m.d.comb += self.mmu.l_in.priv.eq(1) # TODO put msr_pr here
+        m.d.comb += self.mmu.l_in.priv.eq(~msr.pr) # TODO verify
         m.d.comb += self.mmu.l_in.valid.eq(1)
 
     def set_wr_data(self, m, data, wen):
@@ -120,62 +144,11 @@ class TestMicrowattMemoryPortInterface(PortInterfaceBase):
         yield from super().ports()
         # TODO: memory ports
 
-stop = False
-
-
-def wb_get(dc):
-    """simulator process for getting memory load requests
-    """
-
-    global stop
-
-    def b(x):
-        return int.from_bytes(x.to_bytes(8, byteorder='little'),
-                              byteorder='big', signed=False)
-
-    mem = {0x10000:    # PARTITION_TABLE_2
-                       # PATB_GR=1 PRTB=0x1000 PRTS=0xb
-           b(0x800000000100000b),
-
-           0x30000:     # RADIX_ROOT_PTE
-                        # V = 1 L = 0 NLB = 0x400 NLS = 9
-           b(0x8000000000040009),
-
-           0x40000:     # RADIX_SECOND_LEVEL
-                        #         V = 1 L = 1 SW = 0 RPN = 0
-                           # R = 1 C = 1 ATT = 0 EAA 0x7
-           b(0xc000000000000187),
-
-          0x1000000:   # PROCESS_TABLE_3
-                       # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
-           b(0x40000000000300ad),
-          }
-
-    while not stop:
-        while True: # wait for dc_valid
-            if stop:
-                return
-            cyc = yield (dc.wb_out.cyc)
-            stb = yield (dc.wb_out.stb)
-            if cyc and stb:
-                break
-            yield
-        addr = (yield dc.wb_out.adr) << 3
-        if addr not in mem:
-            print ("    WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
-        data = mem.get(addr, 0)
-        yield dc.wb_in.dat.eq(data)
-        print ("    DCACHE get %x data %x" % (addr, data))
-        yield dc.wb_in.ack.eq(1)
-        yield
-        yield dc.wb_in.ack.eq(0)
-        yield
+wbget.stop = False
 
 
 def mmu_lookup(dut, addr):
     mmu = dut.mmu
-    global stop
 
     print("pi_ld")
     yield from pi_ld(dut.pi, addr, 1)
@@ -210,7 +183,6 @@ def mmu_lookup(dut, addr):
 
 def mmu_sim(dut):
     mmu = dut.mmu
-    global stop
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
@@ -226,7 +198,7 @@ def mmu_sim(dut):
     phys_addr = yield from mmu_lookup(dut, 0x10000)
     assert phys_addr == 0x40000
 
-    stop = True
+    wbget.stop = True
 
 
 def test_mmu():
@@ -242,7 +214,7 @@ def test_mmu():
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(mmu_sim(dut)))
-    sim.add_sync_process(wrap(wb_get(dcache)))
+    sim.add_sync_process(wrap(wb_get(dcache.bus, mem)))
     with sim.write_vcd('test_mmu_pi.vcd'):
         sim.run()
 
index fd3279ded07992875ccf996937b3d92e216b7164..d1a99381df83e7ee3534e6852fd7466b0c0e74fd 100644 (file)
@@ -1,44 +1,2 @@
-def wb_get(dut):
-    """simulator process for getting memory load requests
-    """
-    mem = dut.mem
-    wb = dut.cmpi.wb_bus()
+from openpower.test.wb_get import wb_get
 
-    while not dut.stop:
-        while True: # wait for dc_valid
-            if dut.stop:
-                return
-            cyc = yield (wb.cyc)
-            stb = yield (wb.stb)
-            if cyc and stb:
-                break
-            yield
-        addr = (yield wb.adr) << 3
-        if addr not in mem:
-            print ("    WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
-        # read or write?
-        we = (yield wb.we)
-        if we:
-            store = (yield wb.dat_w)
-            sel = (yield wb.sel)
-            data = mem.get(addr, 0)
-            # note we assume 8-bit sel, here
-            res = 0
-            for i in range(8):
-                mask = 0xff << (i*8)
-                if sel & (1<<i):
-                    res |= store & mask
-                else:
-                    res |= data & mask
-            mem[addr] = res
-            print ("    DCACHE set %x mask %x data %x" % (addr, sel, res))
-        else:
-            data = mem.get(addr, 0)
-            yield wb.dat_r.eq(data)
-            print ("    DCACHE get %x data %x" % (addr, data))
-
-        yield wb.ack.eq(1)
-        yield
-        yield wb.ack.eq(0)
-        yield
index 107be930091e65d45dc984a5c4f26903e4ce7a98..ba65373b646dcdde5a90e89161aae7bdea65a578 100644 (file)
@@ -32,7 +32,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth)
+        pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth, parent_pspec=None)
         m.submodules.dut = dut = ALUInputStage(pspec)
 
         a = Signal(64)
@@ -66,6 +66,7 @@ class GTCombinerTestCase(FHDLTestCase):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=4)
         self.assertFormal(module, mode="cover", depth=4)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index 529381eaf0c799a455525bb5326ea5307f594fe4..de8dc54f1c82ea18eb768e40ec183fab119e5049 100644 (file)
@@ -37,20 +37,20 @@ class Driver(Elaboratable):
             width = p.width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = ALUMainStage(pspec)
 
         # convenience variables
         a = dut.i.a
         b = dut.i.b
         ca_in = dut.i.xer_ca[0]   # CA carry in
-        ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
+        ca32_in = dut.i.xer_ca[1]  # CA32 carry in 32
         so_in = dut.i.xer_so      # SO sticky overflow
 
         ca_o = dut.o.xer_ca.data[0]   # CA carry out
-        ca32_o = dut.o.xer_ca.data[1] # CA32 carry out32
+        ca32_o = dut.o.xer_ca.data[1]  # CA32 carry out32
         ov_o = dut.o.xer_ov.data[0]   # OV overflow
-        ov32_o = dut.o.xer_ov.data[1] # OV32 overflow32
+        ov32_o = dut.o.xer_ov.data[1]  # OV32 overflow32
         o = dut.o.o.data
 
         # setup random inputs
@@ -143,6 +143,7 @@ class ALUTestCase(FHDLTestCase):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=2)
         self.assertFormal(module, mode="cover", depth=2)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index 5e32fbfde9d84e0c91dbf5a0171edae5a95f9f7d..eb6f45719553b8545111595cb0d7964a7a45872f 100644 (file)
@@ -38,7 +38,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = ALUOutputStage(pspec)
 
         o = Signal(64)
@@ -103,11 +103,13 @@ class Driver(Elaboratable):
 
         return m
 
+
 class GTCombinerTestCase(FHDLTestCase):
     def test_formal(self):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=4)
         self.assertFormal(module, mode="cover", depth=4)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index f4ad49183c1ffbd686644238a676d7dd807c64b6..1f17943c4b6116270b052a1486e1d6358627c749 100644 (file)
@@ -38,6 +38,7 @@ class ALUMainStage(PipeModBase):
         return ALUOutputData(self.pspec) # defines pipeline stage output format
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
 
@@ -69,11 +70,11 @@ class ALUMainStage(PipeModBase):
             comb += b_i.eq(b)                     # into trap pipeline
         with m.Elif(is_32bit):
             with m.If(op.is_signed):
-                comb += a_i.eq(exts(a, 32, 64))
-                comb += b_i.eq(exts(b, 32, 64))
+                comb += a_i.eq(exts(a, 32, XLEN))
+                comb += b_i.eq(exts(b, 32, XLEN))
             with m.Else():
-                comb += a_i.eq(extz(a, 32, 64))
-                comb += b_i.eq(extz(b, 32, 64))
+                comb += a_i.eq(extz(a, 32, XLEN))
+                comb += b_i.eq(extz(b, 32, XLEN))
         with m.Else():
             comb += a_i.eq(a)
             comb += b_i.eq(b)
@@ -94,7 +95,7 @@ class ALUMainStage(PipeModBase):
             #### CMP, CMPL v3.0B p85-86
 
             with m.Case(MicrOp.OP_CMP):
-                a_n = Signal(64) # temporary - inverted a
+                a_n = Signal(XLEN) # temporary - inverted a
                 tval = Signal(5)
                 a_lt = Signal()
                 carry_32 = Signal()
@@ -107,18 +108,21 @@ class ALUMainStage(PipeModBase):
 
                 # this is supposed to be inverted (b-a, not a-b)
                 comb += a_n.eq(~a) # sigh a gets inverted
-                comb += carry_32.eq(add_o[33] ^ a[32] ^ b[32])
-                comb += carry_64.eq(add_o[65])
+                if XLEN == 64:
+                    comb += carry_32.eq(add_o[33] ^ a[32] ^ b[32])
+                else:
+                    comb += carry_32.eq(add_o[XLEN+1])
+                comb += carry_64.eq(add_o[XLEN+1])
 
                 comb += zerolo.eq(~((a_n[0:32] ^ b[0:32]).bool()))
-                comb += zerohi.eq(~((a_n[32:64] ^ b[32:64]).bool()))
+                comb += zerohi.eq(~((a_n[32:XLEN] ^ b[32:XLEN]).bool()))
 
                 with m.If(zerolo & (is_32bit | zerohi)):
                     # values are equal
                     comb += tval[2].eq(1)
                 with m.Else():
-                    comb += msb_a.eq(Mux(is_32bit, a_n[31], a_n[63]))
-                    comb += msb_b.eq(Mux(is_32bit, b[31], b[63]))
+                    comb += msb_a.eq(Mux(is_32bit, a_n[31], a_n[XLEN-1]))
+                    comb += msb_b.eq(Mux(is_32bit, b[31], b[XLEN-1]))
                     C0 = Const(0, 1)
                     with m.If(msb_a != msb_b):
                         # Subtraction might overflow, but
@@ -149,13 +153,21 @@ class ALUMainStage(PipeModBase):
                 # https://bugs.libre-soc.org/show_bug.cgi?id=319#c5
                 ca = Signal(2, reset_less=True)
                 comb += ca[0].eq(add_o[-1])                   # XER.CA
-                comb += ca[1].eq(add_o[33] ^ (a_i[32] ^ b_i[32])) # XER.CA32
+                if XLEN == 64:
+                    comb += ca[1].eq(add_o[33] ^ (a_i[32] ^ b_i[32])) # XER.CA32
+                else:
+                    comb += ca[1].eq(add_o[-1])                   # XER.CA32
                 comb += cry_o.data.eq(ca)
                 comb += cry_o.ok.eq(1)
                 # 32-bit (ov[1]) and 64-bit (ov[0]) overflow
                 ov = Signal(2, reset_less=True)
                 comb += ov[0].eq(calc_ov(a_i[-1], b_i[-1], ca[0], add_o[-2]))
-                comb += ov[1].eq(calc_ov(a_i[31], b_i[31], ca[1], add_o[32]))
+                if XLEN == 64:
+                    comb += ov[1].eq(calc_ov(a_i[31], b_i[31], ca[1],
+                                             add_o[32]))
+                else:
+                    comb += ov[1].eq(calc_ov(a_i[-1], b_i[-1], ca[0],
+                                            add_o[-2]))
                 comb += ov_o.data.eq(ov)
                 comb += ov_o.ok.eq(1)
 
@@ -164,11 +176,11 @@ class ALUMainStage(PipeModBase):
 
             with m.Case(MicrOp.OP_EXTS):
                 with m.If(op.data_len == 1):
-                    comb += o.data.eq(exts(a, 8, 64))
+                    comb += o.data.eq(exts(a, 8, XLEN))
                 with m.If(op.data_len == 2):
-                    comb += o.data.eq(exts(a, 16, 64))
+                    comb += o.data.eq(exts(a, 16, XLEN))
                 with m.If(op.data_len == 4):
-                    comb += o.data.eq(exts(a, 32, 64))
+                    comb += o.data.eq(exts(a, 32, XLEN))
                 comb += o.ok.eq(1) # output register
 
             ###################
index 7b1334156c9de77b65a64e4319b03a9386f15a46..572ec9a6bcd18c22202f4a100531e6f843975889 100644 (file)
@@ -3,28 +3,36 @@ from soc.fu.pipe_data import FUBaseData, CommonPipeSpec
 
 
 class ALUInputData(FUBaseData):
-    regspec = [('INT', 'ra', '0:63'), # RA
-               ('INT', 'rb', '0:63'), # RB/immediate
-               ('XER', 'xer_so', '32'), # XER bit 32: SO
-               ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
     def __init__(self, pspec):
         super().__init__(pspec, False)
         # convenience
         self.a, self.b = self.ra, self.rb
 
+    @property
+    def regspec(self):
+        return [('INT', 'ra', self.intrange),  # RA
+               ('INT', 'rb', self.intrange),  # RB/immediate
+               ('XER', 'xer_so', '32'),  # XER bit 32: SO
+               ('XER', 'xer_ca', '34,45')]  # XER bit 34/45: CA/CA32
+
+
 
 class ALUOutputData(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),
-               ('CR', 'cr_a', '0:3'),
-               ('XER', 'xer_ca', '34,45'), # bit0: ca, bit1: ca32
-               ('XER', 'xer_ov', '33,44'), # bit0: ov, bit1: ov32
-               ('XER', 'xer_so', '32')]
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ('XER', 'xer_ca', '34,45'),  # bit0: ca, bit1: ca32
+               ('XER', 'xer_ov', '33,44'),  # bit0: ov, bit1: ov32
+               ('XER', 'xer_so', '32')]
+
+
 
 class ALUPipeSpec(CommonPipeSpec):
-    regspec = (ALUInputData.regspec, ALUOutputData.regspec)
     opsubsetkls = CompALUOpSubset
+    regspecklses = (ALUInputData, ALUOutputData)
index a9c4f337be5c5c247efa016032d03efd0d8e959c..baaf69c26cde0e599840c9258f716b55d3a93c36 100644 (file)
@@ -5,52 +5,58 @@ from soc.fu.alu.main_stage import ALUMainStage
 from soc.fu.alu.output_stage import ALUOutputStage
 
 
-class ALUStagesOld(PipeModBaseChain):
+class ALUStages(PipeModBaseChain):
     def get_chain(self):
         inp = ALUInputStage(self.pspec)
         main = ALUMainStage(self.pspec)
-        return [inp, main, out]
-
-
-class ALUStageEnd(PipeModBaseChain):
-    def get_chain(self):
         out = ALUOutputStage(self.pspec)
-        return [out]
+        return [inp, main, out]
 
 
-class ALUBasePipeOld(ControlBase):
+class ALUBasePipe(ControlBase):
     def __init__(self, pspec):
         ControlBase.__init__(self)
         self.pspec = pspec
         self.pipe1 = ALUStages(pspec)
-        self.pipe2 = ALUStageEnd(pspec)
-        self._eqs = self.connect([self.pipe1, self.pipe2])
+        self._eqs = self.connect([self.pipe1])
 
     def elaborate(self, platform):
         m = ControlBase.elaborate(self, platform)
         m.submodules.pipe1 = self.pipe1
-        m.submodules.pipe2 = self.pipe2
         m.d.comb += self._eqs
         return m
 
-
-class ALUStages(PipeModBaseChain):
+class ALUStages1(PipeModBaseChain):
     def get_chain(self):
         inp = ALUInputStage(self.pspec)
+        return [inp]
+
+class ALUStages2(PipeModBaseChain):
+    def get_chain(self):
         main = ALUMainStage(self.pspec)
+        return [main]
+
+
+class ALUStages3(PipeModBaseChain):
+    def get_chain(self):
         out = ALUOutputStage(self.pspec)
-        return [inp, main, out]
+        return [out]
 
 
 class ALUBasePipe(ControlBase):
     def __init__(self, pspec):
         ControlBase.__init__(self)
         self.pspec = pspec
-        self.pipe1 = ALUStages(pspec)
-        self._eqs = self.connect([self.pipe1])
+        self.pipe1 = ALUStages1(pspec)
+        self.pipe2 = ALUStages2(pspec)
+        self.pipe3 = ALUStages3(pspec)
+        self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
 
     def elaborate(self, platform):
         m = ControlBase.elaborate(self, platform)
-        m.submodules.pipe1 = self.pipe1
+        m.submodules.logical_pipe1 = self.pipe1
+        m.submodules.logical_pipe2 = self.pipe2
+        m.submodules.logical_pipe3 = self.pipe3
         m.d.comb += self._eqs
         return m
+
index 4b9a14b9263853c18962a73e7cc449c6b78c76b8..512e379406dd77c26f6682c7f7146fb8e299444b 100644 (file)
@@ -51,7 +51,7 @@ def set_alu_inputs(alu, dec2, sim):
 class ALUIAllCases(ALUTestCase):
 
     def case_ilang(self):
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         alu = ALUBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("alu_pipeline.il", "w") as f:
@@ -60,7 +60,7 @@ class ALUIAllCases(ALUTestCase):
 
 class TestRunner(unittest.TestCase):
 
-    def execute(self, alu,instruction, pdecode2, test):
+    def execute(self, alu, instruction, pdecode2, test):
         program = test.program
         sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
                   test.mem, test.msr,
@@ -88,7 +88,7 @@ class TestRunner(unittest.TestCase):
             fn_unit = yield pdecode2.e.do.fn_unit
             asmcode = yield pdecode2.e.asmcode
             dec_asmcode = yield pdecode2.dec.op.asmcode
-            print ("asmcode", asmcode, dec_asmcode)
+            print("asmcode", asmcode, dec_asmcode)
             self.assertEqual(fn_unit, Function.ALU.value)
             yield from set_alu_inputs(alu, pdecode2, sim)
 
@@ -111,7 +111,7 @@ class TestRunner(unittest.TestCase):
             yield Settle()
 
     def test_it(self):
-        test_data = ALUTestCase().test_data
+        test_data = ALUTestCase({'soc'}).test_data
         m = Module()
         comb = m.d.comb
         instruction = Signal(32)
@@ -120,10 +120,14 @@ class TestRunner(unittest.TestCase):
         opkls = ALUPipeSpec.opsubsetkls
 
         pdecode = create_pdecode()
-        m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode, opkls, fn_name)
+        m.submodules.pdecode2 = pdecode2 = PowerDecode2(
+            pdecode, opkls, fn_name)
         pdecode = pdecode2.dec
 
-        pspec = ALUPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=pps)
         m.submodules.alu = alu = ALUBasePipe(pspec)
 
         comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
index 780fcbeace7c2271492863588dbaf3a45ef9637a..739d3b20fe8a15806315c546deef5460a0d5653a 100644 (file)
@@ -32,7 +32,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth)
+        pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth, parent_pspec=None)
         m.submodules.dut = dut = ALUInputStage(pspec)
 
         a = Signal(64)
@@ -64,11 +64,13 @@ class Driver(Elaboratable):
 
         return m
 
+
 class GTCombinerTestCase(FHDLTestCase):
     def test_formal(self):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=4)
         self.assertFormal(module, mode="cover", depth=4)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index 94cf0024bb6a9cd6594884f9373ec47251ce4ea1..0f58e1c049d130e3cc1b46ddccd0bbbc1a6f53dc 100644 (file)
@@ -39,7 +39,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = BranchPipeSpec(id_wid=2)
+        pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = BranchMainStage(pspec)
 
         # convenience aliases
@@ -202,6 +202,7 @@ class LogicalTestCase(FHDLTestCase):
     def test_formal(self):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=2)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index a2f5bcf2508fa09d00f4e43aaf1e610db8f20142..8a6c0071ee51e347379d7f6ca79d9349cd4246ac 100644 (file)
@@ -57,5 +57,5 @@ class BranchOutputData(FUBaseData):
 
 
 class BranchPipeSpec(CommonPipeSpec):
-    regspec = (BranchInputData.regspec, BranchOutputData.regspec)
+    regspecklses = (BranchInputData, BranchOutputData)
     opsubsetkls = CompBROpSubset
index 1cdb3e9a1ff0c0c15219d84505e2c5bdce4d1122..f7c9456ec328d3d6e1fc64a57279fdb7656734d6 100644 (file)
@@ -1,6 +1,26 @@
 from nmutil.singlepipe import ControlBase
 from nmutil.pipemodbase import PipeModBaseChain
 from soc.fu.branch.main_stage import BranchMainStage
+from nmutil.pipemodbase import PipeModBase
+from soc.fu.branch.pipe_data import BranchInputData
+from nmigen import Module
+
+# gives a 1-clock delay to stop combinatorial link between in and out
+class DummyBranchStage(PipeModBase):
+    def __init__(self, pspec): super().__init__(pspec, "dummy")
+    def ispec(self): return BranchInputData(self.pspec)
+    def ospec(self): return BranchInputData(self.pspec)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.d.comb += self.o.eq(self.i) # pass-through output
+        return m
+
+class BranchDummyStages(PipeModBaseChain):
+    def get_chain(self):
+        dummy = DummyBranchStage(self.pspec)
+        return [dummy]
+
 
 class BranchStages(PipeModBaseChain):
     def get_chain(self):
@@ -12,11 +32,13 @@ class BranchBasePipe(ControlBase):
     def __init__(self, pspec):
         ControlBase.__init__(self)
         self.pspec = pspec
-        self.pipe1 = BranchStages(pspec)
-        self._eqs = self.connect([self.pipe1])
+        self.pipe1 = BranchDummyStages(pspec)
+        self.pipe2 = BranchStages(pspec)
+        self._eqs = self.connect([self.pipe1, self.pipe2])
 
     def elaborate(self, platform):
         m = ControlBase.elaborate(self, platform)
-        m.submodules.pipe = self.pipe1
+        m.submodules.pipe1 = self.pipe1
+        m.submodules.pipe2 = self.pipe2
         m.d.comb += self._eqs
         return m
index 0b701ae85b0ddcc550cf8f1dfe1a68a4c086d4cf..611ca983b6196d3f07aafe6b6bac0590bca4739d 100644 (file)
@@ -50,7 +50,7 @@ def get_cu_inputs(dec2, sim):
 class BranchAllCases(BranchTestCase):
 
     def case_ilang(self):
-        pspec = BranchPipeSpec(id_wid=2)
+        pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
         alu = BranchBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("branch_pipeline.il", "w") as f:
@@ -59,7 +59,8 @@ class BranchAllCases(BranchTestCase):
 
 class TestRunner(unittest.TestCase):
     def test_it(self):
-        test_data = BranchAllCases().test_data
+        test_data = BranchTestCase().test_data
+        print ("test data", test_data)
         m = Module()
         comb = m.d.comb
         instruction = Signal(32)
@@ -70,7 +71,7 @@ class TestRunner(unittest.TestCase):
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
         pdecode = pdecode2.dec
 
-        pspec = BranchPipeSpec(id_wid=2)
+        pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.branch = branch = BranchBasePipe(pspec)
 
         comb += branch.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
@@ -102,7 +103,7 @@ class TestRunner(unittest.TestCase):
                         print(index)
                         ins, code = instructions[index]
 
-                        print("0x{:X}".format(ins & 0xffffffff))
+                        print("insn 0x{:X}".format(ins & 0xffffffff))
                         print(code)
 
                         # ask the decoder to decode this binary data (endian'd)
index 45106984a0e1469a255d5ad5e198974c2d37cc70..a79179bc3503e60585d0b0748ad7719da36e26f6 100644 (file)
@@ -11,6 +11,7 @@ class CommonOutputStage(PipeModBase):
         super().__init__(pspec, "output")
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
         op = self.i.ctx.op
@@ -49,7 +50,7 @@ class CommonOutputStage(PipeModBase):
         # XXX ah.  right.  this needs to be done only if the *mode* is 32-bit
         # (an MSR bit)
         # see https://bugs.libre-soc.org/show_bug.cgi?id=424
-        target = Signal(64, reset_less=True)
+        target = Signal(XLEN, reset_less=True)
         #with m.If(op.is_32bit):
         #    comb += target.eq(o[:32])
         #with m.Else():
index 971efa9edd3d8ab80e7d75f863a5384960d9454a..873a09df23e3ee884e2ba6eaad6936994ec58e49 100644 (file)
@@ -115,11 +115,16 @@ class FunctionUnitBaseSingle(MultiCompUnit):
     to actually read (and write) the correct register number
     """
 
-    def __init__(self, speckls, pipekls, idx):
+    def __init__(self, speckls, pipekls, idx, parent_pspec):
         alu_name = "alu_%s%d" % (self.fnunit.name.lower(), idx)
-        pspec = speckls(id_wid=2)                # spec (NNNPipeSpec instance)
+        # spec (NNNPipeSpec instance)
+        pspec = speckls(id_wid=2, parent_pspec=parent_pspec)
         opsubset = pspec.opsubsetkls             # get the operand subset class
-        regspec = pspec.regspec                  # get the regspec
+        rsk = pspec.regspecklses        # get the regspec classes
+        regspec = []
+        for kls in rsk:
+            regspec.append(kls(pspec).regspec)
+        print ("regspecs", regspec)
         alu = pipekls(pspec)                     # create actual NNNBasePipe
         self.pspec = pspec
         super().__init__(regspec, alu, opsubset, name=alu_name)  # MultiCompUnit
@@ -154,13 +159,19 @@ class FunctionUnitBaseMulti(ReservationStations2):
     ideal (it could be a lot neater) but works for now.
     """
 
-    def __init__(self, speckls, pipekls, num_rows):
+    def __init__(self, speckls, pipekls, num_rows, parent_pspec):
         id_wid = num_rows.bit_length()
-        pspec = speckls(id_wid=id_wid)           # spec (NNNPipeSpec instance)
-        opsubset = pspec.opsubsetkls             # get the operand subset class
-        regspec = pspec.regspec                  # get the regspec
-        alu = pipekls(pspec)                # create actual NNNBasePipe
+
+        # spec (NNNPipeSpec instance)
+        pspec = speckls(id_wid=id_wid, parent_pspec=parent_pspec)
         self.pspec = pspec
+        opsubset = pspec.opsubsetkls        # get the operand subset class
+        rsk = pspec.regspecklses        # get the regspec classes
+        regspec = []
+        for kls in rsk:
+            regspec.append(kls(pspec).regspec)
+        print ("regspecs", regspec)
+        alu = pipekls(pspec)                # create actual NNNBasePipe
         alu_name = self.fnunit.name.lower()
         super().__init__(alu, num_rows, alu_name)   # initialise fan-in/fan-out
         self.cu = []
@@ -185,87 +196,90 @@ class FunctionUnitBaseMulti(ReservationStations2):
 ######################################################################
 ###### actual Function Units: these are "single" stage pipelines #####
 
-#class ALUFunctionUnit(FunctionUnitBaseSingle):
+# class ALUFunctionUnit(FunctionUnitBaseSingle):
+
+
 class ALUFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.ALU
 
-    def __init__(self, num_rses):
-        super().__init__(ALUPipeSpec, ALUBasePipe, num_rses)
+    def __init__(self, num_rses, parent_pspec):
+        super().__init__(ALUPipeSpec, ALUBasePipe, num_rses, parent_pspec)
 
 
-#class LogicalFunctionUnit(FunctionUnitBaseSingle):
+# class LogicalFunctionUnit(FunctionUnitBaseSingle):
 class LogicalFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.LOGICAL
 
-    def __init__(self, idx):
-        super().__init__(LogicalPipeSpec, LogicalBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(LogicalPipeSpec, LogicalBasePipe, idx, parent_pspec)
 
 
-#class CRFunctionUnit(FunctionUnitBaseSingle):
+# class CRFunctionUnit(FunctionUnitBaseSingle):
 class CRFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.CR
 
-    def __init__(self, idx):
-        super().__init__(CRPipeSpec, CRBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(CRPipeSpec, CRBasePipe, idx, parent_pspec)
 
 
-#class BranchFunctionUnit(FunctionUnitBaseSingle):
+# class BranchFunctionUnit(FunctionUnitBaseSingle):
 class BranchFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.BRANCH
 
-    def __init__(self, idx):
-        super().__init__(BranchPipeSpec, BranchBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(BranchPipeSpec, BranchBasePipe, idx, parent_pspec)
 
 
-#class ShiftRotFunctionUnit(FunctionUnitBaseSingle):
+# class ShiftRotFunctionUnit(FunctionUnitBaseSingle):
 class ShiftRotFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.SHIFT_ROT
 
-    def __init__(self, idx):
-        super().__init__(ShiftRotPipeSpec, ShiftRotBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(ShiftRotPipeSpec, ShiftRotBasePipe, idx, parent_pspec)
 
 
 class DivFSMFunctionUnit(FunctionUnitBaseSingle):
     fnunit = Function.DIV
 
-    def __init__(self, idx):
-        super().__init__(DivPipeSpecFSMDivCore, DivBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(DivPipeSpecFSMDivCore, DivBasePipe, idx, parent_pspec)
 
 
 class MMUFSMFunctionUnit(FunctionUnitBaseSingle):
     fnunit = Function.MMU
 
-    def __init__(self, idx):
-        super().__init__(MMUPipeSpec, FSMMMUStage, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(MMUPipeSpec, FSMMMUStage, idx, parent_pspec)
+        self.exc_o = self.alu.exc_o # get at MMU exception
 
 
 class DivPipeFunctionUnit(FunctionUnitBaseSingle):
     fnunit = Function.DIV
 
-    def __init__(self, idx):
-        super().__init__(DivPipeSpecDivPipeCore, DivBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(DivPipeSpecDivPipeCore, DivBasePipe, idx, parent_pspec)
 
 
-#class MulFunctionUnit(FunctionUnitBaseSingle):
+# class MulFunctionUnit(FunctionUnitBaseSingle):
 class MulFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.MUL
 
-    def __init__(self, idx):
-        super().__init__(MulPipeSpec, MulBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(MulPipeSpec, MulBasePipe, idx, parent_pspec)
 
 
 class TrapFunctionUnit(FunctionUnitBaseSingle):
     fnunit = Function.TRAP
 
-    def __init__(self, idx):
-        super().__init__(TrapPipeSpec, TrapBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(TrapPipeSpec, TrapBasePipe, idx, parent_pspec)
 
 
 class SPRFunctionUnit(FunctionUnitBaseSingle):
     fnunit = Function.SPR
 
-    def __init__(self, idx):
-        super().__init__(SPRPipeSpec, SPRBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(SPRPipeSpec, SPRBasePipe, idx, parent_pspec)
 
 
 # special-case: LD/ST conforms to the CompUnit API but is not a pipeline
@@ -273,11 +287,16 @@ class SPRFunctionUnit(FunctionUnitBaseSingle):
 class LDSTFunctionUnit(LDSTCompUnit):
     fnunit = Function.LDST
 
-    def __init__(self, pi, awid, idx):
+    def __init__(self, pi, awid, idx, parent_pspec):
         alu_name = "ldst_%s%d" % (self.fnunit.name.lower(), idx)
-        pspec = LDSTPipeSpec(id_wid=2)           # spec (NNNPipeSpec instance)
+        # spec (NNNPipeSpec instance)
+        pspec = LDSTPipeSpec(id_wid=2, parent_pspec=parent_pspec)
         opsubset = pspec.opsubsetkls             # get the operand subset class
-        regspec = pspec.regspec                  # get the regspec
+        rsk = pspec.regspecklses        # get the regspec classes
+        regspec = []
+        for kls in rsk:
+            regspec.append(kls(pspec).regspec)
+        print ("regspecs", regspec)
         self.opsubsetkls = opsubset
         super().__init__(pi, regspec, awid, opsubset, name=alu_name)
 
@@ -334,13 +353,14 @@ class AllFunctionUnits(Elaboratable):
         for name, qty in units.items():
             kls = alus[name]
             if issubclass(kls, FunctionUnitBaseMulti):
-                fu = kls(qty) # create just the one ALU but many "fronts"
-                self.actual_alus[name] = fu # to be made a module of AllFUs
+                # create just the one ALU but many "fronts"
+                fu = kls(qty, parent_pspec=pspec)
+                self.actual_alus[name] = fu  # to be made a module of AllFUs
                 for i in range(qty):
                     self.fus["%s%d" % (name, i)] = fu.cu[i]
             else:
                 for i in range(qty):
-                    self.fus["%s%d" % (name, i)] = kls(i)
+                    self.fus["%s%d" % (name, i)] = kls(i, parent_pspec=pspec)
 
         # debug print for MMU ALU
         if microwatt_mmu:
@@ -350,15 +370,15 @@ class AllFunctionUnits(Elaboratable):
         # if any PortInterfaces, we want LDST Units.
         if pilist is None:
             return
-        print ("pilist", pilist)
+        print("pilist", pilist)
         for i, pi in enumerate(pilist):
-            self.fus["ldst%d" % (i)] = LDSTFunctionUnit(pi, addrwid, i)
+            self.fus["ldst%d" % (i)] = LDSTFunctionUnit(pi, addrwid, i, pspec)
 
         # extract exceptions from any FunctionUnits for easy access
         self.excs = {}
         for name, alu in self.fus.items():
             if hasattr(alu, "exc_o"):
-                print ("FU exceptions", name, type(alu.exc_o), alu.exc_o)
+                print("FU exceptions", name, type(alu.exc_o), alu.exc_o)
                 self.excs[name] = alu.exc_o
 
     def get_exc(self, name):
@@ -403,7 +423,7 @@ def tst_single_fus_il():
 def tst_all_fus():
     pspec = TestMemPspec(ldst_ifacetype='testpi',
                          imem_ifacetype='',
-                         addr_wid=48,
+                         addr_wid=64,
                          mask_wid=8,
                          reg_wid=64)
     dut = AllFunctionUnits(pspec)
index 7885b9f74daea6bbb464ddec6e130f45edcbb225..e115c2158e3faec39d9aabf5375a196bcb34dd5e 100644 (file)
@@ -187,7 +187,7 @@ class TestRunner(FHDLTestCase):
         self.funit = funit
         self.bigendian = bigendian
 
-    def execute(self, cu, l0, instruction, pdecode2, simdec2, test):
+    def execute(self, m, cu, l0, instruction, pdecode2, simdec2, test):
 
         program = test.program
         print("test", test.name, test.mem)
@@ -239,7 +239,7 @@ class TestRunner(FHDLTestCase):
             # set operand and get inputs
             yield from set_operand(cu, pdecode2, sim)
             # reset read-operand mask
-            rdmask = get_rdflags(pdecode2.e, cu)
+            rdmask = get_rdflags(m, pdecode2.e, cu)
             #print ("hardcoded rdmask", cu.rdflags(pdecode2.e))
             #print ("decoder rdmask", rdmask)
             yield cu.rdmaskn.eq(~rdmask)
@@ -344,7 +344,7 @@ class TestRunner(FHDLTestCase):
             m.d.comb += cu.ad.go_i.eq(cu.ad.rel_o)  # link addr direct to rel
             m.d.comb += cu.st.go_i.eq(cu.st.rel_o)  # link store direct to rel
         else:
-            m.submodules.cu = cu = self.fukls(0)
+            m.submodules.cu = cu = self.fukls(0, parent_pspec=None)
             l0 = None
 
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
@@ -359,7 +359,7 @@ class TestRunner(FHDLTestCase):
             for test in self.test_data:
                 print(test.name)
                 with self.subTest(test.name):
-                    yield from self.execute(cu, l0, instruction,
+                    yield from self.execute(m, cu, l0, instruction,
                                             pdecode2, simdec2,
                                             test)
 
index 0a46716530ef146993e9618f0e5595383cd48867..fa44c4d3fb685b7f89bcacca12beafb8b5ef65a1 100644 (file)
@@ -37,7 +37,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = CRMainStage(pspec)
 
         full_cr_in = Signal(32)
@@ -85,7 +85,6 @@ class Driver(Elaboratable):
                 # into cr_a
                 comb += dut.i.cr_a.eq(cr_input_arr[bc])
 
-
             # For OP_CROP, we need to input the corresponding CR
             # registers for BA, BB, and BT
             with m.Case(MicrOp.OP_CROP):
@@ -172,7 +171,7 @@ class Driver(Elaboratable):
                             comb += Assert(o[4*i:4*i+4] == cr[4*i:4*i+4])
                         with m.Else():
                             comb += Assert(o[4*i:4*i+4] == 0)
-                with m.Else(): # mfcrf
+                with m.Else():  # mfcrf
                     comb += Assert(o == cr)
                 comb += o_ok.eq(1)
 
@@ -237,7 +236,7 @@ class Driver(Elaboratable):
 
             with m.Case(MicrOp.OP_SETB):
                 with m.If(cr_arr[4*bfa]):
-                    comb += Assert(o == ((1<<64)-1))
+                    comb += Assert(o == ((1 << 64)-1))
                 with m.Elif(cr_arr[4*bfa+1]):
                     comb += Assert(o == 1)
                 with m.Else():
@@ -256,6 +255,7 @@ class CRTestCase(FHDLTestCase):
     def test_formal(self):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=2)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index edcad2e9aa6a1c84c53b0e73f5925eea7e981c30..f1c6d349201915764682ae5079cc1753f9c94b10 100644 (file)
@@ -30,5 +30,5 @@ class CROutputData(FUBaseData):
 
 
 class CRPipeSpec(CommonPipeSpec):
-    regspec = (CRInputData.regspec, CROutputData.regspec)
+    regspecklses = (CRInputData, CROutputData)
     opsubsetkls = CompCROpSubset
index 80aa600d4f596103f4efbd52cccadb6244ce733d..9a92d2d6dbdacfdf1478ac99a82ce839245d97ef 100644 (file)
@@ -24,7 +24,7 @@ from openpower.test.cr.cr_cases import CRTestCase
 class CRIlangCase(TestAccumulatorBase):
 
     def case_ilang(self):
-        pspec = CRPipeSpec(id_wid=2)
+        pspec = CRPipeSpec(id_wid=2, parent_pspec=None)
         alu = CRBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("cr_pipeline.il", "w") as f:
@@ -78,8 +78,8 @@ class TestRunner(unittest.TestCase):
         if whole_reg_ok:
             full_cr = yield alu.n.o_data.full_cr.data & full_cr_mask
             expected_cr = simulator.cr.value
-            print("CR whole: expected %x, actual: %x mask: %x" % \
-                (expected_cr, full_cr, full_cr_mask))
+            print("CR whole: expected %x, actual: %x mask: %x" %
+                  (expected_cr, full_cr, full_cr_mask))
             # HACK: only look at the bits that we expected to change
             self.assertEqual(expected_cr & full_cr_mask, full_cr, code)
         elif cr_en:
@@ -144,7 +144,7 @@ class TestRunner(unittest.TestCase):
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
         pdecode = pdecode2.dec
 
-        pspec = CRPipeSpec(id_wid=2)
+        pspec = CRPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.alu = alu = CRBasePipe(pspec)
 
         comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
diff --git a/src/soc/fu/div/experiment/__init__.py b/src/soc/fu/div/experiment/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/soc/fu/div/experiment/goldschmidt_div_sqrt.py b/src/soc/fu/div/experiment/goldschmidt_div_sqrt.py
new file mode 100644 (file)
index 0000000..3f7c248
--- /dev/null
@@ -0,0 +1,1552 @@
+# SPDX-License-Identifier: LGPL-3-or-later
+# Copyright 2022 Jacob Lifshay programmerjake@gmail.com
+
+# Funded by NLnet Assure Programme 2021-02-052, https://nlnet.nl/assure part
+# of Horizon 2020 EU Programme 957073.
+
+from collections import defaultdict
+import logging
+import math
+import enum
+from fractions import Fraction
+from types import FunctionType
+from functools import lru_cache
+from nmigen.hdl.ast import Signal, unsigned, signed, Const
+from nmigen.hdl.dsl import Module, Elaboratable
+from nmigen.hdl.mem import Memory
+from nmutil.clz import CLZ
+from nmutil.plain_data import plain_data, fields, replace
+
+try:
+    from functools import cached_property
+except ImportError:
+    from cached_property import cached_property
+
+# fix broken IDE type detection for cached_property
+from typing import TYPE_CHECKING, Any
+if TYPE_CHECKING:
+    from functools import cached_property
+
+
+_NOT_FOUND = object()
+
+
+def cache_on_self(func):
+    """like `functools.cached_property`, except for methods. unlike
+    `lru_cache` the cache is per-class instance rather than a global cache
+    per-method."""
+
+    assert isinstance(func, FunctionType), \
+        "non-plain methods are not supported"
+
+    cache_name = func.__name__ + "__cache"
+
+    def wrapper(self, *args, **kwargs):
+        # specifically access through `__dict__` to bypass frozen=True
+        cache = self.__dict__.get(cache_name, _NOT_FOUND)
+        if cache is _NOT_FOUND:
+            self.__dict__[cache_name] = cache = {}
+        key = (args, *kwargs.items())
+        retval = cache.get(key, _NOT_FOUND)
+        if retval is _NOT_FOUND:
+            retval = func(self, *args, **kwargs)
+            cache[key] = retval
+        return retval
+
+    wrapper.__doc__ = func.__doc__
+    return wrapper
+
+
+@enum.unique
+class RoundDir(enum.Enum):
+    DOWN = enum.auto()
+    UP = enum.auto()
+    NEAREST_TIES_UP = enum.auto()
+    ERROR_IF_INEXACT = enum.auto()
+
+
+@plain_data(frozen=True, eq=False, repr=False)
+class FixedPoint:
+    __slots__ = "bits", "frac_wid"
+
+    def __init__(self, bits, frac_wid):
+        self.bits = bits
+        self.frac_wid = frac_wid
+        assert isinstance(self.bits, int)
+        assert isinstance(self.frac_wid, int) and self.frac_wid >= 0
+
+    @staticmethod
+    def cast(value):
+        """convert `value` to a fixed-point number with enough fractional
+        bits to preserve its value."""
+        if isinstance(value, FixedPoint):
+            return value
+        if isinstance(value, int):
+            return FixedPoint(value, 0)
+        if isinstance(value, str):
+            value = value.strip()
+            neg = value.startswith("-")
+            if neg or value.startswith("+"):
+                value = value[1:]
+            if value.startswith(("0x", "0X")) and "." in value:
+                value = value[2:]
+                got_dot = False
+                bits = 0
+                frac_wid = 0
+                for digit in value:
+                    if digit == "_":
+                        continue
+                    if got_dot:
+                        if digit == ".":
+                            raise ValueError("too many `.` in string")
+                        frac_wid += 4
+                    if digit == ".":
+                        got_dot = True
+                        continue
+                    if not digit.isalnum():
+                        raise ValueError("invalid hexadecimal digit")
+                    bits <<= 4
+                    bits |= int("0x" + digit, base=16)
+            else:
+                bits = int(value, base=0)
+                frac_wid = 0
+            if neg:
+                bits = -bits
+            return FixedPoint(bits, frac_wid)
+
+        if isinstance(value, float):
+            n, d = value.as_integer_ratio()
+            log2_d = d.bit_length() - 1
+            assert d == 1 << log2_d, ("d isn't a power of 2 -- won't ever "
+                                      "fail with float being IEEE 754")
+            return FixedPoint(n, log2_d)
+        raise TypeError("can't convert type to FixedPoint")
+
+    @staticmethod
+    def with_frac_wid(value, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+        """convert `value` to the nearest fixed-point number with `frac_wid`
+        fractional bits, rounding according to `round_dir`."""
+        assert isinstance(frac_wid, int) and frac_wid >= 0
+        assert isinstance(round_dir, RoundDir)
+        if isinstance(value, Fraction):
+            numerator = value.numerator
+            denominator = value.denominator
+        else:
+            value = FixedPoint.cast(value)
+            numerator = value.bits
+            denominator = 1 << value.frac_wid
+        if denominator < 0:
+            numerator = -numerator
+            denominator = -denominator
+        bits, remainder = divmod(numerator << frac_wid, denominator)
+        if round_dir == RoundDir.DOWN:
+            pass
+        elif round_dir == RoundDir.UP:
+            if remainder != 0:
+                bits += 1
+        elif round_dir == RoundDir.NEAREST_TIES_UP:
+            if remainder * 2 >= denominator:
+                bits += 1
+        elif round_dir == RoundDir.ERROR_IF_INEXACT:
+            if remainder != 0:
+                raise ValueError("inexact conversion")
+        else:
+            assert False, "unimplemented round_dir"
+        return FixedPoint(bits, frac_wid)
+
+    def to_frac_wid(self, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+        """convert to the nearest fixed-point number with `frac_wid`
+        fractional bits, rounding according to `round_dir`."""
+        return FixedPoint.with_frac_wid(self, frac_wid, round_dir)
+
+    def __float__(self):
+        # use truediv to get correct result even when bits
+        # and frac_wid are huge
+        return float(self.bits / (1 << self.frac_wid))
+
+    def as_fraction(self):
+        return Fraction(self.bits, 1 << self.frac_wid)
+
+    def cmp(self, rhs):
+        """compare self with rhs, returning a positive integer if self is
+        greater than rhs, zero if self is equal to rhs, and a negative integer
+        if self is less than rhs."""
+        rhs = FixedPoint.cast(rhs)
+        common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+        lhs = self.to_frac_wid(common_frac_wid)
+        rhs = rhs.to_frac_wid(common_frac_wid)
+        return lhs.bits - rhs.bits
+
+    def __eq__(self, rhs):
+        return self.cmp(rhs) == 0
+
+    def __ne__(self, rhs):
+        return self.cmp(rhs) != 0
+
+    def __gt__(self, rhs):
+        return self.cmp(rhs) > 0
+
+    def __lt__(self, rhs):
+        return self.cmp(rhs) < 0
+
+    def __ge__(self, rhs):
+        return self.cmp(rhs) >= 0
+
+    def __le__(self, rhs):
+        return self.cmp(rhs) <= 0
+
+    def fract(self):
+        """return the fractional part of `self`.
+        that is `self - math.floor(self)`.
+        """
+        fract_mask = (1 << self.frac_wid) - 1
+        return FixedPoint(self.bits & fract_mask, self.frac_wid)
+
+    def __str__(self):
+        if self < 0:
+            return "-" + str(-self)
+        digit_bits = 4
+        frac_digit_count = (self.frac_wid + digit_bits - 1) // digit_bits
+        fract = self.fract().to_frac_wid(frac_digit_count * digit_bits)
+        frac_str = hex(fract.bits)[2:].zfill(frac_digit_count)
+        return hex(math.floor(self)) + "." + frac_str
+
+    def __repr__(self):
+        return f"FixedPoint.with_frac_wid({str(self)!r}, {self.frac_wid})"
+
+    def __add__(self, rhs):
+        rhs = FixedPoint.cast(rhs)
+        common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+        lhs = self.to_frac_wid(common_frac_wid)
+        rhs = rhs.to_frac_wid(common_frac_wid)
+        return FixedPoint(lhs.bits + rhs.bits, common_frac_wid)
+
+    def __radd__(self, lhs):
+        # symmetric
+        return self.__add__(lhs)
+
+    def __neg__(self):
+        return FixedPoint(-self.bits, self.frac_wid)
+
+    def __sub__(self, rhs):
+        rhs = FixedPoint.cast(rhs)
+        common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+        lhs = self.to_frac_wid(common_frac_wid)
+        rhs = rhs.to_frac_wid(common_frac_wid)
+        return FixedPoint(lhs.bits - rhs.bits, common_frac_wid)
+
+    def __rsub__(self, lhs):
+        # a - b == -(b - a)
+        return -self.__sub__(lhs)
+
+    def __mul__(self, rhs):
+        rhs = FixedPoint.cast(rhs)
+        return FixedPoint(self.bits * rhs.bits, self.frac_wid + rhs.frac_wid)
+
+    def __rmul__(self, lhs):
+        # symmetric
+        return self.__mul__(lhs)
+
+    def __floor__(self):
+        return self.bits >> self.frac_wid
+
+    def div(self, rhs, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+        assert isinstance(frac_wid, int) and frac_wid >= 0
+        assert isinstance(round_dir, RoundDir)
+        rhs = FixedPoint.cast(rhs)
+        return FixedPoint.with_frac_wid(self.as_fraction()
+                                        / rhs.as_fraction(),
+                                        frac_wid, round_dir)
+
+    def sqrt(self, round_dir=RoundDir.ERROR_IF_INEXACT):
+        assert isinstance(round_dir, RoundDir)
+        if self < 0:
+            raise ValueError("can't compute sqrt of negative number")
+        if self == 0:
+            return self
+        retval = FixedPoint(0, self.frac_wid)
+        int_part_wid = self.bits.bit_length() - self.frac_wid
+        first_bit_index = -(-int_part_wid // 2)  # division rounds up
+        last_bit_index = -self.frac_wid
+        for bit_index in range(first_bit_index, last_bit_index - 1, -1):
+            trial = retval + FixedPoint(1 << (bit_index + self.frac_wid),
+                                        self.frac_wid)
+            if trial * trial <= self:
+                retval = trial
+        if round_dir == RoundDir.DOWN:
+            pass
+        elif round_dir == RoundDir.UP:
+            if retval * retval < self:
+                retval += FixedPoint(1, self.frac_wid)
+        elif round_dir == RoundDir.NEAREST_TIES_UP:
+            half_way = retval + FixedPoint(1, self.frac_wid + 1)
+            if half_way * half_way <= self:
+                retval += FixedPoint(1, self.frac_wid)
+        elif round_dir == RoundDir.ERROR_IF_INEXACT:
+            if retval * retval != self:
+                raise ValueError("inexact sqrt")
+        else:
+            assert False, "unimplemented round_dir"
+        return retval
+
+    def rsqrt(self, round_dir=RoundDir.ERROR_IF_INEXACT):
+        """compute the reciprocal-sqrt of `self`"""
+        assert isinstance(round_dir, RoundDir)
+        if self < 0:
+            raise ValueError("can't compute rsqrt of negative number")
+        if self == 0:
+            raise ZeroDivisionError("can't compute rsqrt of zero")
+        retval = FixedPoint(0, self.frac_wid)
+        first_bit_index = -(-self.frac_wid // 2)  # division rounds up
+        last_bit_index = -self.frac_wid
+        for bit_index in range(first_bit_index, last_bit_index - 1, -1):
+            trial = retval + FixedPoint(1 << (bit_index + self.frac_wid),
+                                        self.frac_wid)
+            if trial * trial * self <= 1:
+                retval = trial
+        if round_dir == RoundDir.DOWN:
+            pass
+        elif round_dir == RoundDir.UP:
+            if retval * retval * self < 1:
+                retval += FixedPoint(1, self.frac_wid)
+        elif round_dir == RoundDir.NEAREST_TIES_UP:
+            half_way = retval + FixedPoint(1, self.frac_wid + 1)
+            if half_way * half_way * self <= 1:
+                retval += FixedPoint(1, self.frac_wid)
+        elif round_dir == RoundDir.ERROR_IF_INEXACT:
+            if retval * retval * self != 1:
+                raise ValueError("inexact rsqrt")
+        else:
+            assert False, "unimplemented round_dir"
+        return retval
+
+
+class ParamsNotAccurateEnough(Exception):
+    """raised when the parameters aren't accurate enough to have goldschmidt
+    division work."""
+
+
+def _assert_accuracy(condition, msg="not accurate enough"):
+    if condition:
+        return
+    raise ParamsNotAccurateEnough(msg)
+
+
+@plain_data(frozen=True, unsafe_hash=True)
+class GoldschmidtDivParamsBase:
+    """parameters for a Goldschmidt division algorithm, excluding derived
+    parameters.
+    """
+
+    __slots__ = ("io_width", "extra_precision", "table_addr_bits",
+                 "table_data_bits", "iter_count")
+
+    def __init__(self, io_width, extra_precision, table_addr_bits,
+                 table_data_bits, iter_count):
+        assert isinstance(io_width, int)
+        assert isinstance(extra_precision, int)
+        assert isinstance(table_addr_bits, int)
+        assert isinstance(table_data_bits, int)
+        assert isinstance(iter_count, int)
+        self.io_width = io_width
+        """bit-width of the input divisor and the result.
+        the input numerator is `2 * io_width`-bits wide.
+        """
+
+        self.extra_precision = extra_precision
+        """number of bits of additional precision used inside the algorithm."""
+
+        self.table_addr_bits = table_addr_bits
+        """the number of address bits used in the lookup-table."""
+
+        self.table_data_bits = table_data_bits
+        """the number of data bits used in the lookup-table."""
+
+        self.iter_count = iter_count
+        """the total number of iterations of the division algorithm's loop"""
+
+
+@plain_data(frozen=True, unsafe_hash=True)
+class GoldschmidtDivParams(GoldschmidtDivParamsBase):
+    """parameters for a Goldschmidt division algorithm.
+    Use `GoldschmidtDivParams.get` to find a efficient set of parameters.
+    """
+
+    __slots__ = "table", "ops"
+
+    def _shrink_bound(self, bound, round_dir):
+        """prevent fractions from having huge numerators/denominators by
+        rounding to a `FixedPoint` and converting back to a `Fraction`.
+
+        This is intended only for values used to compute bounds, and not for
+        values that end up in the hardware.
+        """
+        assert isinstance(bound, (Fraction, int))
+        assert round_dir is RoundDir.DOWN or round_dir is RoundDir.UP, \
+            "you shouldn't use that round_dir on bounds"
+        frac_wid = self.io_width * 4 + 100  # should be enough precision
+        fixed = FixedPoint.with_frac_wid(bound, frac_wid, round_dir)
+        return fixed.as_fraction()
+
+    def _shrink_min(self, min_bound):
+        """prevent fractions used as minimum bounds from having huge
+        numerators/denominators by rounding down to a `FixedPoint` and
+        converting back to a `Fraction`.
+
+        This is intended only for values used to compute bounds, and not for
+        values that end up in the hardware.
+        """
+        return self._shrink_bound(min_bound, RoundDir.DOWN)
+
+    def _shrink_max(self, max_bound):
+        """prevent fractions used as maximum bounds from having huge
+        numerators/denominators by rounding up to a `FixedPoint` and
+        converting back to a `Fraction`.
+
+        This is intended only for values used to compute bounds, and not for
+        values that end up in the hardware.
+        """
+        return self._shrink_bound(max_bound, RoundDir.UP)
+
+    @property
+    def table_addr_count(self):
+        """number of distinct addresses in the lookup-table."""
+        # used while computing self.table, so can't just do len(self.table)
+        return 1 << self.table_addr_bits
+
+    def table_input_exact_range(self, addr):
+        """return the range of inputs as `Fraction`s used for the table entry
+        with address `addr`."""
+        assert isinstance(addr, int)
+        assert 0 <= addr < self.table_addr_count
+        _assert_accuracy(self.io_width >= self.table_addr_bits)
+        addr_shift = self.io_width - self.table_addr_bits
+        min_numerator = (1 << self.io_width) + (addr << addr_shift)
+        denominator = 1 << self.io_width
+        values_per_table_entry = 1 << addr_shift
+        max_numerator = min_numerator + values_per_table_entry - 1
+        min_input = Fraction(min_numerator, denominator)
+        max_input = Fraction(max_numerator, denominator)
+        min_input = self._shrink_min(min_input)
+        max_input = self._shrink_max(max_input)
+        assert 1 <= min_input <= max_input < 2
+        return min_input, max_input
+
+    def table_value_exact_range(self, addr):
+        """return the range of values as `Fraction`s used for the table entry
+        with address `addr`."""
+        min_input, max_input = self.table_input_exact_range(addr)
+        # division swaps min/max
+        min_value = 1 / max_input
+        max_value = 1 / min_input
+        min_value = self._shrink_min(min_value)
+        max_value = self._shrink_max(max_value)
+        assert 0.5 < min_value <= max_value <= 1
+        return min_value, max_value
+
+    def table_exact_value(self, index):
+        min_value, max_value = self.table_value_exact_range(index)
+        # we round down
+        return min_value
+
+    def __init__(self, io_width, extra_precision, table_addr_bits,
+                 table_data_bits, iter_count):
+        super().__init__(io_width=io_width,
+                         extra_precision=extra_precision,
+                         table_addr_bits=table_addr_bits,
+                         table_data_bits=table_data_bits,
+                         iter_count=iter_count)
+        _assert_accuracy(self.io_width >= 1, "io_width out of range")
+        _assert_accuracy(self.extra_precision >= 0,
+                         "extra_precision out of range")
+        _assert_accuracy(self.table_addr_bits >= 1,
+                         "table_addr_bits out of range")
+        _assert_accuracy(self.table_data_bits >= 1,
+                         "table_data_bits out of range")
+        _assert_accuracy(self.iter_count >= 1, "iter_count out of range")
+        table = []
+        for addr in range(1 << self.table_addr_bits):
+            table.append(FixedPoint.with_frac_wid(self.table_exact_value(addr),
+                                                  self.table_data_bits,
+                                                  RoundDir.DOWN))
+
+        self.table = tuple(table)
+        """ the lookup-table.
+        type: tuple[FixedPoint, ...]
+        """
+
+        self.ops = tuple(self.__make_ops())
+        "the operations needed to perform the goldschmidt division algorithm."
+
+    @property
+    def expanded_width(self):
+        """the total number of bits of precision used inside the algorithm."""
+        return self.io_width + self.extra_precision
+
+    @property
+    def n_d_f_int_wid(self):
+        """the number of bits in the integer part of `state.n`, `state.d`, and
+        `state.f` during the main iteration loop.
+        """
+        return 2
+
+    @property
+    def n_d_f_total_wid(self):
+        """the total number of bits (both integer and fraction bits) in
+        `state.n`, `state.d`, and `state.f` during the main iteration loop.
+        """
+        return self.n_d_f_int_wid + self.expanded_width
+
+    @cache_on_self
+    def max_neps(self, i):
+        """maximum value of `neps[i]`.
+        `neps[i]` is defined to be `n[i] * N_prime[i - 1] * F_prime[i - 1]`.
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        return Fraction(1, 1 << self.expanded_width)
+
+    @cache_on_self
+    def max_deps(self, i):
+        """maximum value of `deps[i]`.
+        `deps[i]` is defined to be `d[i] * D_prime[i - 1] * F_prime[i - 1]`.
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        return Fraction(1, 1 << self.expanded_width)
+
+    @cache_on_self
+    def max_feps(self, i):
+        """maximum value of `feps[i]`.
+        `feps[i]` is defined to be `f[i] * (2 - D_prime[i - 1])`.
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        # zero, because the computation of `F_prime[i]` in
+        # `GoldschmidtDivOp.MulDByF.run(...)` is exact.
+        return Fraction(0)
+
+    @cached_property
+    def e0_range(self):
+        """minimum and maximum values of `e[0]`
+        (the relative error in `F_prime[-1]`)
+        """
+        min_e0 = Fraction(0)
+        max_e0 = Fraction(0)
+        for addr in range(self.table_addr_count):
+            # `F_prime[-1] = (1 - e[0]) / B`
+            # => `e[0] = 1 - B * F_prime[-1]`
+            min_b, max_b = self.table_input_exact_range(addr)
+            f_prime_m1 = self.table[addr].as_fraction()
+            assert min_b >= 0 and f_prime_m1 >= 0, \
+                "only positive quadrant of interval multiplication implemented"
+            min_product = min_b * f_prime_m1
+            max_product = max_b * f_prime_m1
+            # negation swaps min/max
+            cur_min_e0 = 1 - max_product
+            cur_max_e0 = 1 - min_product
+            min_e0 = min(min_e0, cur_min_e0)
+            max_e0 = max(max_e0, cur_max_e0)
+        min_e0 = self._shrink_min(min_e0)
+        max_e0 = self._shrink_max(max_e0)
+        return min_e0, max_e0
+
+    @cached_property
+    def min_e0(self):
+        """minimum value of `e[0]` (the relative error in `F_prime[-1]`)
+        """
+        min_e0, max_e0 = self.e0_range
+        return min_e0
+
+    @cached_property
+    def max_e0(self):
+        """maximum value of `e[0]` (the relative error in `F_prime[-1]`)
+        """
+        min_e0, max_e0 = self.e0_range
+        return max_e0
+
+    @cached_property
+    def max_abs_e0(self):
+        """maximum value of `abs(e[0])`."""
+        return max(abs(self.min_e0), abs(self.max_e0))
+
+    @cached_property
+    def min_abs_e0(self):
+        """minimum value of `abs(e[0])`."""
+        return Fraction(0)
+
+    @cache_on_self
+    def max_n(self, i):
+        """maximum value of `n[i]` (the relative error in `N_prime[i]`
+        relative to the previous iteration)
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        if i == 0:
+            # from Claim 10
+            # `n[0] = neps[0] / ((1 - e[0]) * (A / B))`
+            # `n[0] <= 2 * neps[0] / (1 - e[0])`
+
+            assert self.max_e0 < 1 and self.max_neps(0) >= 0, \
+                "only one quadrant of interval division implemented"
+            retval = 2 * self.max_neps(0) / (1 - self.max_e0)
+        elif i == 1:
+            # from Claim 10
+            # `n[1] <= neps[1] / ((1 - f[0]) * (1 - pi[0] - delta[0]))`
+            min_mpd = 1 - self.max_pi(0) - self.max_delta(0)
+            assert self.max_f(0) <= 1 and min_mpd >= 0, \
+                "only one quadrant of interval multiplication implemented"
+            prod = (1 - self.max_f(0)) * min_mpd
+            assert self.max_neps(1) >= 0 and prod > 0, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_neps(1) / prod
+        else:
+            # from Claim 6
+            # `0 <= n[i] <= 2 * max_neps[i] / (1 - pi[i - 1] - delta[i - 1])`
+            min_mpd = 1 - self.max_pi(i - 1) - self.max_delta(i - 1)
+            assert self.max_neps(i) >= 0 and min_mpd > 0, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_neps(i) / min_mpd
+
+        return self._shrink_max(retval)
+
+    @cache_on_self
+    def max_d(self, i):
+        """maximum value of `d[i]` (the relative error in `D_prime[i]`
+        relative to the previous iteration)
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        if i == 0:
+            # from Claim 10
+            # `d[0] = deps[0] / (1 - e[0])`
+
+            assert self.max_e0 < 1 and self.max_deps(0) >= 0, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_deps(0) / (1 - self.max_e0)
+        elif i == 1:
+            # from Claim 10
+            # `d[1] <= deps[1] / ((1 - f[0]) * (1 - delta[0] ** 2))`
+            assert self.max_f(0) <= 1 and self.max_delta(0) <= 1, \
+                "only one quadrant of interval multiplication implemented"
+            divisor = (1 - self.max_f(0)) * (1 - self.max_delta(0) ** 2)
+            assert self.max_deps(1) >= 0 and divisor > 0, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_deps(1) / divisor
+        else:
+            # from Claim 6
+            # `0 <= d[i] <= max_deps[i] / (1 - delta[i - 1])`
+            assert self.max_deps(i) >= 0 and self.max_delta(i - 1) < 1, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_deps(i) / (1 - self.max_delta(i - 1))
+
+        return self._shrink_max(retval)
+
+    @cache_on_self
+    def max_f(self, i):
+        """maximum value of `f[i]` (the relative error in `F_prime[i]`
+        relative to the previous iteration)
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        if i == 0:
+            # from Claim 10
+            # `f[0] = feps[0] / (1 - delta[0])`
+
+            assert self.max_delta(0) < 1 and self.max_feps(0) >= 0, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_feps(0) / (1 - self.max_delta(0))
+        elif i == 1:
+            # from Claim 10
+            # `f[1] = feps[1]`
+            retval = self.max_feps(1)
+        else:
+            # from Claim 6
+            # `f[i] <= max_feps[i]`
+            retval = self.max_feps(i)
+
+        return self._shrink_max(retval)
+
+    @cache_on_self
+    def max_delta(self, i):
+        """ maximum value of `delta[i]`.
+        `delta[i]` is defined in Definition 4 of paper.
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        if i == 0:
+            # `delta[0] = abs(e[0]) + 3 * d[0] / 2`
+            retval = self.max_abs_e0 + Fraction(3, 2) * self.max_d(0)
+        else:
+            # `delta[i] = delta[i - 1] ** 2 + f[i - 1]`
+            prev_max_delta = self.max_delta(i - 1)
+            assert prev_max_delta >= 0
+            retval = prev_max_delta ** 2 + self.max_f(i - 1)
+
+        # `delta[i]` has to be smaller than one otherwise errors would go off
+        # to infinity
+        _assert_accuracy(retval < 1)
+
+        return self._shrink_max(retval)
+
+    @cache_on_self
+    def max_pi(self, i):
+        """ maximum value of `pi[i]`.
+        `pi[i]` is defined right below Theorem 5 of paper.
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        # `pi[i] = 1 - (1 - n[i]) * prod`
+        # where `prod` is the product of,
+        # for `j` in `0 <= j < i`, `(1 - n[j]) / (1 + d[j])`
+        min_prod = Fraction(1)
+        for j in range(i):
+            max_n_j = self.max_n(j)
+            max_d_j = self.max_d(j)
+            assert max_n_j <= 1 and max_d_j > -1, \
+                "only one quadrant of interval division implemented"
+            min_prod *= (1 - max_n_j) / (1 + max_d_j)
+        max_n_i = self.max_n(i)
+        assert max_n_i <= 1 and min_prod >= 0, \
+            "only one quadrant of interval multiplication implemented"
+        retval = 1 - (1 - max_n_i) * min_prod
+        return self._shrink_max(retval)
+
+    @cached_property
+    def max_n_shift(self):
+        """ maximum value of `state.n_shift`.
+        """
+        # numerator must be less than `denominator << self.io_width`, so
+        # `n_shift` is at most `self.io_width`
+        return self.io_width
+
+    @cached_property
+    def n_hat(self):
+        """ maximum value of, for all `i`, `max_n(i)` and `max_d(i)`
+        """
+        n_hat = Fraction(0)
+        for i in range(self.iter_count):
+            n_hat = max(n_hat, self.max_n(i), self.max_d(i))
+        return self._shrink_max(n_hat)
+
+    def __make_ops(self):
+        """ Goldschmidt division algorithm.
+
+            based on:
+            Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+            A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+            https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+            yields: GoldschmidtDivOp
+                the operations needed to perform the division.
+        """
+        # establish assumptions of the paper's error analysis (section 3.1):
+
+        # 1. normalize so A (numerator) and B (denominator) are in [1, 2)
+        yield GoldschmidtDivOp.Normalize
+
+        # 2. ensure all relative errors from directed rounding are <= 1 / 4.
+        # the assumption is met by multipliers with > 4-bits precision
+        _assert_accuracy(self.expanded_width > 4)
+
+        # 3. require `abs(e[0]) + 3 * d[0] / 2 + f[0] < 1 / 2`.
+        _assert_accuracy(self.max_abs_e0 + 3 * self.max_d(0) / 2
+                         + self.max_f(0) < Fraction(1, 2))
+
+        # 4. the initial approximation F'[-1] of 1/B is in [1/2, 1].
+        # (B is the denominator)
+
+        for addr in range(self.table_addr_count):
+            f_prime_m1 = self.table[addr]
+            _assert_accuracy(0.5 <= f_prime_m1 <= 1)
+
+        yield GoldschmidtDivOp.FEqTableLookup
+
+        # we use Setting I (section 4.1 of the paper):
+        # Require `n[i] <= n_hat` and `d[i] <= n_hat` and `f[i] = 0`:
+        # the conditions on n_hat are satisfied by construction.
+        for i in range(self.iter_count):
+            _assert_accuracy(self.max_f(i) == 0)
+            yield GoldschmidtDivOp.MulNByF
+            if i != self.iter_count - 1:
+                yield GoldschmidtDivOp.MulDByF
+                yield GoldschmidtDivOp.FEq2MinusD
+
+        # relative approximation error `p(N_prime[i])`:
+        # `p(N_prime[i]) = (A / B - N_prime[i]) / (A / B)`
+        # `0 <= p(N_prime[i])`
+        # `p(N_prime[i]) <= (2 * i) * n_hat \`
+        # ` + (abs(e[0]) + 3 * n_hat / 2) ** (2 ** i)`
+        i = self.iter_count - 1  # last used `i`
+        # compute power manually to prevent huge intermediate values
+        power = self._shrink_max(self.max_abs_e0 + 3 * self.n_hat / 2)
+        for _ in range(i):
+            power = self._shrink_max(power * power)
+
+        max_rel_error = (2 * i) * self.n_hat + power
+
+        min_a_over_b = Fraction(1, 2)
+        min_abs_error_for_correctness = min_a_over_b / (1 << self.max_n_shift)
+        min_rel_error_for_correctness = (min_abs_error_for_correctness
+                                         / min_a_over_b)
+
+        _assert_accuracy(
+            max_rel_error < min_rel_error_for_correctness,
+            f"not accurate enough: max_rel_error={max_rel_error}"
+            f" min_rel_error_for_correctness={min_rel_error_for_correctness}")
+
+        yield GoldschmidtDivOp.CalcResult
+
+    @cache_on_self
+    def default_cost_fn(self):
+        """ calculate the estimated cost on an arbitrary scale of implementing
+        goldschmidt division with the specified parameters. larger cost
+        values mean worse parameters.
+
+        This is the default cost function for `GoldschmidtDivParams.get`.
+
+        returns: float
+        """
+        rom_cells = self.table_data_bits << self.table_addr_bits
+        cost = float(rom_cells)
+        for op in self.ops:
+            if op == GoldschmidtDivOp.MulNByF \
+                    or op == GoldschmidtDivOp.MulDByF:
+                mul_cost = self.expanded_width ** 2
+                mul_cost *= self.expanded_width.bit_length()
+                cost += mul_cost
+        cost += 5e7 * self.iter_count
+        return cost
+
+    @staticmethod
+    @lru_cache(maxsize=1 << 16)
+    def __cached_new(base_params):
+        assert isinstance(base_params, GoldschmidtDivParamsBase)
+        kwargs = {}
+        for field in fields(GoldschmidtDivParamsBase):
+            kwargs[field] = getattr(base_params, field)
+        try:
+            return GoldschmidtDivParams(**kwargs), None
+        except ParamsNotAccurateEnough as e:
+            return None, e
+
+    @staticmethod
+    def __raise(e):  # type: (ParamsNotAccurateEnough) -> Any
+        raise e
+
+    @staticmethod
+    def cached_new(base_params, handle_error=__raise):
+        assert isinstance(base_params, GoldschmidtDivParamsBase)
+        params, error = GoldschmidtDivParams.__cached_new(base_params)
+        if error is None:
+            return params
+        else:
+            return handle_error(error)
+
+    @staticmethod
+    def get(io_width, cost_fn=default_cost_fn, max_table_addr_bits=12):
+        """ find efficient parameters for a goldschmidt division algorithm
+        with `params.io_width == io_width`.
+
+        arguments:
+        io_width: int
+            bit-width of the input divisor and the result.
+            the input numerator is `2 * io_width`-bits wide.
+        cost_fn: Callable[[GoldschmidtDivParams], float]
+            return the estimated cost on an arbitrary scale of implementing
+            goldschmidt division with the specified parameters. larger cost
+            values mean worse parameters.
+        max_table_addr_bits: int
+            maximum allowable value of `table_addr_bits`
+        """
+        assert isinstance(io_width, int) and io_width >= 1
+        assert callable(cost_fn)
+
+        last_error = None
+        last_error_params = None
+
+        def cached_new(base_params):
+            def handle_error(e):
+                nonlocal last_error, last_error_params
+                last_error = e
+                last_error_params = base_params
+                return None
+
+            retval = GoldschmidtDivParams.cached_new(base_params, handle_error)
+            if retval is None:
+                logging.debug(f"GoldschmidtDivParams.get: err: {base_params}")
+            else:
+                logging.debug(f"GoldschmidtDivParams.get: ok: {base_params}")
+            return retval
+
+        @lru_cache(maxsize=None)
+        def get_cost(base_params):
+            params = cached_new(base_params)
+            if params is None:
+                return math.inf
+            retval = cost_fn(params)
+            logging.debug(f"GoldschmidtDivParams.get: cost={retval}: {params}")
+            return retval
+
+        # start with parameters big enough to always work.
+        initial_extra_precision = io_width * 2 + 4
+        initial_params = GoldschmidtDivParamsBase(
+            io_width=io_width,
+            extra_precision=initial_extra_precision,
+            table_addr_bits=min(max_table_addr_bits, io_width),
+            table_data_bits=io_width + initial_extra_precision,
+            iter_count=1 + io_width.bit_length())
+
+        if cached_new(initial_params) is None:
+            raise ValueError(f"initial goldschmidt division algorithm "
+                             f"parameters are invalid: {initial_params}"
+                             ) from last_error
+
+        # find good initial `iter_count`
+        params = initial_params
+        for iter_count in range(1, initial_params.iter_count):
+            trial_params = replace(params, iter_count=iter_count)
+            if cached_new(trial_params) is not None:
+                params = trial_params
+                break
+
+        # now find `table_addr_bits`
+        cost = get_cost(params)
+        for table_addr_bits in range(1, max_table_addr_bits):
+            trial_params = replace(params, table_addr_bits=table_addr_bits)
+            trial_cost = get_cost(trial_params)
+            if trial_cost < cost:
+                params = trial_params
+                cost = trial_cost
+                break
+
+        # check one higher `iter_count` to see if it has lower cost
+        for table_addr_bits in range(1, max_table_addr_bits + 1):
+            trial_params = replace(params,
+                                   table_addr_bits=table_addr_bits,
+                                   iter_count=params.iter_count + 1)
+            trial_cost = get_cost(trial_params)
+            if trial_cost < cost:
+                params = trial_params
+                cost = trial_cost
+                break
+
+        # now shrink `table_data_bits`
+        while True:
+            trial_params = replace(params,
+                                   table_data_bits=params.table_data_bits - 1)
+            trial_cost = get_cost(trial_params)
+            if trial_cost < cost:
+                params = trial_params
+                cost = trial_cost
+            else:
+                break
+
+        # and shrink `extra_precision`
+        while True:
+            trial_params = replace(params,
+                                   extra_precision=params.extra_precision - 1)
+            trial_cost = get_cost(trial_params)
+            if trial_cost < cost:
+                params = trial_params
+                cost = trial_cost
+            else:
+                break
+
+        retval = cached_new(params)
+        assert isinstance(retval, GoldschmidtDivParams)
+        return retval
+
+
+def clz(v, wid):
+    """count leading zeros -- handy for debugging."""
+    assert isinstance(wid, int)
+    assert isinstance(v, int) and 0 <= v < (1 << wid)
+    return (1 << wid).bit_length() - v.bit_length()
+
+
+@enum.unique
+class GoldschmidtDivOp(enum.Enum):
+    Normalize = "n, d, n_shift = normalize(n, d)"
+    FEqTableLookup = "f = table_lookup(d)"
+    MulNByF = "n *= f"
+    MulDByF = "d *= f"
+    FEq2MinusD = "f = 2 - d"
+    CalcResult = "result = unnormalize_and_round(n)"
+
+    def run(self, params, state):
+        assert isinstance(params, GoldschmidtDivParams)
+        assert isinstance(state, GoldschmidtDivState)
+        expanded_width = params.expanded_width
+        table_addr_bits = params.table_addr_bits
+        if self == GoldschmidtDivOp.Normalize:
+            # normalize so 1 <= d < 2
+            # can easily be done with count-leading-zeros and left shift
+            while state.d < 1:
+                state.n = (state.n * 2).to_frac_wid(expanded_width)
+                state.d = (state.d * 2).to_frac_wid(expanded_width)
+
+            state.n_shift = 0
+            # normalize so 1 <= n < 2
+            while state.n >= 2:
+                state.n = (state.n * 0.5).to_frac_wid(expanded_width,
+                                                      round_dir=RoundDir.DOWN)
+                state.n_shift += 1
+        elif self == GoldschmidtDivOp.FEqTableLookup:
+            # compute initial f by table lookup
+            d_m_1 = state.d - 1
+            d_m_1 = d_m_1.to_frac_wid(table_addr_bits, RoundDir.DOWN)
+            assert 0 <= d_m_1.bits < (1 << params.table_addr_bits)
+            state.f = params.table[d_m_1.bits]
+            state.f = state.f.to_frac_wid(expanded_width,
+                                          round_dir=RoundDir.DOWN)
+        elif self == GoldschmidtDivOp.MulNByF:
+            assert state.f is not None
+            n = state.n * state.f
+            state.n = n.to_frac_wid(expanded_width, round_dir=RoundDir.DOWN)
+        elif self == GoldschmidtDivOp.MulDByF:
+            assert state.f is not None
+            d = state.d * state.f
+            state.d = d.to_frac_wid(expanded_width, round_dir=RoundDir.UP)
+        elif self == GoldschmidtDivOp.FEq2MinusD:
+            state.f = (2 - state.d).to_frac_wid(expanded_width)
+        elif self == GoldschmidtDivOp.CalcResult:
+            assert state.n_shift is not None
+            # scale to correct value
+            n = state.n * (1 << state.n_shift)
+
+            state.quotient = math.floor(n)
+            state.remainder = state.orig_n - state.quotient * state.orig_d
+            if state.remainder >= state.orig_d:
+                state.quotient += 1
+                state.remainder -= state.orig_d
+        else:
+            assert False, f"unimplemented GoldschmidtDivOp: {self}"
+
+    def gen_hdl(self, params, state, sync_rom):
+        """generate the hdl for this operation.
+
+        arguments:
+        params: GoldschmidtDivParams
+            the goldschmidt division parameters.
+        state: GoldschmidtDivHDLState
+            the input/output state
+        sync_rom: bool
+            true if the rom should be read synchronously rather than
+            combinatorially, incurring an extra clock cycle of latency.
+        """
+        assert isinstance(params, GoldschmidtDivParams)
+        assert isinstance(state, GoldschmidtDivHDLState)
+        m = state.m
+        if self == GoldschmidtDivOp.Normalize:
+            # normalize so 1 <= d < 2
+            assert state.d.width == params.io_width
+            assert state.n.width == 2 * params.io_width
+            d_leading_zeros = CLZ(params.io_width)
+            m.submodules.d_leading_zeros = d_leading_zeros
+            m.d.comb += d_leading_zeros.sig_in.eq(state.d)
+            d_shift_out = Signal.like(state.d)
+            m.d.comb += d_shift_out.eq(state.d << d_leading_zeros.lz)
+            d = Signal(params.n_d_f_total_wid)
+            m.d.comb += d.eq((d_shift_out << (1 + params.expanded_width))
+                             >> state.d.width)
+
+            # normalize so 1 <= n < 2
+            n_leading_zeros = CLZ(2 * params.io_width)
+            m.submodules.n_leading_zeros = n_leading_zeros
+            m.d.comb += n_leading_zeros.sig_in.eq(state.n)
+            signed_zero = Const(0, signed(1))  # force subtraction to be signed
+            n_shift_s_v = (params.io_width + signed_zero + d_leading_zeros.lz
+                           - n_leading_zeros.lz)
+            n_shift_s = Signal.like(n_shift_s_v)
+            n_shift_n_lz_out = Signal.like(state.n)
+            n_shift_d_lz_out = Signal.like(state.n << d_leading_zeros.lz)
+            m.d.comb += [
+                n_shift_s.eq(n_shift_s_v),
+                n_shift_d_lz_out.eq(state.n << d_leading_zeros.lz),
+                n_shift_n_lz_out.eq(state.n << n_leading_zeros.lz),
+            ]
+            state.n_shift = Signal(d_leading_zeros.lz.width)
+            n = Signal(params.n_d_f_total_wid)
+            with m.If(n_shift_s < 0):
+                m.d.comb += [
+                    state.n_shift.eq(0),
+                    n.eq((n_shift_d_lz_out << (1 + params.expanded_width))
+                         >> state.d.width),
+                ]
+            with m.Else():
+                m.d.comb += [
+                    state.n_shift.eq(n_shift_s),
+                    n.eq((n_shift_n_lz_out << (1 + params.expanded_width))
+                         >> state.n.width),
+                ]
+            state.n = n
+            state.d = d
+        elif self == GoldschmidtDivOp.FEqTableLookup:
+            assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+            # compute initial f by table lookup
+
+            # extra bit for table entries == 1.0
+            table_width = 1 + params.table_data_bits
+            table = Memory(width=table_width, depth=len(params.table),
+                           init=[i.bits for i in params.table])
+            addr = state.d[:-params.n_d_f_int_wid][-params.table_addr_bits:]
+            if sync_rom:
+                table_read = table.read_port()
+                m.d.comb += table_read.addr.eq(addr)
+                state.insert_pipeline_register()
+            else:
+                table_read = table.read_port(domain="comb")
+                m.d.comb += table_read.addr.eq(addr)
+            m.submodules.table_read = table_read
+            state.f = Signal(params.n_d_f_int_wid + params.expanded_width)
+            data_shift = params.expanded_width - params.table_data_bits
+            m.d.comb += state.f.eq(table_read.data << data_shift)
+        elif self == GoldschmidtDivOp.MulNByF:
+            assert state.n.width == params.n_d_f_total_wid, "invalid n width"
+            assert state.f is not None
+            assert state.f.width == params.n_d_f_total_wid, "invalid f width"
+            n = Signal.like(state.n)
+            m.d.comb += n.eq((state.n * state.f) >> params.expanded_width)
+            state.n = n
+        elif self == GoldschmidtDivOp.MulDByF:
+            assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+            assert state.f is not None
+            assert state.f.width == params.n_d_f_total_wid, "invalid f width"
+            d = Signal.like(state.d)
+            d_times_f = Signal.like(state.d * state.f)
+            m.d.comb += [
+                d_times_f.eq(state.d * state.f),
+                # round the multiplication up
+                d.eq((d_times_f >> params.expanded_width)
+                     + (d_times_f[:params.expanded_width] != 0)),
+            ]
+            state.d = d
+        elif self == GoldschmidtDivOp.FEq2MinusD:
+            assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+            f = Signal.like(state.d)
+            m.d.comb += f.eq((2 << params.expanded_width) - state.d)
+            state.f = f
+        elif self == GoldschmidtDivOp.CalcResult:
+            assert state.n.width == params.n_d_f_total_wid, "invalid n width"
+            assert state.n_shift is not None
+            # scale to correct value
+            n = state.n * (1 << state.n_shift)
+            q_approx = Signal(params.io_width)
+            # extra bit for if it's bigger than orig_d
+            r_approx = Signal(params.io_width + 1)
+            adjusted_r = Signal(signed(1 + params.io_width))
+            m.d.comb += [
+                q_approx.eq((state.n << state.n_shift)
+                            >> params.expanded_width),
+                r_approx.eq(state.orig_n - q_approx * state.orig_d),
+                adjusted_r.eq(r_approx - state.orig_d),
+            ]
+            state.quotient = Signal(params.io_width)
+            state.remainder = Signal(params.io_width)
+
+            with m.If(adjusted_r >= 0):
+                m.d.comb += [
+                    state.quotient.eq(q_approx + 1),
+                    state.remainder.eq(adjusted_r),
+                ]
+            with m.Else():
+                m.d.comb += [
+                    state.quotient.eq(q_approx),
+                    state.remainder.eq(r_approx),
+                ]
+        else:
+            assert False, f"unimplemented GoldschmidtDivOp: {self}"
+
+
+@plain_data(repr=False)
+class GoldschmidtDivState:
+    __slots__ = ("orig_n", "orig_d", "n", "d",
+                 "f", "quotient", "remainder", "n_shift")
+
+    def __init__(self, orig_n, orig_d, n, d,
+                 f=None, quotient=None, remainder=None, n_shift=None):
+        assert isinstance(orig_n, int)
+        assert isinstance(orig_d, int)
+        assert isinstance(n, FixedPoint)
+        assert isinstance(d, FixedPoint)
+        assert f is None or isinstance(f, FixedPoint)
+        assert quotient is None or isinstance(quotient, int)
+        assert remainder is None or isinstance(remainder, int)
+        assert n_shift is None or isinstance(n_shift, int)
+        self.orig_n = orig_n
+        """original numerator"""
+
+        self.orig_d = orig_d
+        """original denominator"""
+
+        self.n = n
+        """numerator -- N_prime[i] in the paper's algorithm 2"""
+
+        self.d = d
+        """denominator -- D_prime[i] in the paper's algorithm 2"""
+
+        self.f = f
+        """current factor -- F_prime[i] in the paper's algorithm 2"""
+
+        self.quotient = quotient
+        """final quotient"""
+
+        self.remainder = remainder
+        """final remainder"""
+
+        self.n_shift = n_shift
+        """amount the numerator needs to be left-shifted at the end of the
+        algorithm.
+        """
+
+    def __repr__(self):
+        fields_str = []
+        for field in fields(GoldschmidtDivState):
+            value = getattr(self, field)
+            if value is None:
+                continue
+            if isinstance(value, int) and field != "n_shift":
+                fields_str.append(f"{field}={hex(value)}")
+            else:
+                fields_str.append(f"{field}={value!r}")
+        return f"GoldschmidtDivState({', '.join(fields_str)})"
+
+
+def goldschmidt_div(n, d, params, trace=lambda state: None):
+    """ Goldschmidt division algorithm.
+
+        based on:
+        Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+        A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+        https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+        arguments:
+        n: int
+            numerator. a `2*width`-bit unsigned integer.
+            must be less than `d << width`, otherwise the quotient wouldn't
+            fit in `width` bits.
+        d: int
+            denominator. a `width`-bit unsigned integer. must not be zero.
+        width: int
+            the bit-width of the inputs/outputs. must be a positive integer.
+        trace: Function[[GoldschmidtDivState], None]
+            called with the initial state and the state after executing each
+            operation in `params.ops`.
+
+        returns: tuple[int, int]
+            the quotient and remainder. a tuple of two `width`-bit unsigned
+            integers.
+    """
+    assert isinstance(params, GoldschmidtDivParams)
+    assert isinstance(d, int) and 0 < d < (1 << params.io_width)
+    assert isinstance(n, int) and 0 <= n < (d << params.io_width)
+
+    # this whole algorithm is done with fixed-point arithmetic where values
+    # have `width` fractional bits
+
+    state = GoldschmidtDivState(
+        orig_n=n,
+        orig_d=d,
+        n=FixedPoint(n, params.io_width),
+        d=FixedPoint(d, params.io_width),
+    )
+
+    trace(state)
+    for op in params.ops:
+        op.run(params, state)
+        trace(state)
+
+    assert state.quotient is not None
+    assert state.remainder is not None
+
+    return state.quotient, state.remainder
+
+
+@plain_data(eq=False)
+class GoldschmidtDivHDLState:
+    __slots__ = ("m", "orig_n", "orig_d", "n", "d",
+                 "f", "quotient", "remainder", "n_shift")
+
+    __signal_name_prefix = "state_"
+
+    def __init__(self, m, orig_n, orig_d, n, d,
+                 f=None, quotient=None, remainder=None, n_shift=None):
+        assert isinstance(m, Module)
+        assert isinstance(orig_n, Signal)
+        assert isinstance(orig_d, Signal)
+        assert isinstance(n, Signal)
+        assert isinstance(d, Signal)
+        assert f is None or isinstance(f, Signal)
+        assert quotient is None or isinstance(quotient, Signal)
+        assert remainder is None or isinstance(remainder, Signal)
+        assert n_shift is None or isinstance(n_shift, Signal)
+
+        self.m = m
+        """The HDL Module"""
+
+        self.orig_n = orig_n
+        """original numerator"""
+
+        self.orig_d = orig_d
+        """original denominator"""
+
+        self.n = n
+        """numerator -- N_prime[i] in the paper's algorithm 2"""
+
+        self.d = d
+        """denominator -- D_prime[i] in the paper's algorithm 2"""
+
+        self.f = f
+        """current factor -- F_prime[i] in the paper's algorithm 2"""
+
+        self.quotient = quotient
+        """final quotient"""
+
+        self.remainder = remainder
+        """final remainder"""
+
+        self.n_shift = n_shift
+        """amount the numerator needs to be left-shifted at the end of the
+        algorithm.
+        """
+
+        # old_signals must be set last
+        self.old_signals = defaultdict(list)
+
+    def __setattr__(self, name, value):
+        assert isinstance(name, str)
+        if name.startswith("_"):
+            return super().__setattr__(name, value)
+        try:
+            old_signals = self.old_signals[name]
+        except AttributeError:
+            # haven't yet finished __post_init__
+            return super().__setattr__(name, value)
+        assert name != "m" and name != "old_signals", f"can't write to {name}"
+        assert isinstance(value, Signal)
+        value.name = f"{self.__signal_name_prefix}{name}_{len(old_signals)}"
+        old_signal = getattr(self, name, None)
+        if old_signal is not None:
+            assert isinstance(old_signal, Signal)
+            old_signals.append(old_signal)
+        return super().__setattr__(name, value)
+
+    def insert_pipeline_register(self):
+        old_prefix = self.__signal_name_prefix
+        try:
+            for field in fields(GoldschmidtDivHDLState):
+                if field.startswith("_") or field == "m":
+                    continue
+                old_sig = getattr(self, field, None)
+                if old_sig is None:
+                    continue
+                assert isinstance(old_sig, Signal)
+                new_sig = Signal.like(old_sig)
+                setattr(self, field, new_sig)
+                self.m.d.sync += new_sig.eq(old_sig)
+        finally:
+            self.__signal_name_prefix = old_prefix
+
+
+class GoldschmidtDivHDL(Elaboratable):
+    """ Goldschmidt division algorithm.
+
+        based on:
+        Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+        A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+        https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+        attributes:
+        params: GoldschmidtDivParams
+            the goldschmidt division algorithm parameters.
+        pipe_reg_indexes: list[int]
+            the operation indexes where pipeline registers should be inserted.
+            duplicate values mean multiple registers should be inserted for
+            that operation index -- this is useful to allow yosys to spread a
+            multiplication across those multiple pipeline stages.
+        sync_rom: bool
+            true if the rom should be read synchronously rather than
+            combinatorially, incurring an extra clock cycle of latency.
+        n: Signal(unsigned(2 * params.io_width))
+            input numerator. a `2 * params.io_width`-bit unsigned integer.
+            must be less than `d << params.io_width`, otherwise the quotient
+            wouldn't fit in `params.io_width` bits.
+        d: Signal(unsigned(params.io_width))
+            input denominator. a `params.io_width`-bit unsigned integer.
+            must not be zero.
+        q: Signal(unsigned(params.io_width))
+            output quotient. only valid when `n < (d << params.io_width)`.
+        r: Signal(unsigned(params.io_width))
+            output remainder. only valid when `n < (d << params.io_width)`.
+        trace: list[GoldschmidtDivHDLState]
+            list of the initial state and the state after executing each
+            operation in `params.ops`.
+    """
+
+    @property
+    def total_pipeline_registers(self):
+        """the total number of pipeline registers"""
+        return len(self.pipe_reg_indexes) + self.sync_rom
+
+    def __init__(self, params, pipe_reg_indexes=(), sync_rom=False):
+        assert isinstance(params, GoldschmidtDivParams)
+        assert isinstance(sync_rom, bool)
+        self.params = params
+        self.pipe_reg_indexes = sorted(int(i) for i in pipe_reg_indexes)
+        self.sync_rom = sync_rom
+        self.n = Signal(unsigned(2 * params.io_width))
+        self.d = Signal(unsigned(params.io_width))
+        self.q = Signal(unsigned(params.io_width))
+        self.r = Signal(unsigned(params.io_width))
+
+        # in constructor so we get trace without needing to call elaborate
+        state = GoldschmidtDivHDLState(
+            m=Module(),
+            orig_n=self.n,
+            orig_d=self.d,
+            n=self.n,
+            d=self.d)
+
+        self.trace = [replace(state)]
+
+        # copy and reverse
+        pipe_reg_indexes = list(reversed(self.pipe_reg_indexes))
+
+        for op_index, op in enumerate(self.params.ops):
+            while len(pipe_reg_indexes) > 0 \
+                    and pipe_reg_indexes[-1] <= op_index:
+                pipe_reg_indexes.pop()
+                state.insert_pipeline_register()
+            op.gen_hdl(self.params, state, self.sync_rom)
+            self.trace.append(replace(state))
+
+        while len(pipe_reg_indexes) > 0:
+            pipe_reg_indexes.pop()
+            state.insert_pipeline_register()
+
+        state.m.d.comb += [
+            self.q.eq(state.quotient),
+            self.r.eq(state.remainder),
+        ]
+
+    def elaborate(self, platform):
+        return self.trace[0].m
+
+
+GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID = 2
+
+
+@lru_cache()
+def goldschmidt_sqrt_rsqrt_table(table_addr_bits, table_data_bits):
+    """Generate the look-up table needed for Goldschmidt's square-root and
+    reciprocal-square-root algorithm.
+
+    arguments:
+    table_addr_bits: int
+        the number of address bits for the look-up table.
+    table_data_bits: int
+        the number of data bits for the look-up table.
+    """
+    assert isinstance(table_addr_bits, int) and \
+        table_addr_bits >= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+    assert isinstance(table_data_bits, int) and table_data_bits >= 1
+    table = []
+    table_len = 1 << table_addr_bits
+    for addr in range(table_len):
+        if addr == 0:
+            value = FixedPoint(0, table_data_bits)
+        elif (addr << 2) < table_len:
+            value = None  # table entries should be unused
+        else:
+            table_addr_frac_wid = table_addr_bits
+            table_addr_frac_wid -= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+            max_input_value = FixedPoint(addr + 1, table_addr_bits - 2)
+            max_frac_wid = max(max_input_value.frac_wid, table_data_bits)
+            value = max_input_value.to_frac_wid(max_frac_wid)
+            value = value.rsqrt(RoundDir.DOWN)
+            value = value.to_frac_wid(table_data_bits, RoundDir.DOWN)
+        table.append(value)
+
+    # tuple for immutability
+    return tuple(table)
+
+# FIXME: add code to calculate error bounds and check that the algorithm will
+# actually work (like in the goldschmidt division algorithm).
+# FIXME: add code to calculate a good set of parameters based on the error
+# bounds checking.
+
+
+def goldschmidt_sqrt_rsqrt(radicand, io_width, frac_wid, extra_precision,
+                           table_addr_bits, table_data_bits, iter_count):
+    """Goldschmidt's square-root and reciprocal-square-root algorithm.
+
+    uses algorithm based on second method at:
+    https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Goldschmidt%E2%80%99s_algorithm
+
+    arguments:
+    radicand: FixedPoint(frac_wid=frac_wid)
+        the input value to take the square-root and reciprocal-square-root of.
+    io_width: int
+        the number of bits in the input (`radicand`) and output values.
+    frac_wid: int
+        the number of fraction bits in the input (`radicand`) and output
+        values.
+    extra_precision: int
+        the number of bits of internal extra precision.
+    table_addr_bits: int
+        the number of address bits for the look-up table.
+    table_data_bits: int
+        the number of data bits for the look-up table.
+
+    returns: tuple[FixedPoint, FixedPoint]
+        the square-root and reciprocal-square-root, rounded down to the
+        nearest representable value. If `radicand == 0`, then the
+        reciprocal-square-root value returned is zero.
+    """
+    assert (isinstance(radicand, FixedPoint)
+            and radicand.frac_wid == frac_wid
+            and 0 <= radicand.bits < (1 << io_width))
+    assert isinstance(io_width, int) and io_width >= 1
+    assert isinstance(frac_wid, int) and 0 <= frac_wid < io_width
+    assert isinstance(extra_precision, int) and extra_precision >= io_width
+    assert isinstance(table_addr_bits, int) and table_addr_bits >= 1
+    assert isinstance(table_data_bits, int) and table_data_bits >= 1
+    assert isinstance(iter_count, int) and iter_count >= 0
+    expanded_frac_wid = frac_wid + extra_precision
+    s = radicand.to_frac_wid(expanded_frac_wid)
+    sqrt_rshift = extra_precision
+    rsqrt_rshift = extra_precision
+    while s != 0 and s < 1:
+        s = (s * 4).to_frac_wid(expanded_frac_wid)
+        sqrt_rshift += 1
+        rsqrt_rshift -= 1
+    while s >= 4:
+        s = s.div(4, expanded_frac_wid)
+        sqrt_rshift -= 1
+        rsqrt_rshift += 1
+    table = goldschmidt_sqrt_rsqrt_table(table_addr_bits=table_addr_bits,
+                                         table_data_bits=table_data_bits)
+    # core goldschmidt sqrt/rsqrt algorithm:
+    # initial setup:
+    table_addr_frac_wid = table_addr_bits
+    table_addr_frac_wid -= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+    addr = s.to_frac_wid(table_addr_frac_wid, RoundDir.DOWN)
+    assert 0 <= addr.bits < (1 << table_addr_bits), "table addr out of range"
+    f = table[addr.bits]
+    assert f is not None, "accessed invalid table entry"
+    # use with_frac_wid to fix IDE type deduction
+    f = FixedPoint.with_frac_wid(f, expanded_frac_wid, RoundDir.DOWN)
+    x = (s * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+    h = (f * 0.5).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+    for _ in range(iter_count):
+        # iteration step:
+        f = (1.5 - x * h).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+        x = (x * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+        h = (h * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+    r = 2 * h
+    # now `x` is approximately `sqrt(s)` and `r` is approximately `rsqrt(s)`
+
+    sqrt = FixedPoint(x.bits >> sqrt_rshift, frac_wid)
+    rsqrt = FixedPoint(r.bits >> rsqrt_rshift, frac_wid)
+
+    next_sqrt = FixedPoint(sqrt.bits + 1, frac_wid)
+    if next_sqrt * next_sqrt <= radicand:
+        sqrt = next_sqrt
+
+    next_rsqrt = FixedPoint(rsqrt.bits + 1, frac_wid)
+    if next_rsqrt * next_rsqrt * radicand <= 1 and radicand != 0:
+        rsqrt = next_rsqrt
+    return sqrt, rsqrt
diff --git a/src/soc/fu/div/experiment/test/__init__.py b/src/soc/fu/div/experiment/test/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/soc/fu/div/experiment/test/test_goldschmidt_div_sqrt.py b/src/soc/fu/div/experiment/test/test_goldschmidt_div_sqrt.py
new file mode 100644 (file)
index 0000000..28e795f
--- /dev/null
@@ -0,0 +1,480 @@
+# SPDX-License-Identifier: LGPL-3-or-later
+# Copyright 2022 Jacob Lifshay programmerjake@gmail.com
+
+# Funded by NLnet Assure Programme 2021-02-052, https://nlnet.nl/assure part
+# of Horizon 2020 EU Programme 957073.
+
+from nmutil.plain_data import fields, replace
+import math
+import unittest
+from nmutil.formaltest import FHDLTestCase
+from nmutil.sim_util import do_sim, hash_256
+from nmigen.sim import Tick, Delay
+from nmigen.hdl.ast import Signal
+from nmigen.hdl.dsl import Module
+from soc.fu.div.experiment.goldschmidt_div_sqrt import (
+    GoldschmidtDivHDL, GoldschmidtDivHDLState, GoldschmidtDivParams,
+    GoldschmidtDivState, ParamsNotAccurateEnough, goldschmidt_div,
+    FixedPoint, RoundDir, goldschmidt_sqrt_rsqrt)
+
+
+class TestFixedPoint(FHDLTestCase):
+    def test_str_roundtrip(self):
+        for frac_wid in range(8):
+            for bits in range(-1 << 9, 1 << 9):
+                with self.subTest(bits=hex(bits), frac_wid=frac_wid):
+                    value = FixedPoint(bits, frac_wid)
+                    round_trip_value = FixedPoint.cast(str(value))
+                    self.assertEqual(value, round_trip_value)
+
+    @staticmethod
+    def trap(f):
+        try:
+            return f(), None
+        except (ValueError, ZeroDivisionError) as e:
+            return None, e.__class__.__name__
+
+    def test_sqrt(self):
+        for frac_wid in range(8):
+            for bits in range(1 << 9):
+                for round_dir in RoundDir:
+                    radicand = FixedPoint(bits, frac_wid)
+                    expected_f = math.sqrt(float(radicand))
+                    expected = self.trap(lambda: FixedPoint.with_frac_wid(
+                        expected_f, frac_wid, round_dir))
+                    with self.subTest(radicand=repr(radicand),
+                                      round_dir=str(round_dir),
+                                      expected=repr(expected)):
+                        result = self.trap(lambda: radicand.sqrt(round_dir))
+                        self.assertEqual(result, expected)
+
+    def test_rsqrt(self):
+        for frac_wid in range(8):
+            for bits in range(1, 1 << 9):
+                for round_dir in RoundDir:
+                    radicand = FixedPoint(bits, frac_wid)
+                    expected_f = 1 / math.sqrt(float(radicand))
+                    expected = self.trap(lambda: FixedPoint.with_frac_wid(
+                        expected_f, frac_wid, round_dir))
+                    with self.subTest(radicand=repr(radicand),
+                                      round_dir=str(round_dir),
+                                      expected=repr(expected)):
+                        result = self.trap(lambda: radicand.rsqrt(round_dir))
+                        self.assertEqual(result, expected)
+
+
+class TestGoldschmidtDiv(FHDLTestCase):
+    def test_case1(self):
+        with self.assertRaises(ParamsNotAccurateEnough):
+            GoldschmidtDivParams(io_width=3, extra_precision=2,
+                                 table_addr_bits=3, table_data_bits=5,
+                                 iter_count=2)
+
+    def test_case2(self):
+        with self.assertRaises(ParamsNotAccurateEnough):
+            GoldschmidtDivParams(io_width=4, extra_precision=1,
+                                 table_addr_bits=1, table_data_bits=5,
+                                 iter_count=1)
+
+    @staticmethod
+    def cases(io_width, cases=None):
+        assert isinstance(io_width, int) and io_width >= 1
+        if cases is not None:
+            for n, d in cases:
+                assert isinstance(d, int) \
+                    and 0 < d < (1 << io_width), "invalid case"
+                assert isinstance(n, int) \
+                    and 0 <= n < (d << io_width), "invalid case"
+                yield (n, d)
+        elif io_width > 6:
+            assert io_width * 2 <= 256, \
+                "can't generate big enough numbers for test cases"
+            for i in range(10000):
+                d = hash_256(f'd {i}') % (1 << io_width)
+                if d == 0:
+                    d = 1
+                n = hash_256(f'n {i}') % (d << io_width)
+                yield (n, d)
+        else:
+            for d in range(1, 1 << io_width):
+                for n in range(d << io_width):
+                    yield (n, d)
+
+    def tst(self, io_width, cases=None):
+        assert isinstance(io_width, int)
+        params = GoldschmidtDivParams.get(io_width)
+        with self.subTest(params=str(params)):
+            for n, d in self.cases(io_width, cases):
+                expected_q, expected_r = divmod(n, d)
+                with self.subTest(n=hex(n), d=hex(d),
+                                  expected_q=hex(expected_q),
+                                  expected_r=hex(expected_r)):
+                    trace = []
+
+                    def trace_fn(state):
+                        assert isinstance(state, GoldschmidtDivState)
+                        trace.append((replace(state)))
+                    q, r = goldschmidt_div(n, d, params, trace=trace_fn)
+                    with self.subTest(q=hex(q), r=hex(r), trace=repr(trace)):
+                        self.assertEqual((q, r), (expected_q, expected_r))
+
+    def tst_sim(self, io_width, cases=None, pipe_reg_indexes=(),
+                sync_rom=False):
+        assert isinstance(io_width, int)
+        params = GoldschmidtDivParams.get(io_width)
+        m = Module()
+        dut = GoldschmidtDivHDL(params, pipe_reg_indexes=pipe_reg_indexes,
+                                sync_rom=sync_rom)
+        m.submodules.dut = dut
+        # make sync domain get added
+        m.d.sync += Signal().eq(0)
+
+        def inputs_proc():
+            yield Tick()
+            for n, d in self.cases(io_width, cases):
+                yield dut.n.eq(n)
+                yield dut.d.eq(d)
+                yield Tick()
+
+        def check_interals(n, d):
+            # check internals only if dut is completely combinatorial
+            # so we don't have to figure out how to read values in
+            # previous clock cycles
+            if dut.total_pipeline_registers != 0:
+                return
+            ref_trace = []
+
+            def ref_trace_fn(state):
+                assert isinstance(state, GoldschmidtDivState)
+                ref_trace.append((replace(state)))
+            goldschmidt_div(n=n, d=d, params=params, trace=ref_trace_fn)
+            self.assertEqual(len(dut.trace), len(ref_trace))
+            for index, state in enumerate(dut.trace):
+                ref_state = ref_trace[index]
+                last_op = None if index == 0 else params.ops[index - 1]
+                with self.subTest(index=index, state=repr(state),
+                                  ref_state=repr(ref_state),
+                                  last_op=str(last_op)):
+                    for field in fields(GoldschmidtDivHDLState):
+                        sig = getattr(state, field)
+                        if not isinstance(sig, Signal):
+                            continue
+                        ref_value = getattr(ref_state, field)
+                        ref_value_str = repr(ref_value)
+                        if isinstance(ref_value, int):
+                            ref_value_str = hex(ref_value)
+                        value = yield sig
+                        with self.subTest(field_name=field,
+                                          sig=repr(sig),
+                                          sig_shape=repr(sig.shape()),
+                                          value=hex(value),
+                                          ref_value=ref_value_str):
+                            if isinstance(ref_value, int):
+                                self.assertEqual(value, ref_value)
+                            else:
+                                assert isinstance(ref_value, FixedPoint)
+                                self.assertEqual(value, ref_value.bits)
+
+        def check_outputs():
+            yield Tick()
+            for _ in range(dut.total_pipeline_registers):
+                yield Tick()
+            for n, d in self.cases(io_width, cases):
+                yield Delay(0.1e-6)
+                expected_q, expected_r = divmod(n, d)
+                with self.subTest(n=hex(n), d=hex(d),
+                                  expected_q=hex(expected_q),
+                                  expected_r=hex(expected_r)):
+                    q = yield dut.q
+                    r = yield dut.r
+                    with self.subTest(q=hex(q), r=hex(r)):
+                        self.assertEqual((q, r), (expected_q, expected_r))
+                    yield from check_interals(n, d)
+
+                yield Tick()
+
+        with self.subTest(params=str(params)):
+            with do_sim(self, m, (dut.n, dut.d, dut.q, dut.r)) as sim:
+                sim.add_clock(1e-6)
+                sim.add_process(inputs_proc)
+                sim.add_process(check_outputs)
+                sim.run()
+
+    def test_1_through_4(self):
+        for io_width in range(1, 4 + 1):
+            with self.subTest(io_width=io_width):
+                self.tst(io_width)
+
+    def test_5(self):
+        self.tst(5)
+
+    def test_6(self):
+        self.tst(6)
+
+    def test_8(self):
+        self.tst(8)
+
+    def test_16(self):
+        self.tst(16)
+
+    def test_32(self):
+        self.tst(32)
+
+    def test_64(self):
+        self.tst(64)
+
+    def test_sim_5(self):
+        self.tst_sim(5)
+
+    def test_sim_8(self):
+        self.tst_sim(8)
+
+    def test_sim_16(self):
+        self.tst_sim(16)
+
+    def test_sim_32(self):
+        self.tst_sim(32)
+
+    def test_sim_64(self):
+        self.tst_sim(64)
+
+    def tst_params(self, io_width):
+        assert isinstance(io_width, int)
+        params = GoldschmidtDivParams.get(io_width)
+        print()
+        print(params)
+
+    def test_params_1(self):
+        self.tst_params(1)
+
+    def test_params_2(self):
+        self.tst_params(2)
+
+    def test_params_3(self):
+        self.tst_params(3)
+
+    def test_params_4(self):
+        self.tst_params(4)
+
+    def test_params_5(self):
+        self.tst_params(5)
+
+    def test_params_6(self):
+        self.tst_params(6)
+
+    def test_params_7(self):
+        self.tst_params(7)
+
+    def test_params_8(self):
+        self.tst_params(8)
+
+    def test_params_9(self):
+        self.tst_params(9)
+
+    def test_params_10(self):
+        self.tst_params(10)
+
+    def test_params_11(self):
+        self.tst_params(11)
+
+    def test_params_12(self):
+        self.tst_params(12)
+
+    def test_params_13(self):
+        self.tst_params(13)
+
+    def test_params_14(self):
+        self.tst_params(14)
+
+    def test_params_15(self):
+        self.tst_params(15)
+
+    def test_params_16(self):
+        self.tst_params(16)
+
+    def test_params_17(self):
+        self.tst_params(17)
+
+    def test_params_18(self):
+        self.tst_params(18)
+
+    def test_params_19(self):
+        self.tst_params(19)
+
+    def test_params_20(self):
+        self.tst_params(20)
+
+    def test_params_21(self):
+        self.tst_params(21)
+
+    def test_params_22(self):
+        self.tst_params(22)
+
+    def test_params_23(self):
+        self.tst_params(23)
+
+    def test_params_24(self):
+        self.tst_params(24)
+
+    def test_params_25(self):
+        self.tst_params(25)
+
+    def test_params_26(self):
+        self.tst_params(26)
+
+    def test_params_27(self):
+        self.tst_params(27)
+
+    def test_params_28(self):
+        self.tst_params(28)
+
+    def test_params_29(self):
+        self.tst_params(29)
+
+    def test_params_30(self):
+        self.tst_params(30)
+
+    def test_params_31(self):
+        self.tst_params(31)
+
+    def test_params_32(self):
+        self.tst_params(32)
+
+    def test_params_33(self):
+        self.tst_params(33)
+
+    def test_params_34(self):
+        self.tst_params(34)
+
+    def test_params_35(self):
+        self.tst_params(35)
+
+    def test_params_36(self):
+        self.tst_params(36)
+
+    def test_params_37(self):
+        self.tst_params(37)
+
+    def test_params_38(self):
+        self.tst_params(38)
+
+    def test_params_39(self):
+        self.tst_params(39)
+
+    def test_params_40(self):
+        self.tst_params(40)
+
+    def test_params_41(self):
+        self.tst_params(41)
+
+    def test_params_42(self):
+        self.tst_params(42)
+
+    def test_params_43(self):
+        self.tst_params(43)
+
+    def test_params_44(self):
+        self.tst_params(44)
+
+    def test_params_45(self):
+        self.tst_params(45)
+
+    def test_params_46(self):
+        self.tst_params(46)
+
+    def test_params_47(self):
+        self.tst_params(47)
+
+    def test_params_48(self):
+        self.tst_params(48)
+
+    def test_params_49(self):
+        self.tst_params(49)
+
+    def test_params_50(self):
+        self.tst_params(50)
+
+    def test_params_51(self):
+        self.tst_params(51)
+
+    def test_params_52(self):
+        self.tst_params(52)
+
+    def test_params_53(self):
+        self.tst_params(53)
+
+    def test_params_54(self):
+        self.tst_params(54)
+
+    def test_params_55(self):
+        self.tst_params(55)
+
+    def test_params_56(self):
+        self.tst_params(56)
+
+    def test_params_57(self):
+        self.tst_params(57)
+
+    def test_params_58(self):
+        self.tst_params(58)
+
+    def test_params_59(self):
+        self.tst_params(59)
+
+    def test_params_60(self):
+        self.tst_params(60)
+
+    def test_params_61(self):
+        self.tst_params(61)
+
+    def test_params_62(self):
+        self.tst_params(62)
+
+    def test_params_63(self):
+        self.tst_params(63)
+
+    def test_params_64(self):
+        self.tst_params(64)
+
+
+class TestGoldschmidtSqrtRSqrt(FHDLTestCase):
+    def tst(self, io_width, frac_wid, extra_precision,
+            table_addr_bits, table_data_bits, iter_count):
+        assert isinstance(io_width, int)
+        assert isinstance(frac_wid, int)
+        assert isinstance(extra_precision, int)
+        assert isinstance(table_addr_bits, int)
+        assert isinstance(table_data_bits, int)
+        assert isinstance(iter_count, int)
+        with self.subTest(io_width=io_width, frac_wid=frac_wid,
+                          extra_precision=extra_precision,
+                          table_addr_bits=table_addr_bits,
+                          table_data_bits=table_data_bits,
+                          iter_count=iter_count):
+            for bits in range(1 << io_width):
+                radicand = FixedPoint(bits, frac_wid)
+                expected_sqrt = radicand.sqrt(RoundDir.DOWN)
+                expected_rsqrt = FixedPoint(0, frac_wid)
+                if radicand > 0:
+                    expected_rsqrt = radicand.rsqrt(RoundDir.DOWN)
+                with self.subTest(radicand=repr(radicand),
+                                  expected_sqrt=repr(expected_sqrt),
+                                  expected_rsqrt=repr(expected_rsqrt)):
+                    sqrt, rsqrt = goldschmidt_sqrt_rsqrt(
+                        radicand=radicand, io_width=io_width,
+                        frac_wid=frac_wid,
+                        extra_precision=extra_precision,
+                        table_addr_bits=table_addr_bits,
+                        table_data_bits=table_data_bits,
+                        iter_count=iter_count)
+                    with self.subTest(sqrt=repr(sqrt), rsqrt=repr(rsqrt)):
+                        self.assertEqual((sqrt, rsqrt),
+                                         (expected_sqrt, expected_rsqrt))
+
+    def test1(self):
+        self.tst(io_width=16, frac_wid=8, extra_precision=20,
+                 table_addr_bits=4, table_data_bits=28, iter_count=4)
+
+
+if __name__ == "__main__":
+    unittest.main()
index 4c70fdf177d35e8d18144cfec25751a82563b43d..1c807dc05492f5b654cf70f1443e55cef99774bb 100644 (file)
@@ -10,28 +10,33 @@ from ieee754.div_rem_sqrt_rsqrt.core import (
 
 
 class DivInputData(FUBaseData):
-    regspec = [('INT', 'ra', '0:63'),  # RA
-               ('INT', 'rb', '0:63'),  # RB/immediate
-               ('XER', 'xer_so', '32'), ]  # XER bit 32: SO
-
     def __init__(self, pspec):
         super().__init__(pspec, False)
         # convenience
         self.a, self.b = self.ra, self.rb
 
+    @property
+    def regspec(self):
+        return [('INT', 'ra', self.intrange),  # RA
+               ('INT', 'rb', self.intrange),  # RB/immediate
+               ('XER', 'xer_so', '32'), ]  # XER bit 32: SO
+
 
 # output stage shared between div and mul: like ALUOutputData but no CA/32
 class DivMulOutputData(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),
-               ('CR', 'cr_a', '0:3'),
-               ('XER', 'xer_ov', '33,44'),  # bit0: ov, bit1: ov32
-               ('XER', 'xer_so', '32')]
 
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ('XER', 'xer_ov', '33,44'),  # bit0: ov, bit1: ov32
+               ('XER', 'xer_so', '32')]
+
 
 class DivPipeKindConfigBase:
     def __init__(self,
@@ -129,28 +134,34 @@ class DivPipeKind(enum.Enum):
 
 
 class DivPipeSpec(CommonPipeSpec):
-    def __init__(self, id_wid, div_pipe_kind):
-        super().__init__(id_wid=id_wid)
+    def __init__(self, id_wid, parent_pspec, div_pipe_kind):
+        super().__init__(id_wid=id_wid, parent_pspec=parent_pspec)
         self.div_pipe_kind = div_pipe_kind
         self.core_config = div_pipe_kind.config.core_config
 
-    regspec = (DivInputData.regspec, DivMulOutputData.regspec)
+    regspecklses = (DivInputData, DivMulOutputData)
     opsubsetkls = CompLogicalOpSubset
 
 
 class DivPipeSpecDivPipeCore(DivPipeSpec):
-    def __init__(self, id_wid):
-        super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.DivPipeCore)
+    def __init__(self, id_wid, parent_pspec):
+        super().__init__(id_wid=id_wid,
+                         parent_pspec=parent_pspec,
+                         div_pipe_kind=DivPipeKind.DivPipeCore)
 
 
 class DivPipeSpecFSMDivCore(DivPipeSpec):
-    def __init__(self, id_wid):
-        super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.FSMDivCore)
+    def __init__(self, id_wid, parent_pspec):
+        super().__init__(id_wid=id_wid,
+                         parent_pspec=parent_pspec,
+                         div_pipe_kind=DivPipeKind.FSMDivCore)
 
 
 class DivPipeSpecSimOnly(DivPipeSpec):
-    def __init__(self, id_wid):
-        super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.SimOnly)
+    def __init__(self, id_wid, parent_pspec):
+        super().__init__(id_wid=id_wid,
+                         parent_pspec=parent_pspec,
+                         div_pipe_kind=DivPipeKind.SimOnly)
 
 
 class CoreBaseData(DivInputData):
index 56308942c1c02fb8ccca9d65acbf3563f8692caa..71c5c01fb03fb8dc08adda2522cc5bc551db486f 100644 (file)
@@ -12,13 +12,18 @@ from soc.fu.div.pipe_data import DivPipeKindConfigCombPipe
 class DivStagesStart(PipeModBaseChain):
     def get_chain(self):
         alu_input = DivMulInputStage(self.pspec)
+        return [alu_input]
+
+
+class DivStagesSetup(PipeModBaseChain):
+    def get_chain(self):
         div_setup = DivSetupStage(self.pspec)
         if isinstance(self.pspec.div_pipe_kind.config,
                       DivPipeKindConfigCombPipe):
             core_setup = [DivCoreSetupStage(self.pspec)]
         else:
             core_setup = ()
-        return [alu_input, div_setup, *core_setup]
+        return [div_setup, *core_setup]
 
 
 class DivStagesMiddle(PipeModBaseChain):
@@ -45,9 +50,14 @@ class DivStagesEnd(PipeModBaseChain):
         else:
             core_final = ()
         div_out = DivOutputStage(self.pspec)
-        alu_out = DivMulOutputStage(self.pspec)
         self.div_out = div_out  # debugging - bug #425
-        return [*core_final, div_out, alu_out]
+        return [*core_final, div_out]
+
+
+class DivStagesFinalise(PipeModBaseChain):
+    def get_chain(self):
+        alu_out = DivMulOutputStage(self.pspec)
+        return [alu_out]
 
 
 class DivBasePipe(ControlBase):
@@ -55,6 +65,7 @@ class DivBasePipe(ControlBase):
         ControlBase.__init__(self)
         self.pspec = pspec
         self.pipe_start = DivStagesStart(pspec)
+        self.pipe_setup = DivStagesSetup(pspec)
         self.pipe_middles = []
         if isinstance(self.pspec.div_pipe_kind.config,
                       DivPipeKindConfigCombPipe):
@@ -66,16 +77,21 @@ class DivBasePipe(ControlBase):
             self.pipe_middles.append(
                 self.pspec.div_pipe_kind.config.core_stage_class(pspec))
         self.pipe_end = DivStagesEnd(pspec)
+        self.pipe_final = DivStagesFinalise(pspec)
         self._eqs = self.connect([self.pipe_start,
+                                  self.pipe_setup,
                                   *self.pipe_middles,
-                                  self.pipe_end])
+                                  self.pipe_end,
+                                  self.pipe_final])
 
     def elaborate(self, platform):
         m = ControlBase.elaborate(self, platform)
         m.submodules.pipe_start = self.pipe_start
+        m.submodules.pipe_setup = self.pipe_setup
         for i in range(len(self.pipe_middles)):
             name = f"pipe_middle_{i}"
             setattr(m.submodules, name, self.pipe_middles[i])
         m.submodules.pipe_end = self.pipe_end
+        m.submodules.pipe_final = self.pipe_final
         m.d.comb += self._eqs
         return m
index 0625159e123b67658a7e667327835ec637b2a038..5fe049786ae9074e30e39a48fe4939b0e7382ba3 100644 (file)
@@ -27,6 +27,7 @@ class DivSetupStage(PipeModBase):
         return CoreInputData(self.pspec)
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
         # convenience variables
@@ -42,14 +43,15 @@ class DivSetupStage(PipeModBase):
 
         # work out if a/b are negative (check 32-bit / signed)
         comb += dividend_neg_o.eq(Mux(op.is_32bit,
-                                      a[31], a[63]) & op.is_signed)
-        comb += divisor_neg_o.eq(Mux(op.is_32bit, b[31], b[63]) & op.is_signed)
+                                      a[31], a[XLEN-1]) & op.is_signed)
+        comb += divisor_neg_o.eq(Mux(op.is_32bit,
+                                      b[31], b[XLEN-1]) & op.is_signed)
 
         # negation of a 64-bit value produces the same lower 32-bit
         # result as negation of just the lower 32-bits, so we don't
         # need to do anything special before negating
-        abs_dor = Signal(64, reset_less=True)  # absolute of divisor
-        abs_dend = Signal(64, reset_less=True)  # absolute of dividend
+        abs_dor = Signal(XLEN, reset_less=True)  # absolute of divisor
+        abs_dend = Signal(XLEN, reset_less=True)  # absolute of dividend
         comb += abs_dor.eq(Mux(divisor_neg_o, -b, b))
         comb += abs_dend.eq(Mux(dividend_neg_o, -a, a))
 
@@ -78,7 +80,7 @@ class DivSetupStage(PipeModBase):
                 with m.If(op.is_32bit):
                     comb += dividend_o.eq(abs_dend[0:32] << 32)
                 with m.Else():
-                    comb += dividend_o.eq(abs_dend[0:64] << 64)
+                    comb += dividend_o.eq(abs_dend[0:XLEN] << XLEN)
 
         ###### sticky overflow and context, both pass-through #####
 
index 80871fd30e5180e0e9eeeb05eb49ea7579549726..3a854975b1aa2b4999fcdd2ee6c3789a96c38847 100644 (file)
@@ -163,7 +163,11 @@ class DivTestHelper(unittest.TestCase):
 
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
 
-        pspec = DivPipeSpec(id_wid=2, div_pipe_kind=div_pipe_kind)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = DivPipeSpec(
+            id_wid=2, div_pipe_kind=div_pipe_kind, parent_pspec=pps)
         m.submodules.alu = alu = DivBasePipe(pspec)
 
         comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
index a5b343910827a6f1ddebc43492b68b4fca4dd899..215b3a65d7e54b48e21c66bf33e36fb15b3f246a 100644 (file)
@@ -6,7 +6,11 @@ from soc.fu.div.pipeline import DivBasePipe
 
 class TestPipeIlang(unittest.TestCase):
     def write_ilang(self, div_pipe_kind):
-        pspec = DivPipeSpec(id_wid=2, div_pipe_kind=div_pipe_kind)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = DivPipeSpec(
+            id_wid=2, div_pipe_kind=div_pipe_kind, parent_pspec=pps)
         alu = DivBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open(f"div_pipeline_{div_pipe_kind.name}.il", "w") as f:
index 8ba8f0255c8e7b62d6607360bd6a6cd32123773e..928ab9922f11f88a2a1b2120c5e31cacb94127bf 100644 (file)
@@ -25,6 +25,7 @@ class CompLDSTOpSubset(CompOpSubsetBase):
                   ('is_signed', 1),
                   ('data_len', 4),
                   ('byte_reverse', 1),
+                  ('reserve', 1),     # atomic update
                   ('sign_extend', 1),
                   ('ldst_mode', LDSTMode),
                   ('insn', 32),
index d3c91fdbbe53024d204f76b9e175aa59317cb1ea..6e868b1eebdd7eeffbc6f1812c29e508a0a0ad9c 100644 (file)
@@ -19,12 +19,13 @@ Links:
 
 from nmigen import (Elaboratable, Module, Signal, Shape, unsigned, Cat, Mux,
                     Record, Memory,
-                    Const)
+                    Const, C)
 from nmutil.iocontrol import RecordObject
 from nmutil.util import rising_edge, Display
 from enum import Enum, unique
 
 from soc.experiment.dcache import DCache
+from soc.experiment.icache import ICache
 from soc.experiment.pimem import PortInterfaceBase
 from soc.experiment.mem_types import LoadStore1ToMMUType
 from soc.experiment.mem_types import MMUToLoadStore1Type
@@ -39,7 +40,14 @@ class State(Enum):
     IDLE = 0       # ready for instruction
     ACK_WAIT = 1   # waiting for ack from dcache
     MMU_LOOKUP = 2 # waiting for MMU to look up translation
-    TLBIE_WAIT = 3 # waiting for MMU to finish doing a tlbie
+    #SECOND_REQ = 3 # second request for unaligned transfer
+
+@unique
+class Misalign(Enum):
+    ONEWORD = 0    # only one word needed, all good
+    NEED2WORDS = 1 # need to send/receive two words
+    WAITFIRST = 2  # waiting for the first word
+    WAITSECOND = 3 # waiting for the second word
 
 
 # captures the LDSTRequest from the PortInterface, which "blips" most
@@ -50,13 +58,20 @@ class LDSTRequest(RecordObject):
 
         self.load          = Signal()
         self.dcbz          = Signal()
-        self.addr          = Signal(64)
+        self.raddr          = Signal(64)
         # self.store_data    = Signal(64) # this is already sync (on a delay)
-        self.byte_sel      = Signal(8)
+        self.byte_sel      = Signal(16)
         self.nc            = Signal()              # non-cacheable access
         self.virt_mode     = Signal()
         self.priv_mode     = Signal()
+        self.mode_32bit    = Signal() # XXX UNUSED AT PRESENT
+        self.alignstate    = Signal(Misalign) # progress of alignment request
         self.align_intr    = Signal()
+        # atomic (LR/SC reservation)
+        self.reserve       = Signal()
+        self.atomic        = Signal()
+        self.atomic_last   = Signal()
+
 
 # glue logic for microwatt mmu and dcache
 class LoadStore1(PortInterfaceBase):
@@ -68,16 +83,20 @@ class LoadStore1(PortInterfaceBase):
         addrwid = pspec.addr_wid
 
         super().__init__(regwid, addrwid)
-        self.dcache = DCache()
+        self.dcache = DCache(pspec)
+        self.icache = ICache(pspec)
         # these names are from the perspective of here (LoadStore1)
         self.d_out  = self.dcache.d_in     # in to dcache is out for LoadStore
         self.d_in = self.dcache.d_out      # out from dcache is in for LoadStore
-        self.m_out  = LoadStore1ToMMUType() # out *to* MMU
-        self.m_in = MMUToLoadStore1Type()   # in *from* MMU
+        self.i_out  = self.icache.i_in     # in to icache is out for LoadStore
+        self.i_in = self.icache.i_out      # out from icache is in for LoadStore
+        self.m_out  = LoadStore1ToMMUType("m_out") # out *to* MMU
+        self.m_in = MMUToLoadStore1Type("m_in")   # in *from* MMU
         self.req = LDSTRequest(name="ldst_req")
 
         # TODO, convert dcache wb_in/wb_out to "standard" nmigen Wishbone bus
         self.dbus = Record(make_wb_layout(pspec))
+        self.ibus = Record(make_wb_layout(pspec))
 
         # for creating a single clock blip to DCache
         self.d_valid = Signal()
@@ -91,54 +110,85 @@ class LoadStore1(PortInterfaceBase):
         self.load          = Signal()
         self.tlbie         = Signal()
         self.dcbz          = Signal()
-        self.addr          = Signal(64)
-        self.store_data    = Signal(64)
-        self.load_data     = Signal(64)
-        self.load_data_delay = Signal(64)
-        self.byte_sel      = Signal(8)
+        self.raddr          = Signal(64)
+        self.maddr          = Signal(64)
+        self.store_data    = Signal(64)   # first half (aligned)
+        self.store_data2   = Signal(64)   # second half (misaligned)
+        self.load_data     = Signal(128)   # 128 to cope with misalignment
+        self.load_data_delay = Signal(128) # perform 2 LD/STs
+        self.byte_sel      = Signal(16)    # also for misaligned, 16-bit
+        self.alignstate    = Signal(Misalign) # progress of alignment request
+        self.next_addr      = Signal(64)      # 2nd (aligned) read/write addr
         #self.xerc         : xer_common_t;
-        #self.reserve       = Signal()
-        #self.atomic        = Signal()
-        #self.atomic_last   = Signal()
         #self.rc            = Signal()
         self.nc            = Signal()              # non-cacheable access
-        self.virt_mode     = Signal()
-        self.priv_mode     = Signal()
-        self.state        = Signal(State)
-        self.instr_fault   = Signal()
+        self.mode_32bit    = Signal() # XXX UNUSED AT PRESENT
+        self.state         = Signal(State)
+        self.instr_fault   = Signal()  # indicator to request i-cache MMU lookup
+        self.r_instr_fault  = Signal() # accessed in external_busy
+        self.priv_mode     = Signal() # only for instruction fetch (not LDST)
         self.align_intr    = Signal()
         self.busy          = Signal()
         self.wait_dcache   = Signal()
         self.wait_mmu      = Signal()
-        #self.mode_32bit    = Signal()
+        self.lrsc_misalign = Signal()
         #self.intr_vec     : integer range 0 to 16#fff#;
         #self.nia           = Signal(64)
         #self.srr1          = Signal(16)
-
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz):
+        # use these to set the dsisr or dar respectively
+        self.mmu_set_spr    = Signal()
+        self.mmu_set_dsisr  = Signal()
+        self.mmu_set_dar    = Signal()
+        self.sprval_in      = Signal(64)
+
+        # ONLY access these read-only, do NOT attempt to change
+        self.dsisr          = Signal(32)
+        self.dar            = Signal(64)
+
+    # when external_busy set, do not allow PortInterface to proceed
+    def external_busy(self, m):
+        return self.instr_fault | self.r_instr_fault
+
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
+        m.d.comb += self.req.nc.eq(is_nc)
         m.d.comb += self.req.load.eq(0) # store operation
         m.d.comb += self.req.byte_sel.eq(mask)
-        m.d.comb += self.req.addr.eq(addr)
-        m.d.comb += self.req.priv_mode.eq(~msr_pr) # not-problem  ==> priv
-        m.d.comb += self.req.virt_mode.eq(msr_pr) # problem-state ==> virt
-        m.d.comb += self.req.align_intr.eq(misalign)
+        m.d.comb += self.req.raddr.eq(addr)
+        m.d.comb += self.req.priv_mode.eq(~msr.pr) # not-problem  ==> priv
+        m.d.comb += self.req.virt_mode.eq(msr.dr) # DR ==> virt
+        m.d.comb += self.req.mode_32bit.eq(~msr.sf) # not-sixty-four ==> 32bit
         m.d.comb += self.req.dcbz.eq(is_dcbz)
+        with m.If(misalign):
+            m.d.comb += self.req.alignstate.eq(Misalign.NEED2WORDS)
+            m.d.sync += self.next_addr.eq(Cat(C(0, 3), addr[3:]+1))
 
         # m.d.comb += Display("set_wr_addr %i dcbz %i",addr,is_dcbz)
 
         # option to disable the cache entirely for write
         if self.disable_cache:
             m.d.comb += self.req.nc.eq(1)
+
+        # dcbz cannot do no-cache
+        with m.If(is_dcbz & self.req.nc):
+            m.d.comb += self.req.align_intr.eq(1)
+
+        # hmm, rather than add yet another argument to set_wr_addr
+        # read direct from PortInterface
+        m.d.comb += self.req.reserve.eq(self.pi.reserve) # atomic request
+        m.d.comb += self.req.atomic.eq(~self.lrsc_misalign)
+        m.d.comb += self.req.atomic_last.eq(~self.lrsc_misalign)
+
         return None
 
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
         m.d.comb += self.d_valid.eq(1)
         m.d.comb += self.req.load.eq(1) # load operation
         m.d.comb += self.req.byte_sel.eq(mask)
-        m.d.comb += self.req.align_intr.eq(misalign)
-        m.d.comb += self.req.addr.eq(addr)
-        m.d.comb += self.req.priv_mode.eq(~msr_pr) # not-problem  ==> priv
-        m.d.comb += self.req.virt_mode.eq(msr_pr) # problem-state ==> virt
+        m.d.comb += self.req.raddr.eq(addr)
+        m.d.comb += self.req.priv_mode.eq(~msr.pr) # not-problem  ==> priv
+        m.d.comb += self.req.virt_mode.eq(msr.dr) # DR ==> virt
+        m.d.comb += self.req.mode_32bit.eq(~msr.sf) # not-sixty-four ==> 32bit
+        m.d.comb += self.req.nc.eq(is_nc)
         # BAD HACK! disable cacheing on LD when address is 0xCxxx_xxxx
         # this is for peripherals. same thing done in Microwatt loadstore1.vhdl
         with m.If(addr[28:] == Const(0xc, 4)):
@@ -146,6 +196,17 @@ class LoadStore1(PortInterfaceBase):
         # option to disable the cache entirely for read
         if self.disable_cache:
             m.d.comb += self.req.nc.eq(1)
+        with m.If(misalign):
+            # need two reads: prepare next address in advance
+            m.d.comb += self.req.alignstate.eq(Misalign.NEED2WORDS)
+            m.d.sync += self.next_addr.eq(Cat(C(0, 3), addr[3:]+1))
+
+        # hmm, rather than add yet another argument to set_rd_addr
+        # read direct from PortInterface
+        m.d.comb += self.req.reserve.eq(self.pi.reserve) # atomic request
+        m.d.comb += self.req.atomic.eq(~self.lrsc_misalign)
+        m.d.comb += self.req.atomic_last.eq(~self.lrsc_misalign)
+
         return None #FIXME return value
 
     def set_wr_data(self, m, data, wen):
@@ -154,8 +215,10 @@ class LoadStore1(PortInterfaceBase):
         # put data into comb which is picked up in main elaborate()
         m.d.comb += self.d_w_valid.eq(1)
         m.d.comb += self.store_data.eq(data)
-        #m.d.sync += self.d_out.byte_sel.eq(wen) # this might not be needed
+        m.d.sync += self.store_data2.eq(data[64:128])
         st_ok = self.done # TODO indicates write data is valid
+        m.d.comb += self.pi.store_done.data.eq(self.d_in.store_done)
+        m.d.comb += self.pi.store_done.ok.eq(1)
         return st_ok
 
     def get_rd_data(self, m):
@@ -169,44 +232,65 @@ class LoadStore1(PortInterfaceBase):
 
         # microwatt takes one more cycle before next operation can be issued
         sync += self.done_delay.eq(self.done)
-        sync += self.load_data_delay.eq(self.load_data)
+        #sync += self.load_data_delay[0:64].eq(self.load_data[0:64])
 
-        # create dcache module
+        # create dcache and icache module
         m.submodules.dcache = dcache = self.dcache
+        m.submodules.icache = icache = self.icache
 
         # temp vars
         d_out, d_in, dbus = self.d_out, self.d_in, self.dbus
+        i_out, i_in, ibus = self.i_out, self.i_in, self.ibus
         m_out, m_in = self.m_out, self.m_in
         exc = self.pi.exc_o
         exception = exc.happened
         mmureq = Signal()
 
-        # copy of address, but gets over-ridden for OP_FETCH_FAILED
+        # copy of address, but gets over-ridden for instr_fault
         maddr = Signal(64)
-        m.d.comb += maddr.eq(self.addr)
+        m.d.comb += maddr.eq(self.raddr)
+
+        # check for LR/SC misalignment, used in set_rd/wr_addr above
+        comb += self.lrsc_misalign.eq(((self.pi.data_len[0:3]-1) &
+                                        self.req.raddr[0:3]).bool())
+        with m.If(self.lrsc_misalign & self.req.reserve):
+            m.d.comb += self.req.align_intr.eq(1)
 
         # create a blip (single pulse) on valid read/write request
         # this can be over-ridden in the FSM to get dcache to re-run
         # a request when MMU_LOOKUP completes.
         m.d.comb += self.d_validblip.eq(rising_edge(m, self.d_valid))
         ldst_r = LDSTRequest("ldst_r")
-        comb += Display("MMUTEST: LoadStore1 d_in.error=%i",d_in.error)
+        sync += Display("MMUTEST: LoadStore1 d_in.error=%i",d_in.error)
 
         # fsm skeleton
         with m.Switch(self.state):
             with m.Case(State.IDLE):
-                with m.If(self.d_validblip & ~exc.happened):
+                sync += self.load_data_delay.eq(0) # clear out
+                with m.If((self.d_validblip | self.instr_fault) &
+                          ~exc.happened):
                     comb += self.busy.eq(1)
                     sync += self.state.eq(State.ACK_WAIT)
                     sync += ldst_r.eq(self.req) # copy of LDSTRequest on "blip"
-#                   sync += Display("validblip self.req.virt_mode=%i",
-#                   self.req.virt_mode)
+                    # sync += Display("validblip self.req.virt_mode=%i",
+                    #                 self.req.virt_mode)
+                    with m.If(self.instr_fault):
+                        comb += mmureq.eq(1)
+                        sync += self.r_instr_fault.eq(1)
+                        comb += maddr.eq(self.maddr)
+                        sync += self.state.eq(State.MMU_LOOKUP)
+                    with m.Else():
+                        sync += self.r_instr_fault.eq(0)
+                    # if the LD/ST requires two dwords, move to waiting
+                    # for first word
+                    with m.If(self.req.alignstate == Misalign.NEED2WORDS):
+                        sync += ldst_r.alignstate.eq(Misalign.WAITFIRST)
                 with m.Else():
                     sync += ldst_r.eq(0)
 
             # waiting for completion
             with m.Case(State.ACK_WAIT):
-                comb += Display("MMUTEST: ACK_WAIT")
+                sync += Display("MMUTEST: ACK_WAIT")
                 comb += self.busy.eq(~exc.happened)
 
                 with m.If(d_in.error):
@@ -217,10 +301,11 @@ class LoadStore1(PortInterfaceBase):
                         sync += self.state.eq(State.IDLE)
                         sync += ldst_r.eq(0)
                         sync += Display("cache error -> update dsisr")
-                        #sync += self.dsisr[63 - 38].eq(~self.load)
+                        sync += self.dsisr[63 - 38].eq(~ldst_r.load)
                         # XXX there is no architected bit for this
                         # (probably should be a machine check in fact)
-                        #sync += self.dsisr[63 - 35].eq(d_in.cache_paradox)
+                        sync += self.dsisr[63 - 35].eq(d_in.cache_paradox)
+                        sync += self.r_instr_fault.eq(0)
 
                     with m.Else():
                         # Look up the translation for TLB miss
@@ -229,54 +314,98 @@ class LoadStore1(PortInterfaceBase):
                         comb += mmureq.eq(1)
                         sync += self.state.eq(State.MMU_LOOKUP)
                 with m.If(d_in.valid):
-                    m.d.comb += self.done.eq(~mmureq) # done if not doing MMU
                     with m.If(self.done):
-                        sync += Display("ACK_WAIT, done %x", self.addr)
-                    sync += self.state.eq(State.IDLE)
-                    sync += ldst_r.eq(0)
-                    with m.If(self.load):
-                        m.d.comb += self.load_data.eq(d_in.data)
+                        sync += Display("ACK_WAIT, done %x", self.raddr)
+                    with m.If(ldst_r.alignstate == Misalign.ONEWORD):
+                        # done if there is only one dcache operation
+                        sync += self.state.eq(State.IDLE)
+                        sync += ldst_r.eq(0)
+                        with m.If(ldst_r.load):
+                            m.d.comb += self.load_data.eq(d_in.data)
+                            sync += self.load_data_delay[0:64].eq(d_in.data)
+                        m.d.comb += self.done.eq(~mmureq) # done if not MMU
+                    with m.Elif(ldst_r.alignstate == Misalign.WAITFIRST):
+                        # first LD done: load data, initiate 2nd request.
+                        # leave in ACK_WAIT state
+                        with m.If(ldst_r.load):
+                            m.d.comb += self.load_data[0:63].eq(d_in.data)
+                            sync += self.load_data_delay[0:64].eq(d_in.data)
+                        with m.Else():
+                            m.d.sync += d_out.data.eq(self.store_data2)
+                        # mmm kinda cheating, make a 2nd blip.
+                        # use an aligned version of the address
+                        m.d.comb += self.d_validblip.eq(1)
+                        comb += self.req.eq(ldst_r) # from copy of request
+                        comb += self.req.raddr.eq(self.next_addr)
+                        comb += self.req.byte_sel.eq(ldst_r.byte_sel[8:])
+                        comb += self.req.alignstate.eq(Misalign.WAITSECOND)
+                        sync += ldst_r.raddr.eq(self.next_addr)
+                        sync += ldst_r.byte_sel.eq(ldst_r.byte_sel[8:])
+                        sync += ldst_r.alignstate.eq(Misalign.WAITSECOND)
+                        sync += Display("    second req %x", self.req.raddr)
+                    with m.Elif(ldst_r.alignstate == Misalign.WAITSECOND):
+                        sync += Display("    done second %x", d_in.data)
+                        # done second load
+                        sync += self.state.eq(State.IDLE)
+                        sync += ldst_r.eq(0)
+                        with m.If(ldst_r.load):
+                            m.d.comb += self.load_data[64:128].eq(d_in.data)
+                            sync += self.load_data_delay[64:128].eq(d_in.data)
+                        m.d.comb += self.done.eq(~mmureq) # done if not MMU
 
             # waiting here for the MMU TLB lookup to complete.
             # either re-try the dcache lookup or throw MMU exception
             with m.Case(State.MMU_LOOKUP):
-                comb += self.busy.eq(1)
+                comb += self.busy.eq(~exception)
                 with m.If(m_in.done):
-                    with m.If(~self.instr_fault):
+                    with m.If(~self.r_instr_fault):
                         sync += Display("MMU_LOOKUP, done %x -> %x",
-                                        self.addr, d_out.addr)
+                                        self.raddr, d_out.addr)
                         # retry the request now that the MMU has
                         # installed a TLB entry, if not exception raised
                         m.d.comb += self.d_out.valid.eq(~exception)
                         sync += self.state.eq(State.ACK_WAIT)
-                        sync += ldst_r.eq(0)
                     with m.Else():
-                        sync += Display("MMU_LOOKUP, exception %x", self.addr)
-                        # instruction lookup fault: store address in DAR
-                        comb += exc.happened.eq(1) # reason = MMU_LOOKUP
-                        # mark dar as updated ?
-                        comb += self.pi.dar_o.eq(self.addr)
                         sync += self.state.eq(State.IDLE)
+                        sync += self.r_instr_fault.eq(0)
+                        comb += self.done.eq(1)
 
                 with m.If(m_in.err):
-                    # MMU RADIX exception thrown
+                    # MMU RADIX exception thrown. XXX
+                    # TODO: critical that the write here has to
+                    # notify the MMU FSM of the change to dsisr
                     comb += exception.eq(1)
+                    comb += self.done.eq(1)
                     sync += Display("MMU RADIX exception thrown")
-                    #sync += self.dsisr[63 - 33].eq(m_in.invalid)
-                    #sync += self.dsisr[63 - 36].eq(m_in.perm_error)
-                    #sync += self.dsisr[63 - 38].eq(self.load)
-                    #sync += self.dsisr[63 - 44].eq(m_in.badtree)
-                    #sync += self.dsisr[63 - 45].eq(m_in.rc_error)
+                    sync += self.dsisr[63 - 33].eq(m_in.invalid)
+                    sync += self.dsisr[63 - 36].eq(m_in.perm_error) # noexec
+                    sync += self.dsisr[63 - 38].eq(~ldst_r.load)
+                    sync += self.dsisr[63 - 44].eq(m_in.badtree)
+                    sync += self.dsisr[63 - 45].eq(m_in.rc_error)
                     sync += self.state.eq(State.IDLE)
+                    # exception thrown, clear out instruction fault state
+                    sync += self.r_instr_fault.eq(0)
 
-            with m.Case(State.TLBIE_WAIT):
-                pass
+        # MMU FSM communicating a request to update DSISR or DAR (OP_MTSPR)
+        with m.If(self.mmu_set_spr):
+            with m.If(self.mmu_set_dsisr):
+                sync += self.dsisr.eq(self.sprval_in)
+            with m.If(self.mmu_set_dar):
+                sync += self.dar.eq(self.sprval_in)
 
-        # alignment error: store address in DAR
+        # hmmm, alignment occurs in set_rd_addr/set_wr_addr, note exception
         with m.If(self.align_intr):
-            comb += exc.happened.eq(1) # reason = alignment
-            sync += Display("alignment error: store addr in DAR %x", self.addr)
-            comb += self.pi.dar_o.eq(self.addr)
+            comb += exc.happened.eq(1)
+        # check for updating DAR
+        with m.If(exception):
+            sync += Display("exception %x", self.raddr)
+            # alignment error: store address in DAR
+            with m.If(self.align_intr):
+                sync += Display("alignment error: addr in DAR %x", self.raddr)
+                sync += self.dar.eq(self.raddr)
+            with m.Elif(~self.r_instr_fault):
+                sync += Display("not instr fault, addr in DAR %x", self.raddr)
+                sync += self.dar.eq(self.raddr)
 
         # when done or exception, return to idle state
         with m.If(self.done | exception):
@@ -289,27 +418,33 @@ class LoadStore1(PortInterfaceBase):
         comb += self.align_intr.eq(self.req.align_intr)
         comb += exc.invalid.eq(m_in.invalid)
         comb += exc.alignment.eq(self.align_intr)
-        comb += exc.instr_fault.eq(self.instr_fault)
+        comb += exc.instr_fault.eq(self.r_instr_fault)
         # badtree, perm_error, rc_error, segment_fault
         comb += exc.badtree.eq(m_in.badtree)
         comb += exc.perm_error.eq(m_in.perm_error)
         comb += exc.rc_error.eq(m_in.rc_error)
         comb += exc.segment_fault.eq(m_in.segerr)
+        # conditions for 0x400 trap need these in SRR1
+        with m.If(exception & ~exc.alignment & exc.instr_fault):
+            comb += exc.srr1[14].eq(exc.invalid)      # 47-33
+            comb += exc.srr1[12].eq(exc.perm_error)   # 47-35
+            comb += exc.srr1[3].eq(exc.badtree)       # 47-44
+            comb += exc.srr1[2].eq(exc.rc_error)      # 47-45
 
         # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
-        comb += dbus.adr.eq(dcache.wb_out.adr)
-        comb += dbus.dat_w.eq(dcache.wb_out.dat)
-        comb += dbus.sel.eq(dcache.wb_out.sel)
-        comb += dbus.cyc.eq(dcache.wb_out.cyc)
-        comb += dbus.stb.eq(dcache.wb_out.stb)
-        comb += dbus.we.eq(dcache.wb_out.we)
-
-        comb += dcache.wb_in.dat.eq(dbus.dat_r)
-        comb += dcache.wb_in.ack.eq(dbus.ack)
+        comb += dbus.adr.eq(dcache.bus.adr)
+        comb += dbus.dat_w.eq(dcache.bus.dat_w)
+        comb += dbus.sel.eq(dcache.bus.sel)
+        comb += dbus.cyc.eq(dcache.bus.cyc)
+        comb += dbus.stb.eq(dcache.bus.stb)
+        comb += dbus.we.eq(dcache.bus.we)
+
+        comb += dcache.bus.dat_r.eq(dbus.dat_r)
+        comb += dcache.bus.ack.eq(dbus.ack)
         if hasattr(dbus, "stall"):
-            comb += dcache.wb_in.stall.eq(dbus.stall)
+            comb += dcache.bus.stall.eq(dbus.stall)
 
-        # update out d data when flag set
+        # update out d data when flag set, for first half (second done in FSM)
         with m.If(self.d_w_valid):
             m.d.sync += d_out.data.eq(self.store_data)
         #with m.Else():
@@ -325,36 +460,39 @@ class LoadStore1(PortInterfaceBase):
             m.d.comb += self.d_out.valid.eq(~exc.happened)
             m.d.comb += d_out.load.eq(self.req.load)
             m.d.comb += d_out.byte_sel.eq(self.req.byte_sel)
-            m.d.comb += self.addr.eq(self.req.addr)
+            m.d.comb += self.raddr.eq(self.req.raddr)
             m.d.comb += d_out.nc.eq(self.req.nc)
-            # XXX driver conflict.  ehn??
-            # XXX m.d.comb += d_out.priv_mode.eq(self.req.priv_mode)
-            # XXX m.d.comb += d_out.virt_mode.eq(self.req.virt_mode)
+            m.d.comb += d_out.priv_mode.eq(self.req.priv_mode)
+            m.d.comb += d_out.virt_mode.eq(self.req.virt_mode)
+            m.d.comb += d_out.reserve.eq(self.req.reserve)
+            m.d.comb += d_out.atomic.eq(self.req.atomic)
+            m.d.comb += d_out.atomic_last.eq(self.req.atomic_last)
             #m.d.comb += Display("validblip dcbz=%i addr=%x",
             #self.req.dcbz,self.req.addr)
             m.d.comb += d_out.dcbz.eq(self.req.dcbz)
         with m.Else():
             m.d.comb += d_out.load.eq(ldst_r.load)
             m.d.comb += d_out.byte_sel.eq(ldst_r.byte_sel)
-            m.d.comb += self.addr.eq(ldst_r.addr)
+            m.d.comb += self.raddr.eq(ldst_r.raddr)
             m.d.comb += d_out.nc.eq(ldst_r.nc)
-            # XXX driver conflict.  ehn??
-            # XXX m.d.comb += d_out.priv_mode.eq(ldst_r.priv_mode)
-            # XXX m.d.comb += d_out.virt_mode.eq(ldst_r.virt_mode)
+            m.d.comb += d_out.priv_mode.eq(ldst_r.priv_mode)
+            m.d.comb += d_out.virt_mode.eq(ldst_r.virt_mode)
+            m.d.comb += d_out.reserve.eq(ldst_r.reserve)
+            m.d.comb += d_out.atomic.eq(ldst_r.atomic)
+            m.d.comb += d_out.atomic_last.eq(ldst_r.atomic_last)
             #m.d.comb += Display("no_validblip dcbz=%i addr=%x",
             #ldst_r.dcbz,ldst_r.addr)
             m.d.comb += d_out.dcbz.eq(ldst_r.dcbz)
-
-        # XXX these should be possible to remove but for some reason
-        # cannot be... yet. TODO, investigate
-        m.d.comb += self.load_data.eq(d_in.data)
-        m.d.comb += d_out.addr.eq(self.addr)
+        m.d.comb += d_out.addr.eq(self.raddr)
 
         # Update outputs to MMU
         m.d.comb += m_out.valid.eq(mmureq)
         m.d.comb += m_out.iside.eq(self.instr_fault)
         m.d.comb += m_out.load.eq(ldst_r.load)
-        # m_out.priv <= r.priv_mode; TODO
+        with m.If(self.instr_fault):
+            m.d.comb += m_out.priv.eq(self.priv_mode)
+        with m.Else():
+            m.d.comb += m_out.priv.eq(ldst_r.priv_mode)
         m.d.comb += m_out.tlbie.eq(self.tlbie)
         # m_out.mtspr <= mmu_mtspr; # TODO
         # m_out.sprn <= sprn; # TODO
index c2d8a43cb47c0096d31e34e60e243ac7f5aba8b9..caf8bf5a15fca0dc01692225738ca70b00ae60bf 100644 (file)
@@ -22,7 +22,7 @@ class LDSTOutputData(FUBaseData):
     # LDSTCompUnit is unusual in that it's non-standard to RegSpecAPI
     regspec = [('INT', 'o', '0:63'),   # RT
                ('INT', 'o1', '0:63'),  # RA (effective address, update mode)
-               # TODO, later ('CR', 'cr_a', '0:3'),
+               ('CR', 'cr_a', '0:3'),
                # TODO, later ('XER', 'xer_so', '32')
                 ]
     def __init__(self, pspec):
@@ -32,5 +32,5 @@ class LDSTOutputData(FUBaseData):
 
 
 class LDSTPipeSpec(CommonPipeSpec):
-    regspec = (LDSTInputData.regspec, LDSTOutputData.regspec)
+    regspecklses = (LDSTInputData, LDSTOutputData)
     opsubsetkls = CompLDSTOpSubset
index dc086faefb70a5f217a6ee3cb894c4553a18371f..83eaf989ca08df99b4f23eba8dc026215a3dbab7 100644 (file)
@@ -58,15 +58,16 @@ class Bpermd(Elaboratable):
     def elaborate(self, platform):
         m = Module()
         perm = Signal(self.width, reset_less=True)
-        rb64 = [Signal(1, reset_less=True, name=f"rb64_{i}") for i in range(64)]
-        for i in range(64):
-            m.d.comb += rb64[i].eq(self.rb[63-i])
+        rb64 = [Signal(1, reset_less=True, name=f"rb64_{i}")
+                for i in range(self.width)]
+        for i in range(self.width):
+            m.d.comb += rb64[i].eq(self.rb[self.width-1-i])
         rb64 = Array(rb64)
-        for i in range(8):
+        for i in range(self.width//8):
             index = self.rs[8*i:8*i+8]
             idx = Signal(8, name=f"idx_{i}", reset_less=True)
             m.d.comb += idx.eq(index)
-            with m.If(idx < 64):
+            with m.If(idx < self.width):
                 m.d.comb += perm[i].eq(rb64[idx])
         m.d.comb += self.ra[0:8].eq(perm)
         return m
index d11f832df0b7e4d68957e85c40e83ca013b5aaf8..aa9b937d937ac52061912f994b295c7c7d4b1f6c 100644 (file)
@@ -32,7 +32,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = ALUInputStage(pspec)
 
         a = Signal(64)
@@ -41,7 +41,7 @@ class Driver(Elaboratable):
                  dut.i.b.eq(b),
                  a.eq(AnyConst(64)),
                  b.eq(AnyConst(64))]
-                      
+
         comb += dut.i.ctx.op.eq(rec)
 
         # Assert that op gets copied from the input to output
@@ -70,6 +70,7 @@ class GTCombinerTestCase(FHDLTestCase):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=4)
         self.assertFormal(module, mode="cover", depth=4)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index 179d9ba26926ebe63afefc57eb5ce56add73f5fb..87d87283de4563e7c0ec2a8f6646159d601da4fc 100644 (file)
@@ -47,7 +47,7 @@ class Driver(Elaboratable):
             width = p.width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = LogicalMainStage(pspec)
 
         # convenience variables
@@ -60,7 +60,7 @@ class Driver(Elaboratable):
         # setup random inputs
         comb += [a.eq(AnyConst(64)),
                  b.eq(AnyConst(64)),
-                 #carry_in.eq(AnyConst(0b11)),
+                 # carry_in.eq(AnyConst(0b11)),
                  ]
 
         comb += dut.i.ctx.op.eq(rec)
@@ -78,7 +78,7 @@ class Driver(Elaboratable):
         comb += a_signed_32.eq(a[0:32])
 
         o_ok = Signal()
-        comb += o_ok.eq(1) # will be set to zero if no op takes place
+        comb += o_ok.eq(1)  # will be set to zero if no op takes place
 
         # main assertion of arithmetic operations
         with m.Switch(rec.insn_type):
@@ -125,10 +125,10 @@ class Driver(Elaboratable):
                         comb += peo.eq(32)
                     with m.Else():
                         comb += peo.eq(pe32.o)
-                    with m.If(XO[-1]): # cnttzw
+                    with m.If(XO[-1]):  # cnttzw
                         comb += pe32.i.eq(a[0:32])
                         comb += Assert(o == peo)
-                    with m.Else(): # cntlzw
+                    with m.Else():  # cntlzw
                         comb += pe32.i.eq(a[0:32][::-1])
                         comb += Assert(o == peo)
                 with m.Else():
@@ -138,10 +138,10 @@ class Driver(Elaboratable):
                         comb += peo64.eq(64)
                     with m.Else():
                         comb += peo64.eq(pe64.o)
-                    with m.If(XO[-1]): # cnttzd
+                    with m.If(XO[-1]):  # cnttzd
                         comb += pe64.i.eq(a[0:64])
                         comb += Assert(o == peo64)
-                    with m.Else(): # cntlzd
+                    with m.Else():  # cntlzd
                         comb += pe64.i.eq(a[0:64][::-1])
                         comb += Assert(o == peo64)
 
@@ -180,6 +180,7 @@ class LogicalTestCase(FHDLTestCase):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=2)
         self.assertFormal(module, mode="cover", depth=2)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index 253664032fc16a1401665416a70e33688c6caa65..6a90395783e798165bd55576c769c9bf73144952 100644 (file)
@@ -6,6 +6,8 @@
 # to the output stage
 
 # Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
+# Copyright (C) 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+
 from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
 from nmutil.pipemodbase import PipeModBase
 from nmutil.clz import CLZ
@@ -33,14 +35,15 @@ class LogicalMainStage(PipeModBase):
         return LogicalOutputData(self.pspec)
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
         op, a, b, o = self.i.ctx.op, self.i.a, self.i.b, self.o.o
 
         comb += o.ok.eq(1) # overridden if no op activates
 
-        m.submodules.bpermd = bpermd = Bpermd(64)
-        m.submodules.popcount = popcount = Popcount()
+        m.submodules.bpermd = bpermd = Bpermd(XLEN)
+        m.submodules.popcount = popcount = Popcount(XLEN)
 
         ##########################
         # main switch for logic ops AND, OR and XOR, cmpb, parity, and popcount
@@ -84,12 +87,14 @@ class LogicalMainStage(PipeModBase):
                 par0 = Signal(reset_less=True)
                 par1 = Signal(reset_less=True)
                 comb += par0.eq(Cat(a[0], a[8], a[16], a[24]).xor())
-                comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
+                if XLEN == 64:
+                    comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
                 with m.If(op.data_len[3] == 1):
                     comb += o.data.eq(par0 ^ par1)
                 with m.Else():
                     comb += o[0].eq(par0)
-                    comb += o[32].eq(par1)
+                    if XLEN == 64:
+                        comb += o[32].eq(par1)
 
             ###################
             ###### cntlz v3.0B p99
@@ -99,7 +104,7 @@ class LogicalMainStage(PipeModBase):
                 count_right = Signal(reset_less=True)
                 comb += count_right.eq(XO[-1])
 
-                cntz_i = Signal(64, reset_less=True)
+                cntz_i = Signal(XLEN, reset_less=True)
                 a32 = Signal(32, reset_less=True)
                 comb += a32.eq(a[0:32])
 
@@ -108,7 +113,7 @@ class LogicalMainStage(PipeModBase):
                 with m.Else():
                     comb += cntz_i.eq(Mux(count_right, a[::-1], a))
 
-                m.submodules.clz = clz = CLZ(64)
+                m.submodules.clz = clz = CLZ(XLEN)
                 comb += clz.sig_in.eq(cntz_i)
                 comb += o.data.eq(Mux(op.is_32bit, clz.lz-32, clz.lz))
 
index 3d9077aaf1721b0aea1bbc65c29023e6d8638164..359a2a595689ed66b5b15f2789e92b7a90b90998 100644 (file)
@@ -5,40 +5,47 @@ from soc.fu.logical.logical_input_record import CompLogicalOpSubset
 
 # input (and output) for logical initial stage (common input)
 class LogicalInputData(FUBaseData):
-    regspec = [('INT', 'ra', '0:63'), # RA
-               ('INT', 'rb', '0:63'), # RB/immediate
-               ('XER', 'xer_so', '32'),    # bit0: so
-               ]
     def __init__(self, pspec):
         super().__init__(pspec, False)
         # convenience
         self.a, self.b = self.ra, self.rb
 
+    @property
+    def regspec(self):
+        return [('INT', 'ra', self.intrange),  # RA
+               ('INT', 'rb', self.intrange),  # RB/immediate
+               ('XER', 'xer_so', '32'),    # bit0: so
+               ]
 
 # input to logical final stage (common output)
 class LogicalOutputData(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),        # RT
-               ('CR', 'cr_a', '0:3'),
-               ('XER', 'xer_so', '32'),    # bit0: so
-               ]
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ('XER', 'xer_so', '32'),    # bit0: so
+               ]
+
 
 # output from logical final stage (common output) - note that XER.so
 # is *not* included (the only reason it's in the input is because of CR0)
 class LogicalOutputDataFinal(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),        # RT
-               ('CR', 'cr_a', '0:3'),
-               ]
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ]
 
 
 class LogicalPipeSpec(CommonPipeSpec):
-    regspec = (LogicalInputData.regspec, LogicalOutputDataFinal.regspec)
+    regspecklses = (LogicalInputData, LogicalOutputDataFinal)
     opsubsetkls = CompLogicalOpSubset
index a16bd78acab1c6c65702368ddd28b5a2f07f1dc1..a0f00d1dcd6f473b77cb852d6200ccb78c982272 100644 (file)
@@ -8,11 +8,15 @@ from soc.fu.logical.output_stage import LogicalOutputStage
 class LogicalStages1(PipeModBaseChain):
     def get_chain(self):
         inp = LogicalInputStage(self.pspec)
+        return [inp]
+
+class LogicalStages2(PipeModBaseChain):
+    def get_chain(self):
         main = LogicalMainStage(self.pspec)
-        return [inp, main]
+        return [main]
 
 
-class LogicalStages2(PipeModBaseChain):
+class LogicalStages3(PipeModBaseChain):
     def get_chain(self):
         out = LogicalOutputStage(self.pspec)
         return [out]
@@ -24,11 +28,13 @@ class LogicalBasePipe(ControlBase):
         self.pspec = pspec
         self.pipe1 = LogicalStages1(pspec)
         self.pipe2 = LogicalStages2(pspec)
-        self._eqs = self.connect([self.pipe1, self.pipe2])
+        self.pipe3 = LogicalStages3(pspec)
+        self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
 
     def elaborate(self, platform):
         m = ControlBase.elaborate(self, platform)
         m.submodules.logical_pipe1 = self.pipe1
         m.submodules.logical_pipe2 = self.pipe2
+        m.submodules.logical_pipe3 = self.pipe3
         m.d.comb += self._eqs
         return m
index ca90112d495c326996e16b1b11d93ef2649dfb12..5975149db345bdb28822d7ee683a47148e80c61a 100644 (file)
@@ -23,11 +23,13 @@ def array_of(count, bitwidth):
 
 
 class Popcount(Elaboratable):
-    def __init__(self):
-        self.a = Signal(64, reset_less=True)
-        self.b = Signal(64, reset_less=True)
+    def __init__(self, width=64):
+        self.width = width
+        self.a = Signal(width, reset_less=True)
+        self.b = Signal(width, reset_less=True)
         self.data_len = Signal(4, reset_less=True) # data len up to... err.. 8?
-        self.o = Signal(64, reset_less=True)
+        self.o = Signal(width, reset_less=True)
+        assert width in [32, 64], "only 32 or 64 bit supported for now"
 
     def elaborate(self, platform):
         m = Module()
@@ -38,11 +40,13 @@ class Popcount(Elaboratable):
         # creating arrays big enough to store the sum, each time
         pc = [a]
         # QTY32 2-bit (to take 2x 1-bit sums) etc.
-        work = [(32, 2), (16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
+        work = [(16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
+        if self.width == 64:
+            work = [(32, 2)] + work
         for l, bw in work: # l=number of add-reductions, bw=bitwidth
             pc.append(array_of(l, bw))
-        pc8 = pc[3]     # array of 8 8-bit counts (popcntb)
-        pc32 = pc[5]    # array of 2 32-bit counts (popcntw)
+        pc8 = pc[-4]     # array of 8 8-bit counts (popcntb)
+        pc32 = pc[-2]    # array of 2 32-bit counts (popcntw)
         popcnt = pc[-1]  # array of 1 64-bit count (popcntd)
         # cascade-tree of adds
         for idx, (l, bw) in enumerate(work):
@@ -54,12 +58,15 @@ class Popcount(Elaboratable):
         # decode operation length (1-hot)
         with m.If(data_len == 1):
             # popcntb - pack 8x 4-bit answers into 8x 8-bit output fields
-            for i in range(8):
+            for i in range(self.width//8):
                 comb += o[i*8:(i+1)*8].eq(pc8[i])
         with m.Elif(data_len == 4):
-            # popcntw - pack 2x 5-bit answers into 2x 32-bit output fields
-            for i in range(2):
-                comb += o[i*32:(i+1)*32].eq(pc32[i])
+            if self.width == 64:
+                # popcntw - pack 2x 5-bit answers into 2x 32-bit output fields
+                for i in range(2):
+                    comb += o[i*32:(i+1)*32].eq(pc32[i])
+            else:
+                comb += o.eq(popcnt[0])
         with m.Else():
             # popcntd - put 1x 6-bit answer into 64-bit output
             comb += o.eq(popcnt[0])
index 7c323ba1d208013ccdeeb7d1a2118a4f94a2de37..8e7c67e83005081fed50756e46c1019411533f70 100644 (file)
@@ -42,7 +42,7 @@ def set_alu_inputs(alu, dec2, sim):
     # and place it into i_data.b
 
     inp = yield from get_cu_inputs(dec2, sim)
-    print ("set alu inputs", inp)
+    print("set alu inputs", inp)
     yield from ALUHelpers.set_int_ra(alu, dec2, inp)
     yield from ALUHelpers.set_int_rb(alu, dec2, inp)
     yield from ALUHelpers.set_xer_so(alu, dec2, inp)
@@ -51,19 +51,19 @@ def set_alu_inputs(alu, dec2, sim):
 class LogicalIlangCase(TestAccumulatorBase):
 
     def case_ilang(self):
-        pspec = LogicalPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = LogicalPipeSpec(id_wid=2, parent_pspec=pps)
         alu = LogicalBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("logical_pipeline.il", "w") as f:
             f.write(vl)
 
 
-class TestRunner(FHDLTestCase):
-    def __init__(self, test_data):
-        super().__init__("run_all")
-        self.test_data = test_data
+class TestRunner(unittest.TestCase):
 
-    def execute(self, alu,instruction, pdecode2, test):
+    def execute(self, alu, instruction, pdecode2, test):
         print(test.name)
         program = test.program
         self.subTest(test.name)
@@ -107,7 +107,9 @@ class TestRunner(FHDLTestCase):
                                               simulator, code)
             yield Settle()
 
-    def run_all(self):
+    def test_it(self):
+        test_data = LogicalIlangCase().test_data + \
+            LogicalTestCase({'soc'}).test_data
         m = Module()
         comb = m.d.comb
         instruction = Signal(32)
@@ -116,7 +118,10 @@ class TestRunner(FHDLTestCase):
 
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
 
-        pspec = LogicalPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = LogicalPipeSpec(id_wid=2, parent_pspec=pps)
         m.submodules.alu = alu = LogicalBasePipe(pspec)
 
         comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
@@ -127,7 +132,7 @@ class TestRunner(FHDLTestCase):
         sim.add_clock(1e-6)
 
         def process():
-            for test in self.test_data:
+            for test in test_data:
                 print(test.name)
                 program = test.program
                 with self.subTest(test.name):
@@ -163,10 +168,4 @@ class TestRunner(FHDLTestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(exit=False)
-    suite = unittest.TestSuite()
-    suite.addTest(TestRunner(LogicalIlangCase().test_data))
-    suite.addTest(TestRunner(LogicalTestCase().test_data))
-
-    runner = unittest.TextTestRunner()
-    runner.run(suite)
+    unittest.main()
index f09fd987e311f80f72db005ddec4be26c7c437d9..24be3f5402710bed1fb3a016e672ef12cc13671b 100644 (file)
@@ -26,6 +26,7 @@ from soc.experiment.mem_types import MMUToLoadStore1Type
 from soc.fu.ldst.loadstore import LoadStore1, TestSRAMLoadStore1
 from nmutil.util import Display
 
+
 class FSMMMUStage(ControlBase):
     """FSM MMU
 
@@ -44,6 +45,7 @@ class FSMMMUStage(ControlBase):
         # set up p/n data
         self.p.i_data = MMUInputData(pspec)
         self.n.o_data = MMUOutputData(pspec)
+        self.exc_o = self.n.o_data.exception # AllFunctionUnits needs this
 
         self.mmu = MMU()
 
@@ -64,41 +66,39 @@ class FSMMMUStage(ControlBase):
         # incoming PortInterface
         self.ldst = ldst
         self.dcache = self.ldst.dcache
+        self.icache = self.ldst.icache
         self.pi = self.ldst.pi
 
     def elaborate(self, platform):
         assert hasattr(self, "dcache"), "remember to call set_ldst_interface"
         m = super().elaborate(platform)
         comb, sync = m.d.comb, m.d.sync
-        dcache = self.dcache
+        dcache, icache = self.dcache, self.icache
+        ldst = self.ldst # managed externally: do not add here
 
-        # link mmu and dcache together
+        # link mmu, dcache and icache together
         m.submodules.mmu = mmu = self.mmu
-        ldst = self.ldst # managed externally: do not add here
         m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
         m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+        m.d.comb += icache.m_in.eq(mmu.i_out) # MMUToICacheType
 
         l_in, l_out = mmu.l_in, mmu.l_out
         d_in, d_out = dcache.d_in, dcache.d_out
-        wb_out, wb_in = dcache.wb_out, dcache.wb_in
 
         # link ldst and MMU together
         comb += l_in.eq(ldst.m_out)
         comb += ldst.m_in.eq(l_out)
 
         i_data, o_data = self.p.i_data, self.n.o_data
-        a_i, b_i, o, spr1_o = i_data.ra, i_data.rb, o_data.o, o_data.spr1
         op = i_data.ctx.op
+        cia_i = op.cia
         msr_i = op.msr
-        spr1_i = i_data.spr1
-
-        # these are set / got here *ON BEHALF* of LoadStore1
-        # XXX have to deal with this another way
-        # dsisr, dar = ldst.dsisr, ldst.dar
+        a_i, b_i, spr1_i = i_data.ra, i_data.rb, i_data.spr1
+        o, exc_o, spr1_o = o_data.o, o_data.exception, o_data.spr1
 
         # busy/done signals
-        busy = Signal()
-        done = Signal()
+        busy = Signal(name="mmu_fsm_busy")
+        done = Signal(name="mmu_fsm_done")
         m.d.comb += self.n.o_valid.eq(busy & done)
         m.d.comb += self.p.o_ready.eq(~busy)
 
@@ -107,11 +107,6 @@ class FSMMMUStage(ControlBase):
         spr = Signal(len(x_fields.SPR))
         comb += spr.eq(decode_spr_num(x_fields.SPR))
 
-        # based on MSR bits, set priv and virt mode.  TODO: 32-bit mode
-        comb += d_in.priv_mode.eq(~msr_i[MSR.PR])
-        comb += d_in.virt_mode.eq(msr_i[MSR.DR])
-        #comb += d_in.mode_32bit.eq(msr_i[MSR.SF]) # ?? err
-
         # ok so we have to "pulse" the MMU (or dcache) rather than
         # hold the valid hi permanently.  guess what this does...
         valid = Signal()
@@ -131,6 +126,11 @@ class FSMMMUStage(ControlBase):
             # WIP: properly implement MicrOp.OP_MTSPR and MicrOp.OP_MFSPR
 
             with m.Switch(op.insn_type):
+
+                ##########
+                # OP_MTSPR
+                ##########
+
                 with m.Case(MicrOp.OP_MTSPR):
                     comb += Display("MMUTEST: OP_MTSPR: spr=%i", spr)
                     # despite redirection this FU **MUST** behave exactly
@@ -148,10 +148,12 @@ class FSMMMUStage(ControlBase):
                         comb += self.debug0.eq(3)
                         #if matched update local cached value
                         #commented out because there is a driver conflict
-                        #with m.If(spr[0]):
-                        #    sync += dsisr.eq(a_i[:32])
-                        #with m.Else():
-                        #    sync += dar.eq(a_i)
+                        comb += ldst.sprval_in.eq(a_i)
+                        comb += ldst.mmu_set_spr.eq(1)
+                        with m.If(spr[0]):
+                            comb += ldst.mmu_set_dar.eq(1)
+                        with m.Else():
+                            comb += ldst.mmu_set_dsisr.eq(1)
                         comb += done.eq(1)
                     # pass it over to the MMU instead
                     with m.Else():
@@ -165,13 +167,35 @@ class FSMMMUStage(ControlBase):
                         comb += l_in.rs.eq(a_i)    # incoming operand (RS)
                         comb += done.eq(1) # FIXME l_out.done
 
+                ##########
+                # OP_MFSPR
+                ##########
+
                 with m.Case(MicrOp.OP_MFSPR):
                     comb += Display("MMUTEST: OP_MFSPR: spr=%i returns=%i",
                                     spr, spr1_i)
-                    comb += o.data.eq(spr1_i)
+                    # partial SPR number decoding perfectly fine
+                    with m.If(spr[9] | spr[5]):
+                        # identified as an MMU OP_MFSPR, contact the MMU.
+                        # interestingly, the read is combinatorial: no need
+                        # to set "valid", just set the SPR number
+                        comb += l_in.sprn.eq(spr)  # which SPR
+                        comb += o.data.eq(l_out.sprval)
+                    with m.Else():
+                        # identified as DSISR or DAR.  again: read the SPR
+                        # directly, combinatorial access
+                        with m.If(spr[0]):
+                            comb += o.data.eq(ldst.dar)
+                        with m.Else():
+                            comb += o.data.eq(ldst.dsisr)
+
                     comb += o.ok.eq(1)
                     comb += done.eq(1)
 
+                ##########
+                # OP_TLBIE
+                ##########
+
                 with m.Case(MicrOp.OP_TLBIE):
                     comb += Display("MMUTEST: OP_TLBIE: insn_bits=%i", spr)
                     # pass TLBIE request to MMU (spec: v3.0B p1034)
@@ -187,6 +211,33 @@ class FSMMMUStage(ControlBase):
                     comb += done.eq(l_out.done) # zzzz
                     comb += self.debug0.eq(2)
 
+                ##########
+                # OP_FETCH_FAILED
+                ##########
+
+                with m.Case(MicrOp.OP_FETCH_FAILED):
+                    comb += Display("MMUTEST: OP_FETCH_FAILED: @%x", cia_i)
+                    # trigger an instruction fetch failed MMU event.
+                    # PowerDecoder2 drops svstate.pc into NIA for us
+                    # really, this should be direct communication with the
+                    # MMU, rather than going through LoadStore1.  but, doing
+                    # so allows for the opportunity to prevent LoadStore1
+                    # from accepting any other LD/ST requests.
+                    comb += valid.eq(1)   # start "pulse"
+                    comb += ldst.instr_fault.eq(blip)
+                    comb += ldst.priv_mode.eq(~msr_i[MSR.PR])
+                    comb += ldst.maddr.eq(cia_i)
+                    # XXX should not access this!
+                    comb += done.eq(ldst.done)
+                    comb += self.debug0.eq(3)
+                    # LDST unit contains exception data, which (messily)
+                    # is copied over, here.  not ideal but it will do for now
+                    comb += exc_o.eq(ldst.pi.exc_o)
+
+                ############
+                # OP_ILLEGAL
+                ############
+
                 with m.Case(MicrOp.OP_ILLEGAL):
                     comb += self.illegal.eq(1)
 
index 109d2d389327f646404df4dfcaf5ba324b42c466..aea08bc8a31c9ea6dadd4cf2cbad2af6ccd69202 100644 (file)
@@ -13,7 +13,8 @@ class CompMMUOpSubset(CompOpSubsetBase):
         layout = (('insn_type', MicrOp),
                   ('fn_unit', Function),
                   ('insn', 32),
-                  ('msr', 64), # TODO: a lot less bits.  only need PR, DR, SF
+                  ('cia', 64), # for instruction fault (MMU PTE lookup)
+                  ('msr', 64), # ditto, to set priv_mode etc.
                   ('zero_a', 1),
                   )
         super().__init__(layout, name=name)
index bc86e29151d060679cca0818cf485641843d184b..7272a2256a3117fa2f554fe617e1eec75d7e1d84 100644 (file)
@@ -13,6 +13,7 @@ Links:
 from soc.fu.pipe_data import FUBaseData
 from soc.fu.mmu.mmu_input_record import CompMMUOpSubset
 from soc.fu.alu.pipe_data import CommonPipeSpec
+from openpower.exceptions import LDSTException
 
 
 class MMUInputData(FUBaseData):
@@ -32,9 +33,9 @@ class MMUOutputData(FUBaseData):
                ('SPR', 'spr1', '0:63'),     # MMU (slow)
                ]
     def __init__(self, pspec):
-        super().__init__(pspec, True)
+        super().__init__(pspec, True, LDSTException)
 
 
 class MMUPipeSpec(CommonPipeSpec):
-    regspec = (MMUInputData.regspec, MMUOutputData.regspec)
+    regspecklses = (MMUInputData, MMUOutputData)
     opsubsetkls = CompMMUOpSubset
index 0bb6ecad6a25318fadeb2e78b277e622915f5c2b..f5919b9a9dc9bc050bda852b1128c85d8e2e5ead 100644 (file)
@@ -13,33 +13,66 @@ class MMUTestCase(TestAccumulatorBase):
     # libre-soc has own SPR unit
     # other instructions here -> must be load/store
 
-    def case_mmu_ldst(self):
+    def cse_dcbz(self):
         lst = [
                 "dcbz 1,2",
+              ]
+
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x2
+        initial_regs[2] = 0x2020
+
+        self.add_case(Program(lst, bigendian),
+                      initial_regs, initial_mem={})
+
+    def case_mmu_dar(self):
+        lst = [
+                "mfspr 1, 720",     # DAR to reg 1
+                "mtspr 19, 3",      # reg 3 to DAR
+              ]
+
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x2
+        initial_regs[3] = 0x5
+
+        initial_sprs = {'DAR': 0x87654321,
+                        }
+        self.add_case(Program(lst, bigendian),
+                      initial_regs, initial_sprs, initial_mem={})
+
+    def case_mmu_ldst(self):
+        lst = [
+                "dcbz 1,0",
                 "tlbie 0,0,0,0,0", # RB,RS,RIC,PRS,R
                 "mtspr 18, 1",     # reg 1 to DSISR
                 "mtspr 19, 2",     # reg 2 to DAR
-                "mfspr 1, 18",     # DSISR to reg 1
-                "mfspr 2, 19",     # DAR to reg 2
+                "mfspr 5, 18",     # DSISR to reg 5
+                "mfspr 6, 19",     # DAR to reg 6
                 "mtspr 48, 3",    # set MMU PID
                 "mtspr 720, 4",    # set MMU PRTBL
-                "lhz 3, 0(1)"      # load some data
+                "lhz 3, 0(1)",     # load some data
+                "addi 7, 0, 1"
               ]
 
         initial_regs = [0] * 32
-        initial_regs[3] = 1
+        initial_regs[1] = 0x2
+        initial_regs[2] = 0x2020
+        initial_regs[3] = 5
         initial_regs[4] = 0xDEADBEEF
-        #initial_regs[1] = 0xDEADBEEF
 
-        #FIXME initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
-        initial_sprs = {}
+        initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321,
+                        'PIDR': 0xabcd, 'PRTBL': 0x0def}
         self.add_case(Program(lst, bigendian),
-                      initial_regs, initial_sprs)
+                      initial_regs, initial_sprs, initial_mem={})
 
 
 if __name__ == "__main__":
+    mem = {}
     unittest.main(exit=False)
     suite = unittest.TestSuite()
-    suite.addTest(TestRunner(MMUTestCase().test_data,microwatt_mmu=True))
+    suite.addTest(TestRunner(MMUTestCase().test_data,
+                             microwatt_mmu=True,
+                             svp64=False,
+                             rom=mem))
     runner = unittest.TextTestRunner()
     runner.run(suite)
index dc7d5c62846ee252cb3784d71d8d6e3e85e3a67e..e234ac22f524d9262a4085506aa19b91e7958d2b 100644 (file)
@@ -30,26 +30,27 @@ from soc.simple.test.test_core import (setup_regs, check_regs,
 
 debughang = 2
 
+
 class MMUTestCase(TestAccumulatorBase):
     # MMU handles MTSPR, MFSPR, DCBZ and TLBIE.
     # other instructions here -> must be load/store
 
     def case_mfspr_after_invalid_load(self):
-        lst = [ # TODO -- set SPR on both sinulator and port interface
-                "mfspr 1, 18", # DSISR to reg 1
-                "mfspr 2, 19", # DAR to reg 2
-                # TODO -- verify returned sprvals
-              ]
+        lst = [  # TODO -- set SPR on both sinulator and port interface
+            "mfspr 1, 18",  # DSISR to reg 1
+            "mfspr 2, 19",  # DAR to reg 2
+            # TODO -- verify returned sprvals
+        ]
 
         initial_regs = [0] * 32
 
-        #THOSE are currently broken -- initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
+        # THOSE are currently broken -- initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
         initial_sprs = {}
         self.add_case(Program(lst, bigendian),
                       initial_regs, initial_sprs)
 
-    #def case_ilang(self):
-    #    pspec = SPRPipeSpec(id_wid=2)
+    # def case_ilang(self):
+    #    pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
     #    alu = SPRBasePipe(pspec)
     #    vl = rtlil.convert(alu, ports=alu.ports())
     #    with open("trap_pipeline.il", "w") as f:
@@ -105,9 +106,11 @@ class TestRunner(unittest.TestCase):
             vld = yield fsm.n.o_valid
             while not vld:
                 yield
-                if debughang:  print("not valid -- hang")
+                if debughang:
+                    print("not valid -- hang")
                 vld = yield fsm.n.o_valid
-                if debughang==2: vld=1
+                if debughang == 2:
+                    vld = 1
             yield
 
     def run_all(self):
@@ -126,10 +129,10 @@ class TestRunner(unittest.TestCase):
                              reg_wid=64)
 
         m.submodules.core = core = NonProductionCore(pspec
-                                     # XXX NO absolutely do not do this.
-                                     # all options must go into the pspec
-                                     #, microwatt_mmu=True
-                                                        )
+                                                     # XXX NO absolutely do not do this.
+                                                     # all options must go into the pspec
+                                                     # , microwatt_mmu=True
+                                                     )
 
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
@@ -149,6 +152,7 @@ class TestRunner(unittest.TestCase):
                            traces=[]):
             sim.run()
 
+
 if __name__ == "__main__":
     unittest.main(exit=False)
     suite = unittest.TestSuite()
index 0701dd933ab8ceb7c45f10eb27df0b5261afe17c..e81dd174263a910be21531f13f080bad54cb7cb7 100644 (file)
@@ -31,6 +31,7 @@ import power_instruction_analyzer as pia
 
 debughang = 1
 
+
 def set_fsm_inputs(alu, dec2, sim):
     # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
     # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
@@ -45,16 +46,16 @@ def set_fsm_inputs(alu, dec2, sim):
     # yield from ALUHelpers.set_spr_spr1(alu, dec2, inp)
 
     overflow = None
-    a=None
-    b=None
+    a = None
+    b = None
     # TODO
     if 'xer_so' in inp:
         print("xer_so::::::::::::::::::::::::::::::::::::::::::::::::")
         so = inp['xer_so']
         print(so)
         overflow = pia.OverflowFlags(so=bool(so),
-                                      ov=False,
-                                      ov32=False)
+                                     ov=False,
+                                     ov32=False)
     if 'ra' in inp:
         a = inp['ra']
     if 'rb' in inp:
@@ -65,12 +66,14 @@ def set_fsm_inputs(alu, dec2, sim):
 
 def check_fsm_outputs(fsm, pdecode2, sim, code):
     # check that MMUOutputData is correct
-    return None #TODO
+    return None  # TODO
+
+# incomplete test - connect fsm inputs first
+
 
-#incomplete test - connect fsm inputs first
 class MMUIlangCase(TestAccumulatorBase):
-    #def case_ilang(self):
-    #    pspec = SPRPipeSpec(id_wid=2)
+    # def case_ilang(self):
+    #    pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
     #    alu = SPRBasePipe(pspec)
     #    vl = rtlil.convert(alu, ports=alu.ports())
     #    with open("trap_pipeline.il", "w") as f:
@@ -82,7 +85,7 @@ class TestRunner(unittest.TestCase):
     def __init__(self, test_data):
         super().__init__("run_all")
         self.test_data = test_data
-        #hack here -- all unit tests are affected
+        # hack here -- all unit tests are affected
         self.run_all()
 
     def check_fsm_outputs(self, alu, dec2, sim, code, pia_res):
@@ -98,26 +101,25 @@ class TestRunner(unittest.TestCase):
         sim_o = {}
         res = {}
 
-        #MMUOutputData does not have xer
+        # MMUOutputData does not have xer
 
         yield from ALUHelpers.get_cr_a(res, alu, dec2)
-        #yield from ALUHelpers.get_xer_ov(res, alu, dec2)
+        # yield from ALUHelpers.get_xer_ov(res, alu, dec2)
         yield from ALUHelpers.get_int_o(res, alu, dec2)
-        #yield from ALUHelpers.get_xer_so(res, alu, dec2)
-
+        # yield from ALUHelpers.get_xer_so(res, alu, dec2)
 
         print("res output", res)
 
         yield from ALUHelpers.get_sim_int_o(sim_o, sim, dec2)
         yield from ALUHelpers.get_wr_sim_cr_a(sim_o, sim, dec2)
-        #yield from ALUHelpers.get_sim_xer_ov(sim_o, sim, dec2)
-        #yield from ALUHelpers.get_sim_xer_so(sim_o, sim, dec2)
+        # yield from ALUHelpers.get_sim_xer_ov(sim_o, sim, dec2)
+        # yield from ALUHelpers.get_sim_xer_so(sim_o, sim, dec2)
 
         print("sim output", sim_o)
 
         print("power-instruction-analyzer result:")
         print(pia_res)
-        #if pia_res is not None:
+        # if pia_res is not None:
         #    with self.subTest(check="pia", sim_o=sim_o, pia_res=str(pia_res)):
         #        pia_o = pia_res_to_output(pia_res)
         #        ALUHelpers.check_int_o(self, res, pia_o, code)
@@ -126,15 +128,15 @@ class TestRunner(unittest.TestCase):
         #        #ALUHelpers.check_xer_so(self, res, pia_o, code)
 
         with self.subTest(check="sim", sim_o=sim_o, pia_res=str(pia_res)):
-            #ALUHelpers.check_int_o(self, res, sim_o, code) # mmu is not an alu
+            # ALUHelpers.check_int_o(self, res, sim_o, code) # mmu is not an alu
             ALUHelpers.check_cr_a(self, res, sim_o, code)
             #ALUHelpers.check_xer_ov(self, res, sim_o, code)
             #ALUHelpers.check_xer_so(self, res, sim_o, code)
 
-        #oe = yield dec2.e.do.oe.oe
-        #oe_ok = yield dec2.e.do.oe.ok
+        # oe = yield dec2.e.do.oe.oe
+        # oe_ok = yield dec2.e.do.oe.ok
         #print("oe, oe_ok", oe, oe_ok)
-        #if not oe or not oe_ok:
+        # if not oe or not oe_ok:
         #    # if OE not enabled, XER SO and OV must not be activated
         #    so_ok = yield alu.n.o_data.xer_so.ok
         #    ov_ok = yield alu.n.o_data.xer_ov.ok
@@ -181,7 +183,7 @@ class TestRunner(unittest.TestCase):
             print("dec2 spr/fast in", fast_out, spr_out)
 
             fn_unit = yield pdecode2.e.do.fn_unit
-            #FIXME this fails -- self.assertEqual(fn_unit, Function.SPR.value)
+            # FIXME this fails -- self.assertEqual(fn_unit, Function.SPR.value)
             pia_res = yield from set_fsm_inputs(fsm, pdecode2, sim)
             yield
             opname = code.split(' ')[0]
@@ -191,14 +193,15 @@ class TestRunner(unittest.TestCase):
             index = pc//4
             print("pc after %08x" % (pc))
 
-            vld = yield fsm.n.o_valid #fsm
+            vld = yield fsm.n.o_valid  # fsm
             while not vld:
                 yield
                 if debughang:
                     print("not valid -- hang")
                     return
                 vld = yield fsm.n.o_valid
-                if debughang==2: vld=1
+                if debughang == 2:
+                    vld = 1
             yield
 
             yield from self.check_fsm_outputs(fsm, pdecode2, sim, code, pia_res)
@@ -208,7 +211,7 @@ class TestRunner(unittest.TestCase):
         comb = m.d.comb
         instruction = Signal(32)
 
-        pspec = TestMemPspec(addr_wid=48,
+        pspec = TestMemPspec(addr_wid=64,
                              mask_wid=8,
                              reg_wid=64,
                              )
@@ -217,14 +220,14 @@ class TestRunner(unittest.TestCase):
 
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
 
-        pipe_spec = MMUPipeSpec(id_wid=2)
+        pipe_spec = MMUPipeSpec(id_wid=2, parent_pspec=None)
         ldst = LoadStore1(pspec)
         fsm = FSMMMUStage(pipe_spec)
         fsm.set_ldst_interface(ldst)
         m.submodules.fsm = fsm
         m.submodules.ldst = ldst
 
-        #FIXME connect fsm inputs
+        # FIXME connect fsm inputs
 
         comb += fsm.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
         comb += fsm.p.i_valid.eq(1)
@@ -247,6 +250,7 @@ class TestRunner(unittest.TestCase):
                            traces=[]):
             sim.run()
 
+
 if __name__ == "__main__":
     unittest.main(exit=False)
     suite = unittest.TestSuite()
index f1837baa2e0c5e3183a2fe84778f5c82f8785cee..a78294606b82f5c0a3aaced9051d8ee5eeeb6fe3 100644 (file)
@@ -84,18 +84,19 @@ class Driver(Elaboratable):
 
         # set up the mul stages.  do not add them to m.submodules, this
         # is handled by StageChain.setup().
-        pspec = MulPipeSpec(id_wid=2)
+        pspec = MulPipeSpec(id_wid=2, parent_pspec=None)
         pipe1 = MulMainStage1(pspec)
         pipe2 = MulMainStage2(pspec)
         pipe3 = MulMainStage3(pspec)
 
-        class Dummy: pass
-        dut = Dummy() # make a class into which dut.i and dut.o can be dropped
+        class Dummy:
+            pass
+        dut = Dummy()  # make a class into which dut.i and dut.o can be dropped
         dut.i = pipe1.ispec()
-        chain = [pipe1, pipe2, pipe3] # chain of 3 mul stages
+        chain = [pipe1, pipe2, pipe3]  # chain of 3 mul stages
 
-        StageChain(chain).setup(m, dut.i) # input linked here, through chain
-        dut.o = chain[-1].o # output is the last thing in the chain...
+        StageChain(chain).setup(m, dut.i)  # input linked here, through chain
+        dut.o = chain[-1].o  # output is the last thing in the chain...
 
         # convenience variables
         a = dut.i.ra
@@ -145,7 +146,7 @@ class Driver(Elaboratable):
         # setup random inputs
         comb += [a.eq(AnyConst(64)),
                  b.eq(AnyConst(64)),
-                ]
+                 ]
 
         comb += dut.i.ctx.op.eq(rec)
 
@@ -169,7 +170,7 @@ class Driver(Elaboratable):
             ###### HI-32 #####
 
             with m.Case(MicrOp.OP_MUL_H32):
-                comb += Assume(rec.is_32bit) # OP_MUL_H32 is a 32-bit op
+                comb += Assume(rec.is_32bit)  # OP_MUL_H32 is a 32-bit op
 
                 exp_prod = Signal(64)
                 expected_o = Signal.like(exp_prod)
@@ -186,7 +187,7 @@ class Driver(Elaboratable):
                     # differ, we negate the product.  This implies that
                     # the product is calculated from the absolute values
                     # of the inputs.
-                    prod = Signal.like(exp_prod) # intermediate product
+                    prod = Signal.like(exp_prod)  # intermediate product
                     comb += prod.eq(abs32_a * abs32_b)
                     comb += exp_prod.eq(Mux(ab32_sne, -prod, prod))
                     comb += expected_o.eq(Repl(exp_prod[32:64], 2))
@@ -210,7 +211,7 @@ class Driver(Elaboratable):
                     # differ, we negate the product.  This implies that
                     # the product is calculated from the absolute values
                     # of the inputs.
-                    prod = Signal.like(exp_prod) # intermediate product
+                    prod = Signal.like(exp_prod)  # intermediate product
                     comb += prod.eq(abs64_a * abs64_b)
                     comb += exp_prod.eq(Mux(ab64_sne, -prod, prod))
                     comb += Assert(o[0:64] == exp_prod[64:128])
@@ -285,6 +286,7 @@ class MulTestCase(FHDLTestCase):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=2)
         self.assertFormal(module, mode="cover", depth=2)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index a55e80d1d335d19bdb1ee04475291aaabc0d06fa..a5047be722cc36559018cfd9485c7b7b82ce70ac 100644 (file)
@@ -15,8 +15,6 @@ class MulIntermediateData(DivInputData):
 
 
 class MulOutputData(FUBaseData):
-    regspec = [('INT', 'o', '0:128'),
-               ('XER', 'xer_so', '32')] # XER bit 32: SO
     def __init__(self, pspec):
         super().__init__(pspec, False) # still input style
 
@@ -25,7 +23,12 @@ class MulOutputData(FUBaseData):
         self.data.append(self.neg_res)
         self.data.append(self.neg_res32)
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', "0:%d" % (self.pspec.XLEN*2)), # 2xXLEN
+               ('XER', 'xer_so', '32')] # XER bit 32: SO
+
 
 class MulPipeSpec(CommonPipeSpec):
-    regspec = (DivInputData.regspec, DivMulOutputData.regspec)
+    regspecklses = (DivInputData, DivMulOutputData)
     opsubsetkls = CompMULOpSubset
index c5e696ae415afbe03e294c4f386c22316c34c7d0..a8a7fb4e5201ad479d22a9a61c7d7bb7dfa14034 100644 (file)
@@ -18,6 +18,7 @@ class MulMainStage1(PipeModBase):
         return MulIntermediateData(self.pspec) # pipeline stage output format
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
 
@@ -35,8 +36,8 @@ class MulMainStage1(PipeModBase):
         comb += is_32bit.eq(op.is_32bit)
 
         # work out if a/b are negative (check 32-bit / signed)
-        comb += sign_a.eq(Mux(op.is_32bit, a[31], a[63]) & op.is_signed)
-        comb += sign_b.eq(Mux(op.is_32bit, b[31], b[63]) & op.is_signed)
+        comb += sign_a.eq(Mux(op.is_32bit, a[31], a[XLEN-1]) & op.is_signed)
+        comb += sign_b.eq(Mux(op.is_32bit, b[31], b[XLEN-1]) & op.is_signed)
         comb += sign32_a.eq(a[31] & op.is_signed)
         comb += sign32_b.eq(b[31] & op.is_signed)
 
@@ -47,8 +48,8 @@ class MulMainStage1(PipeModBase):
         # negation of a 64-bit value produces the same lower 32-bit
         # result as negation of just the lower 32-bits, so we don't
         # need to do anything special before negating
-        abs_a = Signal(64, reset_less=True)
-        abs_b = Signal(64, reset_less=True)
+        abs_a = Signal(XLEN, reset_less=True)
+        abs_b = Signal(XLEN, reset_less=True)
         comb += abs_a.eq(Mux(sign_a, -a, a))
         comb += abs_b.eq(Mux(sign_b, -b, b))
 
index bb9d8d8490235901a78cb1dd3fee08fda5c66ed3..30cb94966d3292916d94b7e809ad70f88e55e609 100644 (file)
@@ -146,7 +146,10 @@ class MulTestHelper(unittest.TestCase):
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
         pdecode = pdecode2.dec
 
-        pspec = MulPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = MulPipeSpec(id_wid=2, parent_pspec=pps)
         m.submodules.alu = alu = MulBasePipe(pspec)
 
         comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
index c711a651786b4602a1466b70d1f3a050a438fae4..afa4e00701b368cc6f8a1b4139d3ccb4e4923732 100644 (file)
@@ -7,11 +7,11 @@ from openpower.test.mul.long_mul_cases import (MulTestCases2Arg,
 
 class TestPipeLong(MulTestHelper):
     def test_mul_pipe_2_arg(self):
-        self.run_all(MulTestCases2Arg().test_data, "mul_pipe_caller_long_2_arg",
-                     has_third_input=False)
+        self.run_all(MulTestCases2Arg({'soc'}).test_data,
+                     "mul_pipe_caller_long_2_arg", has_third_input=False)
 
     def helper_3_arg(self, subtest_index):
-        self.run_all(MulTestCases3Arg(subtest_index).test_data,
+        self.run_all(MulTestCases3Arg(subtest_index, {'soc'}).test_data,
                      f"mul_pipe_caller_long_3_arg_{subtest_index}",
                      has_third_input=True)
 
index 22af35ba90037441175670866306bdd1c6743c82..7411b586b7ad8c5481c41bc27b4b2cf78ab155cf 100644 (file)
@@ -6,7 +6,10 @@ from soc.fu.mul.pipeline import MulBasePipe
 
 class TestPipeIlang(unittest.TestCase):
     def write_ilang(self):
-        pspec = MulPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = MulPipeSpec(id_wid=2, parent_pspec=pps)
         alu = MulBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("mul_pipeline.il", "w") as f:
index abee2df9a9c91050f7978ebac50b9b4acbd5e94e..427f5c6a0cdc52a7532a989e432782a9e57f1cb0 100644 (file)
@@ -17,12 +17,14 @@ class FUBaseData:
     """
 
     def __init__(self, pspec, output, exc_kls=None):
-        self.ctx = PipeContext(pspec) # context for ReservationStation usage
+        self.pspec = pspec
+        self.ctx = PipeContext(pspec)  # context for ReservationStation usage
         self.muxid = self.ctx.muxid
         self.data = []
         self.is_output = output
         # take regspec and create data attributes (in or out)
         # TODO: use widspec to create reduced bit mapping.
+        print (self.regspec)
         for i, (regfile, regname, widspec) in enumerate(self.regspec):
             wid = get_regspec_bitwidth([self.regspec], 0, i)
             if output:
@@ -42,22 +44,27 @@ class FUBaseData:
         if hasattr(self, "exception"):
             yield from self.exception.ports()
 
+    # convenience function to return 0:63 if XLEN=64, 0:31 if XLEN=32 etc.
+    @property
+    def intrange(self):
+        return "0:%d" % (self.pspec.XLEN-1)
+
     def eq(self, i):
         eqs = [self.ctx.eq(i.ctx)]
         assert len(self.data) == len(i.data), \
-               "length of %s mismatch against %s: %s %s" % \
-                   (repr(self), repr(i), repr(self.data), repr(i.data))
+            "length of %s mismatch against %s: %s %s" % \
+            (repr(self), repr(i), repr(self.data), repr(i.data))
         for j in range(len(self.data)):
             assert type(self.data[j]) == type(i.data[j]), \
-                   "type mismatch in FUBaseData %s %s" % \
-                   (repr(self.data[j]), repr(i.data[j]))
+                "type mismatch in FUBaseData %s %s" % \
+                (repr(self.data[j]), repr(i.data[j]))
             eqs.append(self.data[j].eq(i.data[j]))
         if hasattr(self, "exception"):
             eqs.append(self.exception.eq(i.exception))
         return eqs
 
     def ports(self):
-        return self.ctx.ports() # TODO: include self.data
+        return self.ctx.ports()  # TODO: include self.data
 
 
 # hmmm there has to be a better way than this
@@ -74,9 +81,27 @@ class CommonPipeSpec:
     """CommonPipeSpec: base class for all pipeline specifications
     see README.md for explanation of members.
     """
-    def __init__(self, id_wid):
+
+    def __init__(self, id_wid, parent_pspec):
         self.pipekls = SimpleHandshakeRedir
         self.id_wid = id_wid
         self.opkls = lambda _: self.opsubsetkls()
-        self.op_wid = get_rec_width(self.opkls(None)) # hmm..
+        self.op_wid = get_rec_width(self.opkls(None))  # hmm..
         self.stage = None
+        self.parent_pspec = parent_pspec
+
+    # forward attributes from parent_pspec
+    def __getattr__(self, name):
+        return getattr(self.parent_pspec, name)
+
+
+def get_pspec_draft_bitmanip(pspec):
+    """ True if the draft bitmanip instructions are enabled in the provided
+    pspec. The instructions enabled by this are draft instructions -- they are
+    not official OpenPower instructions, they are intended to be eventually
+    submitted to the OpenPower ISA WG.
+
+    https://libre-soc.org/openpower/sv/bitmanip/
+    """
+    # use `is True` to account for Mock absurdities
+    return getattr(pspec, "draft_bitmanip", False) is True
index 6804c593971fab5adcce20ddccaf797439a65e86..f5971aadff87b2cad67b9d6610ee929f2081f11f 100644 (file)
@@ -39,6 +39,7 @@ def get_regspec_bitwidth(regspec, srcdest, idx):
 class RegSpec:
     def __init__(self, rwid, n_src=None, n_dst=None, name=None):
         self._rwid = rwid
+        print ("RegSpec", rwid)
         if isinstance(rwid, int):
             # rwid: integer (covers all registers)
             self._n_src, self._n_dst = n_src, n_dst
index 5d8bae28fd3773f655679a9596902dc6644e2511..379211d623a01259f77c90229cae0d57f40228a7 100644 (file)
-# Proof of correctness for partitioned equal signal combiner
+# Proof of correctness for shift/rotate FU
 # Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
 """
 Links:
 * https://bugs.libre-soc.org/show_bug.cgi?id=340
+
+run tests with:
+pip install pytest
+pip install pytest-xdist
+pytest -n auto src/soc/fu/shift_rot/formal/proof_main_stage.py
+because that tells pytest to run the tests in parallel, it will take a few
+minutes instead of an hour.
 """
 
+import unittest
+import enum
 from nmigen import (Module, Signal, Elaboratable, Mux, Cat, Repl,
-                    signed)
-from nmigen.asserts import Assert, AnyConst, Assume, Cover
+                    signed, Const, unsigned)
+from nmigen.asserts import Assert, AnyConst, Assume
 from nmutil.formaltest import FHDLTestCase
-from nmigen.cli import rtlil
+from nmutil.sim_util import do_sim
+from nmigen.sim import Delay
 
 from soc.fu.shift_rot.main_stage import ShiftRotMainStage
-from soc.fu.shift_rot.rotator import right_mask, left_mask
 from soc.fu.shift_rot.pipe_data import ShiftRotPipeSpec
-from soc.fu.shift_rot.sr_input_record import CompSROpSubset
 from openpower.decoder.power_enums import MicrOp
-from openpower.consts import field
 
-import unittest
-from nmutil.extend import exts
+
+@enum.unique
+class TstOp(enum.Enum):
+    """ops we're testing, the idea is if we run a separate formal proof for
+    each instruction, we end up covering them all and each runs much faster,
+    also the formal proofs can be run in parallel."""
+    SHL = MicrOp.OP_SHL
+    SHR = MicrOp.OP_SHR
+    RLC32 = MicrOp.OP_RLC, 32
+    RLC64 = MicrOp.OP_RLC, 64
+    RLCL = MicrOp.OP_RLCL
+    RLCR = MicrOp.OP_RLCR
+    EXTSWSLI = MicrOp.OP_EXTSWSLI
+    TERNLOG = MicrOp.OP_TERNLOG
+    # grev removed -- leaving code for later use in grevlut
+    # GREV32 = MicrOp.OP_GREV, 32
+    # GREV64 = MicrOp.OP_GREV, 64
+
+    @property
+    def op(self):
+        if isinstance(self.value, tuple):
+            return self.value[0]
+        return self.value
+
+
+def eq_any_const(sig: Signal):
+    return sig.eq(AnyConst(sig.shape(), src_loc_at=1))
+
+
+class Mask(Elaboratable):
+    # copied from qemu's mask fn:
+    # https://gitlab.com/qemu-project/qemu/-/blob/477c3b934a47adf7de285863f59d6e4503dd1a6d/target/ppc/internal.h#L21
+    def __init__(self):
+        self.start = Signal(6)
+        self.end = Signal(6)
+        self.out = Signal(64)
+
+    def elaborate(self, platform):
+        m = Module()
+        max_val = Const(~0, unsigned(64))
+        max_bit = 63
+        with m.If(self.start == 0):
+            m.d.comb += self.out.eq(max_val << (max_bit - self.end))
+        with m.Elif(self.end == max_bit):
+            m.d.comb += self.out.eq(max_val >> self.start)
+        with m.Else():
+            ret = (max_val >> self.start) ^ ((max_val >> self.end) >> 1)
+            m.d.comb += self.out.eq(Mux(self.start > self.end, ~ret, ret))
+        return m
+
+
+class TstMask(unittest.TestCase):
+    def test_mask(self):
+        dut = Mask()
+
+        def case(start, end, expected):
+            with self.subTest(start=start, end=end):
+                yield dut.start.eq(start)
+                yield dut.end.eq(end)
+                yield Delay(1e-6)
+                out = yield dut.out
+                with self.subTest(out=hex(out), expected=hex(expected)):
+                    self.assertEqual(expected, out)
+
+        def process():
+            for start in range(64):
+                for end in range(64):
+                    expected = 0
+                    if start > end:
+                        for i in range(start, 64):
+                            expected |= 1 << (63 - i)
+                        for i in range(0, end + 1):
+                            expected |= 1 << (63 - i)
+                    else:
+                        for i in range(start, end + 1):
+                            expected |= 1 << (63 - i)
+                    yield from case(start, end, expected)
+        with do_sim(self, dut, [dut.start, dut.end, dut.out]) as sim:
+            sim.add_process(process)
+            sim.run()
+
+
+def rotl64(v, amt):
+    v |= Const(0, 64)  # convert to value at least 64-bits wide
+    amt |= Const(0, 6)  # convert to value at least 6-bits wide
+    return (Cat(v[:64], v[:64]) >> (64 - amt[:6]))[:64]
+
+
+def rotl32(v, amt):
+    v |= Const(0, 32)  # convert to value at least 32-bits wide
+    return rotl64(Cat(v[:32], v[:32]), amt)
 
 
 # This defines a module to drive the device under test and assert
 # properties about its outputs
 class Driver(Elaboratable):
-    def __init__(self):
-        # inputs and outputs
-        pass
+    def __init__(self, which):
+        assert isinstance(which, TstOp) or which is None
+        self.which = which
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
 
-        rec = CompSROpSubset()
-        # Setup random inputs for dut.op.  do them explicitly so that
-        # we can see which ones cause failures in the debug report
-        #for p in rec.ports():
-        #    comb += p.eq(AnyConst(p.width))
-        comb += rec.insn_type.eq(AnyConst(rec.insn_type.width))
-        comb += rec.fn_unit.eq(AnyConst(rec.fn_unit.width))
-        comb += rec.imm_data.imm.eq(AnyConst(rec.imm_data.imm.width))
-        comb += rec.imm_data.imm_ok.eq(AnyConst(rec.imm_data.imm_ok.width))
-        comb += rec.rc.rc.eq(AnyConst(rec.rc.rc.width))
-        comb += rec.rc.rc_ok.eq(AnyConst(rec.rc.rc_ok.width))
-        comb += rec.oe.oe.eq(AnyConst(rec.oe.oe.width))
-        comb += rec.oe.oe_ok.eq(AnyConst(rec.oe.oe_ok.width))
-        comb += rec.write_cr0.eq(AnyConst(rec.write_cr0.width))
-        comb += rec.input_carry.eq(AnyConst(rec.input_carry.width))
-        comb += rec.output_carry.eq(AnyConst(rec.output_carry.width))
-        comb += rec.input_cr.eq(AnyConst(rec.input_cr.width))
-        comb += rec.is_32bit.eq(AnyConst(rec.is_32bit.width))
-        comb += rec.is_signed.eq(AnyConst(rec.is_signed.width))
-        comb += rec.insn.eq(AnyConst(rec.insn.width))
-
-
-        pspec = ShiftRotPipeSpec(id_wid=2)
+        pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=None)
+        pspec.draft_bitmanip = True
         m.submodules.dut = dut = ShiftRotMainStage(pspec)
 
-        # convenience variables
-        rs = dut.i.rs  # register to shift
-        b = dut.i.rb   # register containing amount to shift by
-        ra = dut.i.a   # source register if masking is to be done
-        carry_in = dut.i.xer_ca[0]
-        carry_in32 = dut.i.xer_ca[1]
-        carry_out = dut.o.xer_ca
-        o = dut.o.o.data
-        print ("fields", rec.fields)
-        itype = rec.insn_type
-
-        # instruction fields
-        m_fields = dut.fields.FormM
-        md_fields = dut.fields.FormMD
-
-        # setup random inputs
-        comb += rs.eq(AnyConst(64))
-        comb += ra.eq(AnyConst(64))
-        comb += b.eq(AnyConst(64))
-        comb += carry_in.eq(AnyConst(1))
-        comb += carry_in32.eq(AnyConst(1))
-
-        # copy operation
-        comb += dut.i.ctx.op.eq(rec)
+        # Set inputs to formal variables
+        comb += [
+            eq_any_const(dut.i.ctx.op.insn_type),
+            eq_any_const(dut.i.ctx.op.fn_unit),
+            eq_any_const(dut.i.ctx.op.imm_data.data),
+            eq_any_const(dut.i.ctx.op.imm_data.ok),
+            eq_any_const(dut.i.ctx.op.rc.rc),
+            eq_any_const(dut.i.ctx.op.rc.ok),
+            eq_any_const(dut.i.ctx.op.oe.oe),
+            eq_any_const(dut.i.ctx.op.oe.ok),
+            eq_any_const(dut.i.ctx.op.write_cr0),
+            eq_any_const(dut.i.ctx.op.input_carry),
+            eq_any_const(dut.i.ctx.op.output_carry),
+            eq_any_const(dut.i.ctx.op.input_cr),
+            eq_any_const(dut.i.ctx.op.is_32bit),
+            eq_any_const(dut.i.ctx.op.is_signed),
+            eq_any_const(dut.i.ctx.op.insn),
+            eq_any_const(dut.i.xer_ca),
+            eq_any_const(dut.i.ra),
+            eq_any_const(dut.i.rb),
+            eq_any_const(dut.i.rc),
+        ]
 
         # check that the operation (op) is passed through (and muxid)
         comb += Assert(dut.o.ctx.op == dut.i.ctx.op)
         comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid)
 
-        # signed and signed/32 versions of input rs
-        a_signed = Signal(signed(64))
-        a_signed_32 = Signal(signed(32))
-        comb += a_signed.eq(rs)
-        comb += a_signed_32.eq(rs[0:32])
-
-        # masks: start-left
-        mb = Signal(7, reset_less=True)
-        ml = Signal(64, reset_less=True)
-
-        # clear left?
-        with m.If((itype == MicrOp.OP_RLC) | (itype == MicrOp.OP_RLCL)):
-            with m.If(rec.is_32bit):
-                comb += mb.eq(m_fields.MB)
-            with m.Else():
-                comb += mb.eq(md_fields.mb)
-        with m.Else():
-            with m.If(rec.is_32bit):
-                comb += mb.eq(b[0:6])
-            with m.Else():
-                comb += mb.eq(b+32)
-        comb += ml.eq(left_mask(m, mb))
-
-        # masks: end-right
-        me = Signal(7, reset_less=True)
-        mr = Signal(64, reset_less=True)
-
-        # clear right?
-        with m.If((itype == MicrOp.OP_RLC) | (itype == MicrOp.OP_RLCR)):
-            with m.If(rec.is_32bit):
-                comb += me.eq(m_fields.ME)
-            with m.Else():
-                comb += me.eq(md_fields.me)
-        with m.Else():
-            with m.If(rec.is_32bit):
-                comb += me.eq(b[0:6])
-            with m.Else():
-                comb += me.eq(63-b)
-        comb += mr.eq(right_mask(m, me))
-
-        # must check Data.ok
-        o_ok = Signal()
-        comb += o_ok.eq(1)
-
-        # main assertion of arithmetic operations
-        with m.Switch(itype):
-
-            # left-shift: 64/32-bit
-            with m.Case(MicrOp.OP_SHL):
-                comb += Assume(ra == 0)
-                with m.If(rec.is_32bit):
-                    comb += Assert(o[0:32] == ((rs << b[0:6]) & 0xffffffff))
-                    comb += Assert(o[32:64] == 0)
-                with m.Else():
-                    comb += Assert(o == ((rs << b[0:7]) & ((1 << 64)-1)))
-
-            # right-shift: 64/32-bit / signed
-            with m.Case(MicrOp.OP_SHR):
-                comb += Assume(ra == 0)
-                with m.If(~rec.is_signed):
-                    with m.If(rec.is_32bit):
-                        comb += Assert(o[0:32] == (rs[0:32] >> b[0:6]))
-                        comb += Assert(o[32:64] == 0)
-                    with m.Else():
-                        comb += Assert(o == (rs >> b[0:7]))
-                with m.Else():
-                    with m.If(rec.is_32bit):
-                        comb += Assert(o[0:32] == (a_signed_32 >> b[0:6]))
-                        comb += Assert(o[32:64] == Repl(rs[31], 32))
-                    with m.Else():
-                        comb += Assert(o == (a_signed >> b[0:7]))
-
-            # extswsli: 32/64-bit moded
-            with m.Case(MicrOp.OP_EXTSWSLI):
-                comb += Assume(ra == 0)
-                with m.If(rec.is_32bit):
-                    comb += Assert(o[0:32] == ((rs << b[0:6]) & 0xffffffff))
-                    comb += Assert(o[32:64] == 0)
-                with m.Else():
-                    # sign-extend to 64 bit
-                    a_s = Signal(64, reset_less=True)
-                    comb += a_s.eq(exts(rs, 32, 64))
-                    comb += Assert(o == ((a_s << b[0:7]) & ((1 << 64)-1)))
-
-            # rlwinm, rlwnm, rlwimi
-            # *CAN* these even be 64-bit capable?  I don't think they are.
-            with m.Case(MicrOp.OP_RLC):
-                comb += Assume(ra == 0)
-                comb += Assume(rec.is_32bit)
-
-                # Duplicate some signals so that they're much easier to find
-                # in gtkwave.
-                # Pro-tip: when debugging, factor out expressions into
-                # explicitly named
-                # signals, and search using a unique grep-tag (RLC in my case).
-                #   After
-                # debugging, resubstitute values to comply with surrounding
-                # code norms.
-
-                mrl = Signal(64, reset_less=True, name='MASK_FOR_RLC')
-                with m.If(mb > me):
-                    comb += mrl.eq(ml | mr)
-                with m.Else():
-                    comb += mrl.eq(ml & mr)
-
-                ainp = Signal(64, reset_less=True, name='A_INP_FOR_RLC')
-                comb += ainp.eq(field(rs, 32, 63))
-
-                sh = Signal(6, reset_less=True, name='SH_FOR_RLC')
-                comb += sh.eq(b[0:6])
-
-                exp_shl = Signal(64, reset_less=True,
-                                    name='A_SHIFTED_LEFT_BY_SH_FOR_RLC')
-                comb += exp_shl.eq((ainp << sh) & 0xFFFFFFFF)
-
-                exp_shr = Signal(64, reset_less=True,
-                                    name='A_SHIFTED_RIGHT_FOR_RLC')
-                comb += exp_shr.eq((ainp >> (32 - sh)) & 0xFFFFFFFF)
-
-                exp_rot = Signal(64, reset_less=True,
-                                    name='A_ROTATED_LEFT_FOR_RLC')
-                comb += exp_rot.eq(exp_shl | exp_shr)
-
-                exp_ol = Signal(32, reset_less=True, name='EXPECTED_OL_FOR_RLC')
-                comb += exp_ol.eq(field((exp_rot & mrl) | (ainp & ~mrl),
-                                    32, 63))
-
-                act_ol = Signal(32, reset_less=True, name='ACTUAL_OL_FOR_RLC')
-                comb += act_ol.eq(field(o, 32, 63))
-
-                # If I uncomment the following lines, I can confirm that all
-                # 32-bit rotations work.  If I uncomment only one of the
-                # following lines, I can confirm that all 32-bit rotations
-                # work.  When I remove/recomment BOTH lines, however, the
-                # assertion fails.  Why??
-
-#               comb += Assume(mr == 0xFFFFFFFF)
-#               comb += Assume(ml == 0xFFFFFFFF)
-                #with m.If(rec.is_32bit):
-                #    comb += Assert(act_ol == exp_ol)
-                #    comb += Assert(field(o, 0, 31) == 0)
-
-            #TODO
-            with m.Case(MicrOp.OP_RLCR):
-                pass
-            with m.Case(MicrOp.OP_RLCL):
-                pass
-            with m.Default():
-                comb += o_ok.eq(0)
-
-        # check that data ok was only enabled when op actioned
-        comb += Assert(dut.o.o.ok == o_ok)
+        if self.which is None:
+            for i in TstOp:
+                comb += Assume(dut.i.ctx.op.insn_type != i.op)
+            comb += Assert(~dut.o.o.ok)
+        else:
+            # we're only checking a particular operation:
+            comb += Assume(dut.i.ctx.op.insn_type == self.which.op)
+            comb += Assert(dut.o.o.ok)
+
+            # dispatch to check fn for each op
+            getattr(self, f"_check_{self.which.name.lower()}")(m, dut)
 
         return m
 
+    def _check_shl(self, m, dut):
+        m.d.comb += Assume(dut.i.ra == 0)
+        expected = Signal(64)
+        with m.If(dut.i.ctx.op.is_32bit):
+            m.d.comb += expected.eq((dut.i.rs << dut.i.rb[:6])[:32])
+        with m.Else():
+            m.d.comb += expected.eq((dut.i.rs << dut.i.rb[:7])[:64])
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_shr(self, m, dut):
+        m.d.comb += Assume(dut.i.ra == 0)
+        expected = Signal(64)
+        carry = Signal()
+        shift_in_s = Signal(signed(128))
+        shift_roundtrip = Signal(signed(128))
+        shift_in_u = Signal(128)
+        shift_amt = Signal(7)
+        with m.If(dut.i.ctx.op.is_32bit):
+            m.d.comb += [
+                shift_amt.eq(dut.i.rb[:6]),
+                shift_in_s.eq(dut.i.rs[:32].as_signed()),
+                shift_in_u.eq(dut.i.rs[:32]),
+            ]
+        with m.Else():
+            m.d.comb += [
+                shift_amt.eq(dut.i.rb[:7]),
+                shift_in_s.eq(dut.i.rs.as_signed()),
+                shift_in_u.eq(dut.i.rs),
+            ]
+
+        with m.If(dut.i.ctx.op.is_signed):
+            m.d.comb += [
+                expected.eq(shift_in_s >> shift_amt),
+                shift_roundtrip.eq((shift_in_s >> shift_amt) << shift_amt),
+                carry.eq((shift_in_s < 0) & (shift_roundtrip != shift_in_s)),
+            ]
+        with m.Else():
+            m.d.comb += [
+                expected.eq(shift_in_u >> shift_amt),
+                carry.eq(0),
+            ]
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == Repl(carry, 2))
+
+    def _check_rlc32(self, m, dut):
+        m.d.comb += Assume(dut.i.ctx.op.is_32bit)
+        # rlwimi, rlwinm, and rlwnm
+
+        m.submodules.mask = mask = Mask()
+        expected = Signal(64)
+        rot = Signal(64)
+        m.d.comb += rot.eq(rotl32(dut.i.rs[:32], dut.i.rb[:5]))
+        m.d.comb += mask.start.eq(dut.fields.FormM.MB[:] + 32)
+        m.d.comb += mask.end.eq(dut.fields.FormM.ME[:] + 32)
+
+        # for rlwinm and rlwnm, ra is guaranteed to be 0, so that part of
+        # the expression turns into a no-op
+        m.d.comb += expected.eq((rot & mask.out) | (dut.i.ra & ~mask.out))
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_rlc64(self, m, dut):
+        m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+        # rldic and rldimi
+
+        # `rb` is always a 6-bit immediate
+        m.d.comb += Assume(dut.i.rb[6:] == 0)
+
+        m.submodules.mask = mask = Mask()
+        expected = Signal(64)
+        rot = Signal(64)
+        m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+        mb = dut.fields.FormMD.mb[:]
+        m.d.comb += mask.start.eq(Cat(mb[1:6], mb[0]))
+        m.d.comb += mask.end.eq(63 - dut.i.rb[:6])
+
+        # for rldic, ra is guaranteed to be 0, so that part of
+        # the expression turns into a no-op
+        m.d.comb += expected.eq((rot & mask.out) | (dut.i.ra & ~mask.out))
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_rlcl(self, m, dut):
+        m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+        # rldicl and rldcl
+
+        m.d.comb += Assume(~dut.i.ctx.op.is_signed)
+        m.d.comb += Assume(dut.i.ra == 0)
+
+        m.submodules.mask = mask = Mask()
+        m.d.comb += mask.end.eq(63)
+        mb = dut.fields.FormMD.mb[:]
+        m.d.comb += mask.start.eq(Cat(mb[1:6], mb[0]))
+
+        rot = Signal(64)
+        m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+
+        expected = Signal(64)
+        m.d.comb += expected.eq(rot & mask.out)
+
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_rlcr(self, m, dut):
+        m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+        # rldicr and rldcr
+
+        m.d.comb += Assume(~dut.i.ctx.op.is_signed)
+        m.d.comb += Assume(dut.i.ra == 0)
+
+        m.submodules.mask = mask = Mask()
+        m.d.comb += mask.start.eq(0)
+        me = dut.fields.FormMD.me[:]
+        m.d.comb += mask.end.eq(Cat(me[1:6], me[0]))
+
+        rot = Signal(64)
+        m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+
+        expected = Signal(64)
+        m.d.comb += expected.eq(rot & mask.out)
+
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_extswsli(self, m, dut):
+        m.d.comb += Assume(dut.i.ra == 0)
+        m.d.comb += Assume(dut.i.rb[6:] == 0)
+        m.d.comb += Assume(~dut.i.ctx.op.is_32bit)  # all instrs. are 64-bit
+        expected = Signal(64)
+        m.d.comb += expected.eq((dut.i.rs[0:32].as_signed() << dut.i.rb[:6]))
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_ternlog(self, m, dut):
+        lut = dut.fields.FormTLI.TLI[:]
+        for i in range(64):
+            idx = Cat(dut.i.rb[i], dut.i.ra[i], dut.i.rc[i])
+            for j in range(8):
+                with m.If(j == idx):
+                    m.d.comb += Assert(dut.o.o.data[i] == lut[j])
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    # grev removed -- leaving code for later use in grevlut
+    def _check_grev32(self, m, dut):
+        m.d.comb += Assume(dut.i.ctx.op.is_32bit)
+        # assert zero-extended
+        m.d.comb += Assert(dut.o.o.data[32:] == 0)
+        i = Signal(5)
+        m.d.comb += eq_any_const(i)
+        idx = dut.i.rb[0: 5] ^ i
+        m.d.comb += Assert((dut.o.o.data >> i)[0] == (dut.i.ra >> idx)[0])
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    # grev removed -- leaving code for later use in grevlut
+    def _check_grev64(self, m, dut):
+        m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+        i = Signal(6)
+        m.d.comb += eq_any_const(i)
+        idx = dut.i.rb[0: 6] ^ i
+        m.d.comb += Assert((dut.o.o.data >> i)[0] == (dut.i.ra >> idx)[0])
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
 
 class ALUTestCase(FHDLTestCase):
-    def test_formal(self):
-        module = Driver()
+    def run_it(self, which):
+        module = Driver(which)
         self.assertFormal(module, mode="bmc", depth=2)
         self.assertFormal(module, mode="cover", depth=2)
-    def test_ilang(self):
-        dut = Driver()
-        vl = rtlil.convert(dut, ports=[])
-        with open("main_stage.il", "w") as f:
-            f.write(vl)
+
+    def test_none(self):
+        self.run_it(None)
+
+    def test_shl(self):
+        self.run_it(TstOp.SHL)
+
+    def test_shr(self):
+        self.run_it(TstOp.SHR)
+
+    def test_rlc32(self):
+        self.run_it(TstOp.RLC32)
+
+    def test_rlc64(self):
+        self.run_it(TstOp.RLC64)
+
+    def test_rlcl(self):
+        self.run_it(TstOp.RLCL)
+
+    def test_rlcr(self):
+        self.run_it(TstOp.RLCR)
+
+    def test_extswsli(self):
+        self.run_it(TstOp.EXTSWSLI)
+
+    def test_ternlog(self):
+        self.run_it(TstOp.TERNLOG)
+
+    @unittest.skip("grev removed -- leaving code for later use in grevlut")
+    def test_grev32(self):
+        self.run_it(TstOp.GREV32)
+
+    @unittest.skip("grev removed -- leaving code for later use in grevlut")
+    def test_grev64(self):
+        self.run_it(TstOp.GREV64)
+
+
+# check that all test cases are covered
+for i in TstOp:
+    assert callable(getattr(ALUTestCase, f"test_{i.name.lower()}"))
 
 
 if __name__ == '__main__':
index b8ec704199a800c6df652524612581e07d885bb2..2735927839b73d1d8f13f9713f9c9f1fe3b00192 100644 (file)
@@ -8,6 +8,7 @@
 # output stage
 from nmigen import (Module, Signal, Cat, Repl, Mux, Const)
 from nmutil.pipemodbase import PipeModBase
+from soc.fu.pipe_data import get_pspec_draft_bitmanip
 from soc.fu.shift_rot.pipe_data import (ShiftRotOutputData,
                                         ShiftRotInputData)
 from nmutil.lut import BitwiseLut
@@ -21,6 +22,7 @@ from openpower.decoder.power_fieldsn import SignalBitRange
 class ShiftRotMainStage(PipeModBase):
     def __init__(self, pspec):
         super().__init__(pspec, "main")
+        self.draft_bitmanip = get_pspec_draft_bitmanip(pspec)
         self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
         self.fields.create_specs()
 
@@ -31,16 +33,19 @@ class ShiftRotMainStage(PipeModBase):
         return ShiftRotOutputData(self.pspec)
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
         op = self.i.ctx.op
         o = self.o.o
 
-        bitwise_lut = BitwiseLut(input_count=3, width=64)
-        m.submodules.bitwise_lut = bitwise_lut
-        comb += bitwise_lut.inputs[0].eq(self.i.rb)
-        comb += bitwise_lut.inputs[1].eq(self.i.ra)
-        comb += bitwise_lut.inputs[2].eq(self.i.rc)
+        bitwise_lut = None
+        if self.draft_bitmanip:
+            bitwise_lut = BitwiseLut(input_count=3, width=XLEN)
+            m.submodules.bitwise_lut = bitwise_lut
+            comb += bitwise_lut.inputs[0].eq(self.i.rb)
+            comb += bitwise_lut.inputs[1].eq(self.i.ra)
+            comb += bitwise_lut.inputs[2].eq(self.i.rc)
 
         # NOTE: the sh field immediate is read in by PowerDecode2
         # (actually DecodeRB), whereupon by way of rb "immediate" mode
@@ -57,7 +62,7 @@ class ShiftRotMainStage(PipeModBase):
         comb += mb_extra.eq(md_fields['mb'][0:-1][0])
 
         # set up microwatt rotator module
-        m.submodules.rotator = rotator = Rotator()
+        m.submodules.rotator = rotator = Rotator(XLEN)
         comb += [
             rotator.me.eq(me),
             rotator.mb.eq(mb),
@@ -71,12 +76,17 @@ class ShiftRotMainStage(PipeModBase):
 
         comb += o.ok.eq(1)  # defaults to enabled
 
+        # instruction rotate type
+        mode = Signal(4, reset_less=True)
+        comb += Cat(rotator.right_shift,
+                    rotator.clear_left,
+                    rotator.clear_right,
+                    rotator.sign_ext_rs).eq(mode)
+
         # outputs from the microwatt rotator module
         comb += [o.data.eq(rotator.result_o),
                  self.o.xer_ca.data.eq(Repl(rotator.carry_out_o, 2))]
 
-        # instruction rotate type
-        mode = Signal(4, reset_less=True)
         with m.Switch(op.insn_type):
             with m.Case(MicrOp.OP_SHL):
                 comb += mode.eq(0b0000)  # L-shift
@@ -90,20 +100,16 @@ class ShiftRotMainStage(PipeModBase):
                 comb += mode.eq(0b0100)  # clear R
             with m.Case(MicrOp.OP_EXTSWSLI):
                 comb += mode.eq(0b1000)  # L-ext
-            with m.Case(MicrOp.OP_TERNLOG):
-                # TODO: this only works for ternaryi, change to get lut value
-                # from register when we implement other variants
-                comb += bitwise_lut.lut.eq(self.fields.FormTLI.TLI[:])
-                comb += o.data.eq(bitwise_lut.output)
-                comb += self.o.xer_ca.data.eq(0)
+            if self.draft_bitmanip:
+                with m.Case(MicrOp.OP_TERNLOG):
+                    # TODO: this only works for ternlogi, change to get lut
+                    # value from register when we implement other variants
+                    comb += bitwise_lut.lut.eq(self.fields.FormTLI.TLI[:])
+                    comb += o.data.eq(bitwise_lut.output)
+                    comb += self.o.xer_ca.data.eq(0)
             with m.Default():
                 comb += o.ok.eq(0)  # otherwise disable
 
-        comb += Cat(rotator.right_shift,
-                    rotator.clear_left,
-                    rotator.clear_right,
-                    rotator.sign_ext_rs).eq(mode)
-
         ###### sticky overflow and context, both pass-through #####
 
         comb += self.o.xer_so.data.eq(self.i.xer_so)
index fd2336dda5f50c9aed5da5558d7f2eb419dab265..d783d017ed3851ea2dbb55b2f56b2e20d2ef66eb 100644 (file)
@@ -4,43 +4,52 @@ from soc.fu.alu.pipe_data import ALUOutputData
 
 
 class ShiftRotInputData(FUBaseData):
-    regspec = [('INT', 'ra', '0:63'),      # RA
-               ('INT', 'rb', '0:63'),      # RB
-               ('INT', 'rc', '0:63'),      # RS
-               ('XER', 'xer_so', '32'), # XER bit 32: SO
-               ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
     def __init__(self, pspec):
         super().__init__(pspec, False)
         # convenience
         self.a, self.b, self.rs = self.ra, self.rb, self.rc
 
+    @property
+    def regspec(self):
+        return [('INT', 'ra', self.intrange),  # RA
+               ('INT', 'rb', self.intrange),  # RB/immediate
+               ('INT', 'rc', self.intrange),  # RB/immediate
+               ('XER', 'xer_so', '32'), # XER bit 32: SO
+               ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
+
 
 # input to shiftrot final stage (common output)
 class ShiftRotOutputData(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),        # RT
-               ('CR', 'cr_a', '0:3'),
-               ('XER', 'xer_so', '32'),    # bit0: so
-               ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
-               ]
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ('XER', 'xer_so', '32'),    # bit0: so
+               ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
+               ]
+
 
 # output from shiftrot final stage (common output) - note that XER.so
 # is *not* included (the only reason it's in the input is because of CR0)
 class ShiftRotOutputDataFinal(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),        # RT
-               ('CR', 'cr_a', '0:3'),
-               ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
-               ]
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
+               ]
+
 
 class ShiftRotPipeSpec(CommonPipeSpec):
-    regspec = (ShiftRotInputData.regspec, ShiftRotOutputDataFinal.regspec)
+    regspecklses = (ShiftRotInputData, ShiftRotOutputDataFinal)
     opsubsetkls = CompSROpSubset
index 80e46038166e89194147b5c6d3a45f818fa417e8..67dc034c61d07a7f37c7ef5a6ec222743abc0221 100644 (file)
@@ -4,11 +4,15 @@ from soc.fu.shift_rot.input_stage import ShiftRotInputStage
 from soc.fu.shift_rot.main_stage import ShiftRotMainStage
 from soc.fu.shift_rot.output_stage import ShiftRotOutputStage
 
-class ShiftRotStages(PipeModBaseChain):
+class ShiftRotStart(PipeModBaseChain):
     def get_chain(self):
         inp = ShiftRotInputStage(self.pspec)
+        return [inp]
+
+class ShiftRotStage(PipeModBaseChain):
+    def get_chain(self):
         main = ShiftRotMainStage(self.pspec)
-        return [inp, main]
+        return [main]
 
 
 class ShiftRotStageEnd(PipeModBaseChain):
@@ -21,13 +25,15 @@ class ShiftRotBasePipe(ControlBase):
     def __init__(self, pspec):
         ControlBase.__init__(self)
         self.pspec = pspec
-        self.pipe1 = ShiftRotStages(pspec)
-        self.pipe2 = ShiftRotStageEnd(pspec)
-        self._eqs = self.connect([self.pipe1, self.pipe2])
+        self.pipe1 = ShiftRotStart(pspec)
+        self.pipe2 = ShiftRotStage(pspec)
+        self.pipe3 = ShiftRotStageEnd(pspec)
+        self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
 
     def elaborate(self, platform):
         m = ControlBase.elaborate(self, platform)
         m.submodules.pipe1 = self.pipe1
         m.submodules.pipe2 = self.pipe2
+        m.submodules.pipe3 = self.pipe3
         m.d.comb += self._eqs
         return m
index 7c3d811c8fa0402a70d5a3e1551e8ecb83873280..eac042fedcece092fec572ed75dd9759f852728e 100644 (file)
@@ -11,18 +11,18 @@ from nmutil.mask import Mask
 
 
 # note BE bit numbering
-def right_mask(m, mask_begin):
-    ret = Signal(64, name="right_mask", reset_less=True)
-    with m.If(mask_begin <= 64):
-        m.d.comb += ret.eq((1 << (64-mask_begin)) - 1)
+def right_mask(m, mask_begin, width):
+    ret = Signal(width, name="right_mask", reset_less=True)
+    with m.If(mask_begin <= width):
+        m.d.comb += ret.eq((1 << (width-mask_begin)) - 1)
     with m.Else():
         m.d.comb += ret.eq(0)
     return ret
 
 
-def left_mask(m, mask_end):
-    ret = Signal(64, name="left_mask", reset_less=True)
-    m.d.comb += ret.eq(~((1 << (63-mask_end)) - 1))
+def left_mask(m, mask_end, width):
+    ret = Signal(width, name="left_mask", reset_less=True)
+    m.d.comb += ret.eq(~((1 << (width-1-mask_end)) - 1))
     return ret
 
 
@@ -45,14 +45,15 @@ class Rotator(Elaboratable):
         * clear_right = 1 when insn_type is OP_RLC or OP_RLCR
     """
 
-    def __init__(self):
+    def __init__(self, width):
+        self.width = width
         # input
         self.me = Signal(5, reset_less=True)        # ME field
         self.mb = Signal(5, reset_less=True)        # MB field
         # extra bit of mb in MD-form
         self.mb_extra = Signal(1, reset_less=True)
-        self.ra = Signal(64, reset_less=True)       # RA
-        self.rs = Signal(64, reset_less=True)       # RS
+        self.ra = Signal(width, reset_less=True)       # RA
+        self.rs = Signal(width, reset_less=True)       # RS
         self.shift = Signal(7, reset_less=True)     # RB[0:7]
         self.is_32bit = Signal(reset_less=True)
         self.right_shift = Signal(reset_less=True)
@@ -61,10 +62,11 @@ class Rotator(Elaboratable):
         self.clear_right = Signal(reset_less=True)
         self.sign_ext_rs = Signal(reset_less=True)
         # output
-        self.result_o = Signal(64, reset_less=True)
+        self.result_o = Signal(width, reset_less=True)
         self.carry_out_o = Signal(reset_less=True)
 
     def elaborate(self, platform):
+        width = self.width
         m = Module()
         comb = m.d.comb
         ra, rs = self.ra, self.rs
@@ -75,11 +77,11 @@ class Rotator(Elaboratable):
         sh = Signal(7, reset_less=True)
         mb = Signal(7, reset_less=True)
         me = Signal(7, reset_less=True)
-        mr = Signal(64, reset_less=True)
-        ml = Signal(64, reset_less=True)
+        mr = Signal(width, reset_less=True)
+        ml = Signal(width, reset_less=True)
         output_mode = Signal(2, reset_less=True)
         hi32 = Signal(32, reset_less=True)
-        repl32 = Signal(64, reset_less=True)
+        repl32 = Signal(width, reset_less=True)
 
         # First replicate bottom 32 bits to both halves if 32-bit
         with m.If(self.is_32bit):
@@ -88,7 +90,8 @@ class Rotator(Elaboratable):
             # sign-extend bottom 32 bits
             comb += hi32.eq(Repl(rs[31], 32))
         with m.Else():
-            comb += hi32.eq(rs[32:64])
+            if width == 64:
+                comb += hi32.eq(rs[32:64])
         comb += repl32.eq(Cat(rs[0:32], hi32))
 
         shift_signed = Signal(signed(6))
@@ -101,7 +104,7 @@ class Rotator(Elaboratable):
             comb += rot_count.eq(self.shift[0:6])
 
         # ROTL submodule
-        m.submodules.rotl = rotl = ROTL(64)
+        m.submodules.rotl = rotl = ROTL(width)
         comb += rotl.a.eq(repl32)
         comb += rotl.b.eq(rot_count)
         comb += rot.eq(rotl.o)
@@ -139,16 +142,16 @@ class Rotator(Elaboratable):
             comb += me.eq(Cat(~sh[0:6], sh[6]))
 
         # Calculate left and right masks
-        m.submodules.right_mask = right_mask = Mask(64)
-        with m.If(mb <= 64):
-            comb += right_mask.shift.eq(64-mb)
+        m.submodules.right_mask = right_mask = Mask(width)
+        with m.If(mb <= width):
+            comb += right_mask.shift.eq(width-mb)
             comb += mr.eq(right_mask.mask)
         with m.Else():
             comb += mr.eq(0)
         #comb += mr.eq(right_mask(m, mb))
 
-        m.submodules.left_mask = left_mask = Mask(64)
-        comb += left_mask.shift.eq(63-me)
+        m.submodules.left_mask = left_mask = Mask(width)
+        comb += left_mask.shift.eq(width-1-me)
         comb += ml.eq(~left_mask.mask)
         #comb += ml.eq(left_mask(m, me))
 
@@ -159,7 +162,8 @@ class Rotator(Elaboratable):
         # 10 for rldicl, sr[wd]
         # 1z for sra[wd][i], z = 1 if rs is negative
         with m.If((self.clear_left & ~self.clear_right) | self.right_shift):
-            comb += output_mode.eq(Cat(self.arith & repl32[63], Const(1, 1)))
+            comb += output_mode.eq(Cat(self.arith &
+                                       repl32[width-1], Const(1, 1)))
         with m.Else():
             mbgt = self.clear_right & (mb[0:6] > me[0:6])
             comb += output_mode.eq(Cat(mbgt, Const(0, 1)))
@@ -186,7 +190,7 @@ if __name__ == '__main__':
     comb = m.d.comb
     mr = Signal(64)
     mb = Signal(6)
-    comb += mr.eq(left_mask(m, mb))
+    comb += mr.eq(left_mask(m, mb, 64))
 
     def loop():
         for i in range(64):
index 27a1d4c495526b22a92246aa20599c2c7d4a24a9..8898224d5a089ff3039a69f8ea3543998cbee67d 100644 (file)
@@ -3,12 +3,13 @@ from nmigen.back.pysim import Simulator, Delay, Settle
 from nmutil.formaltest import FHDLTestCase
 from nmigen.cli import rtlil
 from soc.fu.shift_rot.maskgen import MaskGen
-from openpower.decoder.helpers import MASK
+from openpower.decoder.helpers import ISACallerHelper
 import random
 import unittest
 
 class MaskGenTestCase(FHDLTestCase):
     def test_maskgen(self):
+        MASK = ISACallerHelper(64, FPSCR=None).MASK
         m = Module()
         comb = m.d.comb
         m.submodules.dut = dut = MaskGen(64)
index ea1aba389132f028fbe975a8d9b4d9183b5ad336..cfa1c67492d2f0b7b7b01a0445a3c29f05cbfb66 100644 (file)
@@ -17,6 +17,7 @@ from nmigen import Module, Signal
 from nmutil.sim_tmp_alternative import Simulator, Settle
 
 from openpower.test.shift_rot.shift_rot_cases import ShiftRotTestCase
+from openpower.test.bitmanip.bitmanip_cases import BitManipTestCase
 
 
 def get_cu_inputs(dec2, sim):
@@ -70,7 +71,11 @@ def set_alu_inputs(alu, dec2, sim):
 class ShiftRotIlangCase(TestAccumulatorBase):
 
     def case_ilang(self):
-        pspec = ShiftRotPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=pps)
+        pspec.draft_bitmanip = True
         alu = ShiftRotBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("shift_rot_pipeline.il", "w") as f:
@@ -136,7 +141,11 @@ class TestRunner(unittest.TestCase):
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
         pdecode = pdecode2.dec
 
-        pspec = ShiftRotPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=pps)
+        pspec.draft_bitmanip = True
         m.submodules.alu = alu = ShiftRotBasePipe(pspec)
 
         comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
@@ -174,13 +183,13 @@ class TestRunner(unittest.TestCase):
         yield from ALUHelpers.get_xer_ca(res, alu, dec2)
         yield from ALUHelpers.get_int_o(res, alu, dec2)
 
-        print ("hw outputs", res)
+        print("hw outputs", res)
 
         yield from ALUHelpers.get_sim_int_o(sim_o, sim, dec2)
         yield from ALUHelpers.get_wr_sim_cr_a(sim_o, sim, dec2)
         yield from ALUHelpers.get_wr_sim_xer_ca(sim_o, sim, dec2)
 
-        print ("sim outputs", sim_o)
+        print("sim outputs", sim_o)
 
         ALUHelpers.check_cr_a(self, res, sim_o, "CR%d %s" % (cridx, code))
         ALUHelpers.check_xer_ca(self, res, sim_o, code)
@@ -191,6 +200,7 @@ if __name__ == "__main__":
     unittest.main(exit=False)
     suite = unittest.TestSuite()
     suite.addTest(TestRunner(ShiftRotTestCase().test_data))
+    suite.addTest(TestRunner(BitManipTestCase().test_data))
     suite.addTest(TestRunner(ShiftRotIlangCase().test_data))
 
     runner = unittest.TextTestRunner()
index 1431a0386d595a1252c19c1254d0c050295d748e..db9f86a84ed32947c76101e3dfa270a11685a8f5 100644 (file)
@@ -24,6 +24,8 @@ from openpower.decoder.power_fields import DecodeFields
 from openpower.decoder.power_fieldsn import SignalBitRange
 
 # use POWER numbering. sigh.
+
+
 def xer_bit(name):
     return 63-XER_bits[name]
 
@@ -46,16 +48,16 @@ class Driver(Elaboratable):
             width = p.width
             comb += p.eq(AnyConst(width))
 
-        pspec = SPRPipeSpec(id_wid=2)
+        pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = SPRMainStage(pspec)
 
         # frequently used aliases
         a = dut.i.a
         ca_in = dut.i.xer_ca[0]   # CA carry in
-        ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
+        ca32_in = dut.i.xer_ca[1]  # CA32 carry in 32
         so_in = dut.i.xer_so      # SO sticky overflow
         ov_in = dut.i.xer_ov[0]   # XER OV in
-        ov32_in = dut.i.xer_ov[1] # XER OV32 in
+        ov32_in = dut.i.xer_ov[1]  # XER OV32 in
         o = dut.o.o
 
         # setup random inputs
@@ -71,8 +73,8 @@ class Driver(Elaboratable):
         comb += dut.i.ctx.op.eq(rec)
 
         # check that the operation (op) is passed through (and muxid)
-        comb += Assert(dut.o.ctx.op == dut.i.ctx.op )
-        comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid )
+        comb += Assert(dut.o.ctx.op == dut.i.ctx.op)
+        comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid)
 
         # MTSPR
         fields = DecodeFields(SignalBitRange, [dut.i.ctx.op.insn])
index 6d9d13a6b85985d456da76347c6ebcda69f98dd9..b3a49cb642e9509732eaa3763599180b718a41f9 100644 (file)
@@ -19,7 +19,7 @@ class SPRMainStage(PipeModBase):
         super().__init__(pspec, "spr_main")
         # test if regfiles are reduced
         self.regreduce_en = (hasattr(pspec, "regreduce") and
-                                            (pspec.regreduce == True))
+                             (pspec.regreduce == True))
 
         self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
         self.fields.create_specs()
@@ -44,6 +44,7 @@ class SPRMainStage(PipeModBase):
         so_i, ov_i, ca_i = self.i.xer_so, self.i.xer_ov, self.i.xer_ca
         so_o, ov_o, ca_o = self.o.xer_so, self.o.xer_ov, self.o.xer_ca
         o, spr1_o, fast1_o = self.o.o, self.o.spr1, self.o.fast1
+        state1_i, state1_o = self.i.state1, self.o.state1
 
         # take copy of D-Form TO field
         x_fields = self.fields.FormXFX
@@ -55,9 +56,18 @@ class SPRMainStage(PipeModBase):
             #### MTSPR ####
             with m.Case(MicrOp.OP_MTSPR):
                 with m.Switch(spr):
-                    # fast SPRs first
+                    # State SPRs first, note that this triggers a regfile write
+                    # which is monitored right the way down in TestIssuerBase.
+                    with m.Case(SPR.DEC, SPR.TB):
+                        comb += state1_o.data.eq(a_i)
+                        comb += state1_o.ok.eq(1)
+
+                    # Fast SPRs second: anything in FAST regs
                     with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0,
-                                SPR.SRR1, SPR.XER, SPR.DEC):
+                                SPR.SRR1, SPR.XER, SPR.HSRR0, SPR.HSRR1,
+                                SPR.SPRG0_priv, SPR.SPRG1_priv,
+                                SPR.SPRG2_priv, SPR.SPRG3,
+                                SPR.HSPRG0, SPR.HSPRG1, SPR.SVSRR0):
                         comb += fast1_o.data.eq(a_i)
                         comb += fast1_o.ok.eq(1)
                         # XER is constructed
@@ -83,15 +93,25 @@ class SPRMainStage(PipeModBase):
             with m.Case(MicrOp.OP_MFSPR):
                 comb += o.ok.eq(1)
                 with m.Switch(spr):
-                    # fast SPRs first
-                    with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0, SPR.SRR1,
-                                SPR.XER, SPR.DEC, SPR.TB):
+                    # state SPRs first
+                    with m.Case(SPR.DEC, SPR.TB):
+                        comb += o.data.eq(state1_i)
+                    # TBU is upper 32-bits of State Reg
+                    with m.Case(SPR.TBU):
+                        comb += o.data[0:32].eq(state1_i[32:64])
+
+                    # fast SPRs second
+                    with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0,
+                                SPR.SRR1, SPR.XER, SPR.HSRR0, SPR.HSRR1,
+                                SPR.SPRG0_priv, SPR.SPRG1_priv,
+                                SPR.SPRG2_priv, SPR.SPRG3,
+                                SPR.HSPRG0, SPR.HSPRG1, SPR.SVSRR0):
                         comb += o.data.eq(fast1_i)
                         with m.If(spr == SPR.XER):
                             # bits 0:31 and 35:43 are treated as reserved
                             # and return 0s when read using mfxer
                             comb += o[32:64].eq(0)       # MBS0 bits 0-31
-                            comb += o[63-43:64-35].eq(0) # MSB0 bits 35-43
+                            comb += o[63-43:64-35].eq(0)  # MSB0 bits 35-43
                             # sticky
                             comb += o[63-XER_bits['SO']].eq(so_i)
                             # overflow
@@ -100,9 +120,6 @@ class SPRMainStage(PipeModBase):
                             # carry
                             comb += o[63-XER_bits['CA']].eq(ca_i[0])
                             comb += o[63-XER_bits['CA32']].eq(ca_i[1])
-                    with m.Case(SPR.TBU):
-                        comb += o.data[0:32].eq(fast1_i[32:64])
-
                     # slow SPRs TODO
                     with m.Default():
                         comb += o.data.eq(spr1_i)
index bd0ed97e4e0a2dc4165d8b3e942d6d4575badc84..21db95827ccc53d3f5d67454ef13f5881aff7d51 100644 (file)
@@ -19,6 +19,7 @@ class SPRInputData(FUBaseData):
     regspec = [('INT', 'ra', '0:63'),        # RA
                ('SPR', 'spr1', '0:63'),      # SPR (slow)
                ('FAST', 'fast1', '0:63'),    # SPR (fast: LR, CTR etc)
+               ('STATE', 'state1', '0:63'),  # SPR (DEC/TB)
                ('XER', 'xer_so', '32'),      # XER bit 32: SO
                ('XER', 'xer_ov', '33,44'),   # XER bit 34/45: CA/CA32
                ('XER', 'xer_ca', '34,45')]   # bit0: ov, bit1: ov32
@@ -27,11 +28,16 @@ class SPRInputData(FUBaseData):
         # convenience
         self.a = self.ra
 
+# note that state1 gets a corresponding "state1" write port created
+# by core.py which is "monitored" by TestIssuerBase (hack-job, sigh).
+# when writes are spotted then the DEC/TB FSM resets and re-reads
+# DEC/TB.
 
 class SPROutputData(FUBaseData):
     regspec = [('INT', 'o', '0:63'),        # RT
                ('SPR', 'spr1', '0:63'),     # SPR (slow)
                ('FAST', 'fast1', '0:63'),   # SPR (fast: LR, CTR etc)
+               ('STATE', 'state1', '0:63'), # SPR (DEC/TB)
                ('XER', 'xer_so', '32'),     # XER bit 32: SO
                ('XER', 'xer_ov', '33,44'),  # XER bit 34/45: CA/CA32
                ('XER', 'xer_ca', '34,45')]  # bit0: ov, bit1: ov32
@@ -40,5 +46,5 @@ class SPROutputData(FUBaseData):
 
 
 class SPRPipeSpec(CommonPipeSpec):
-    regspec = (SPRInputData.regspec, SPROutputData.regspec)
+    regspecklses = (SPRInputData, SPROutputData)
     opsubsetkls = CompSPROpSubset
index d6aa34ea6b972ca0b68b34030f74dc0a9212ab47..894212bcb68549221fc8119fdfbc999e165e4b35 100644 (file)
@@ -61,7 +61,7 @@ def set_alu_inputs(alu, dec2, sim):
 
 class SPRIlangCase(TestAccumulatorBase):
     def case_ilang(self):
-        pspec = SPRPipeSpec(id_wid=2)
+        pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
         alu = SPRBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("trap_pipeline.il", "w") as f:
@@ -139,7 +139,7 @@ class TestRunner(unittest.TestCase):
 
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
 
-        pspec = SPRPipeSpec(id_wid=2)
+        pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.alu = alu = SPRBasePipe(pspec)
 
         comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
index 235df615a896928e43c036520acd46e6049858bf..b94f7e732d255cc5aa1a063e012c9edd354a1c79 100644 (file)
@@ -37,7 +37,7 @@ class Driver(Elaboratable):
         comb = m.d.comb
 
         rec = CompTrapOpSubset()
-        pspec = TrapPipeSpec(id_wid=2)
+        pspec = TrapPipeSpec(id_wid=2, parent_pspec=None)
 
         m.submodules.dut = dut = TrapMainStage(pspec)
 
@@ -202,7 +202,7 @@ class Driver(Elaboratable):
             ###################
 
             with m.Case(MicrOp.OP_MTMSRD):
-                msr_od = msr_o.data # another "shortener"
+                msr_od = msr_o.data  # another "shortener"
 
                 with m.If(L == 0):
                     # if (MSR[29:31] != 0b010) | (SRR1[29:31] != 0b000) then
@@ -216,7 +216,7 @@ class Driver(Elaboratable):
                     # MSR[48] <- (RS)[48] | (RS)[49]
                     # MSR[58] <- (RS)[58] | (RS)[49]
                     # MSR[59] <- (RS)[59] | (RS)[49]
-                    PR = field(rs, 49) # alias/copy of SRR1 PR field
+                    PR = field(rs, 49)  # alias/copy of SRR1 PR field
                     comb += [
                         Assert(field(msr_od, 48) == field(rs, 48) | PR),
                         Assert(field(msr_od, 58) == field(rs, 58) | PR),
@@ -263,7 +263,7 @@ class Driver(Elaboratable):
             # RFID.  v3.0B p955
             ###################
             with m.Case(MicrOp.OP_RFID):
-                msr_od = msr_o.data # another "shortener"
+                msr_od = msr_o.data  # another "shortener"
                 comb += [
                     Assert(msr_o.ok),
                     Assert(nia_o.ok),
@@ -280,7 +280,7 @@ class Driver(Elaboratable):
 
                 # if (MSR[29:31] != 0b010) | (SRR1[29:31] != 0b000) then
                 #     MSR[29:31] <- SRR1[29:31]
-                with m.If((field(msr_i , 29, 31) != 0b010) |
+                with m.If((field(msr_i, 29, 31) != 0b010) |
                           (field(srr1_i, 29, 31) != 0b000)):
                     comb += Assert(F(msr_od, 29, 31) == F(srr1_i, 29, 31))
                 with m.Else():
@@ -290,7 +290,7 @@ class Driver(Elaboratable):
                 # MSR[48] <- (RS)[48] | (RS)[49]
                 # MSR[58] <- (RS)[58] | (RS)[49]
                 # MSR[59] <- (RS)[59] | (RS)[49]
-                PR = field(srr1_i, 49) # alias/copy of SRR1 PR field
+                PR = field(srr1_i, 49)  # alias/copy of SRR1 PR field
                 comb += [
                     Assert(field(msr_od, 48) == field(srr1_i, 48) | PR),
                     Assert(field(msr_od, 58) == field(srr1_i, 58) | PR),
@@ -373,4 +373,3 @@ class TrapMainStageTestCase(FHDLTestCase):
 
 if __name__ == '__main__':
     unittest.main()
-
index c597b75e7e01f57375ff20d2fd05cb1f6e8c686e..8127e226e34afaf5cf87489bb86fa04a39a544e1 100644 (file)
@@ -24,7 +24,8 @@ from openpower.consts import MSR, PI, TT, field, field_slice
 
 
 def msr_copy(msr_o, msr_i, zero_me=True):
-    """msr_copy
+    """msr_copy (also used to copy relevant bits into SRR1)
+
     ISA says this:
     Defined MSR bits are classified as either full func tion or partial
     function. Full function MSR bits are saved in SRR1 or HSRR1 when
@@ -42,11 +43,11 @@ def msr_copy(msr_o, msr_i, zero_me=True):
     return l
 
 
-def msr_check_pr(m, msr):
+def msr_check_pr(m, d_in, msr):
     """msr_check_pr: checks "problem state"
     """
     comb = m.d.comb
-    with m.If(msr[MSR.PR]):
+    with m.If(d_in[MSR.PR]):
         comb += msr[MSR.EE].eq(1) # set external interrupt bit
         comb += msr[MSR.IR].eq(1) # set instruction relocation bit
         comb += msr[MSR.DR].eq(1) # set data relocation bit
@@ -57,6 +58,8 @@ class TrapMainStage(PipeModBase):
         super().__init__(pspec, "main")
         self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
         self.fields.create_specs()
+        self.kaivb = Signal(64) # KAIVB SPR
+        self.state_reset = Signal() # raise high to reset KAIVB cache
 
     def trap(self, m, trap_addr, return_addr):
         """trap.  sets new PC, stores MSR and old PC in SRR1 and SRR0
@@ -65,19 +68,33 @@ class TrapMainStage(PipeModBase):
         op = self.i.ctx.op
         msr_i = op.msr
         svstate_i = op.svstate
+
+        exc = LDSTException("trapexc")
+        comb += exc.eq(op.ldst_exc)
+        srr1_i = exc.srr1 # new SRR1 bits come from exception
         nia_o = self.o.nia
         svsrr0_o, srr0_o, srr1_o = self.o.svsrr0, self.o.srr0, self.o.srr1
 
-        # trap address
+        # trap address, including KAIVB override
         comb += nia_o.data.eq(trap_addr)
+        comb += nia_o.data[13:].eq(self.kaivb[13:])
         comb += nia_o.ok.eq(1)
 
         # addr to begin from on return
         comb += srr0_o.data.eq(return_addr)
         comb += srr0_o.ok.eq(1)
 
-        # take a copy of the current MSR into SRR1
-        comb += msr_copy(srr1_o.data, msr_i) # old MSR
+        # take a copy of the current MSR into SRR1, but first copy old SRR1
+        # this preserves the bits of SRR1 that are not supposed to change:
+        # MSR.IR,DR,PMM,RI,LE (0-5) and MR,FP,ME,FE0 (11-14)
+        # i would suggest reading v3.0C p1063 Book III section 7.2.1 for
+        # advice but it's so obscure and indirect, that it's just easier
+        # to copy microwatt behaviour.  see writeback.vhdl
+        # IMPORTANT: PowerDecoder2 needed to actually read SRR1 for
+        # it to have the contents *of* SRR1 to copy over!
+        comb += msr_copy(srr1_o.data, msr_i, False)  # old MSR
+        comb += srr1_o.data[16:22].eq(srr1_i[0:6])   # IR,DR,PMM,RI,LE
+        comb += srr1_o.data[27:31].eq(srr1_i[11:15]) # MR,FP,ME,FE0
         comb += srr1_o.ok.eq(1)
 
         # take a copy of the current SVSTATE into SVSRR0
@@ -125,7 +142,7 @@ class TrapMainStage(PipeModBase):
 
     def elaborate(self, platform):
         m = Module()
-        comb = m.d.comb
+        comb, sync = m.d.comb, m.d.sync
         op = self.i.ctx.op
 
         # convenience variables
@@ -137,6 +154,10 @@ class TrapMainStage(PipeModBase):
         srr0_o, srr1_o, svsrr0_o = self.o.srr0, self.o.srr1, self.o.svsrr0
         traptype, trapaddr = op.traptype, op.trapaddr
 
+        # hard reset of KAIVB
+        with m.If(self.state_reset):
+            sync += self.kaivb.eq(0)
+
         # take copy of D-Form TO field
         i_fields = self.fields.FormD
         to = Signal(i_fields.TO[0:-1].shape())
@@ -187,6 +208,16 @@ class TrapMainStage(PipeModBase):
         # TODO: some #defines for the bits n stuff.
         with m.Switch(op.insn_type):
 
+            ##############
+            # KAIVB https://bugs.libre-soc.org/show_bug.cgi?id=859
+
+            with m.Case(MicrOp.OP_MTSPR):
+                sync += self.kaivb.eq(a_i)
+
+            with m.Case(MicrOp.OP_MFSPR):
+                comb += o.data.eq(self.kaivb)
+                comb += o.ok.eq(1)
+
             ###############
             # TDI/TWI/TD/TW.  v3.0B p90-91
 
@@ -204,7 +235,10 @@ class TrapMainStage(PipeModBase):
                         comb += srr1_o.data[PI.FP].eq(1)
                     with m.If(traptype & TT.ADDR):
                         comb += srr1_o.data[PI.ADR].eq(1)
-                    with m.If(traptype & TT.MEMEXC):
+                    with m.If((traptype & TT.MEMEXC).bool() &
+                              (trapaddr == 0x400)):
+                        # Instruction Storage Interrupt (ISI - 0x400)
+                        #           v3.0C Book III Chap 7.5.5 p1085
                         # decode exception bits, store in SRR1
                         exc = LDSTException("trapexc")
                         comb += exc.eq(op.ldst_exc)
@@ -233,9 +267,10 @@ class TrapMainStage(PipeModBase):
             # MTMSR/D.  v3.0B p TODO - move to MSR
 
             with m.Case(MicrOp.OP_MTMSRD, MicrOp.OP_MTMSR):
-                L = self.fields.FormX.L[0:-1] # X-Form field L
+                # L => bit 16 in LSB0, bit 15 in MSB0 order
+                L = self.fields.FormX.L1[0:1] # X-Form field L1
                 # start with copy of msr
-                comb += msr_o.eq(msr_i)
+                comb += msr_o.data.eq(msr_i)
                 with m.If(L):
                     # just update RI..EE
                     comb += msr_o.data[MSR.RI].eq(a_i[MSR.RI])
@@ -257,7 +292,8 @@ class TrapMainStage(PipeModBase):
                         # mtmsr - 32-bit, only room for bottom 32 LSB flags
                         for stt, end in [(1,12), (13, 32)]:
                             comb += msr_o.data[stt:end].eq(a_i[stt:end])
-                    msr_check_pr(m, msr_o.data)
+                    # check problem state: if set, not permitted to set EE,IR,DR
+                    msr_check_pr(m, a_i, msr_o.data)
 
                 # Per https://bugs.libre-soc.org/show_bug.cgi?id=325#c123,
                 # this actually *is* in the microwatt code now.
@@ -265,9 +301,13 @@ class TrapMainStage(PipeModBase):
                 # hypervisor stuff.  here: bits 3 (HV) and 51 (ME) were
                 # copied over by msr_copy but if HV was not set we need
                 # the *original* (msr_i) bits
-                with m.If(~msr_i[MSR.HV]):
-                    comb += msr_o.data[MSR.HV].eq(msr_i[MSR.HV])
-                    comb += msr_o.data[MSR.ME].eq(msr_i[MSR.ME])
+                # XXX taking this out to see what happens when running
+                # linux-5.7 microwatt buildroot.  microwatt does not
+                # implement HV, so this is unlikely to work.  0x900
+                # linux kernel exception handling tends to support this
+                # with m.If(~msr_i[MSR.HV]):
+                #     comb += msr_o.data[MSR.HV].eq(msr_i[MSR.HV])
+                #     comb += msr_o.data[MSR.ME].eq(msr_i[MSR.ME])
 
                 comb += msr_o.ok.eq(1)
 
@@ -295,14 +335,18 @@ class TrapMainStage(PipeModBase):
                 # MSR was in srr1: copy it over, however *caveats below*
                 comb += msr_copy(msr_o.data, srr1_i, zero_me=False) # don't zero
 
-                with m.If(~self.i.ctx.op.insn[9]): # XXX BAD HACK! (hrfid)
-                    with m.If(field(msr_i, 3)): # HV
-                        comb += field(msr_o, 51).eq(field(srr1_i, 51)) # ME
-                    with m.Else():
-                        comb += field(msr_o, 51).eq(field(msr_i, 51)) # ME
-
-                # check problem state
-                msr_check_pr(m, msr_o.data)
+                if False: # XXX no - not doing hypervisor yet
+                    with m.If(~self.i.ctx.op.insn[9]): # XXX BAD HACK! (hrfid)
+                        with m.If(field(msr_i, 3)): # HV
+                            comb += field(msr_o.data, 51).eq(field(srr1_i, 51)) # ME
+                        with m.Else():
+                            comb += field(msr_o.data, 51).eq(field(msr_i, 51)) # ME
+                else:
+                    # same as microwatt: treat MSR.ME rfid same as hrfid
+                    comb += field(msr_o.data, 51).eq(field(srr1_i, 51)) # ME
+
+                # check problem state: if set, not permitted to set EE,IR,DR
+                msr_check_pr(m, srr1_i, msr_o.data)
 
                 # don't understand but it's in the spec.  again: bits 32-34
                 # are copied from srr1_i and need *restoring* to msr_i
index 93a135b81c3056292338bcd65f263897a5e468dc..b9c829bccc1811a1e7e334aba22fc3b09e7a907d 100644 (file)
@@ -36,5 +36,5 @@ class TrapOutputData(FUBaseData):
 
 
 class TrapPipeSpec(CommonPipeSpec):
-    regspec = (TrapInputData.regspec, TrapOutputData.regspec)
+    regspecklses = (TrapInputData, TrapOutputData)
     opsubsetkls = CompTrapOpSubset
index a634bc0570784ed31eb4f77059f2936fe5a67b8c..dff1f4139db5b9c78fdaeb962dfffb75b8ade168 100644 (file)
@@ -66,7 +66,7 @@ def set_alu_inputs(alu, dec2, sim):
 class TrapIlangCase(TestAccumulatorBase):
 
     def case_ilang(self):
-        pspec = TrapPipeSpec(id_wid=2)
+        pspec = TrapPipeSpec(id_wid=2, parent_pspec=None)
         alu = TrapBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("trap_pipeline.il", "w") as f:
@@ -74,24 +74,86 @@ class TrapIlangCase(TestAccumulatorBase):
 
 
 class TestRunner(unittest.TestCase):
-    def __init__(self, test_data):
-        super().__init__("run_all")
-        self.test_data = test_data
 
-    def run_all(self):
+    def execute(self, alu, instruction, pdecode2, test):
+        program = test.program
+        sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
+                  test.mem, test.msr,
+                  bigendian=bigendian)
+        gen = program.generate_instructions()
+        instructions = list(zip(gen, program.assembly.splitlines()))
+
+        msr = sim.msr.value
+        pc = sim.pc.CIA.value
+        print("starting msr, pc %08x, %08x" % (msr, pc))
+        index = pc//4
+        while index < len(instructions):
+            ins, code = instructions[index]
+
+            print("pc %08x msr %08x instr: %08x" % (pc, msr, ins))
+            print(code)
+            if 'XER' in sim.spr:
+                so = 1 if sim.spr['XER'][XER_bits['SO']] else 0
+                ov = 1 if sim.spr['XER'][XER_bits['OV']] else 0
+                ov32 = 1 if sim.spr['XER'][XER_bits['OV32']] else 0
+                print("before: so/ov/32", so, ov, ov32)
+
+            # ask the decoder to decode this binary data (endian'd)
+            yield pdecode2.dec.bigendian.eq(bigendian)  # l/big?
+            yield pdecode2.state.msr.eq(msr)  # set MSR in pdecode2
+            yield pdecode2.state.pc.eq(pc)  # set CIA in pdecode2
+            yield instruction.eq(ins)          # raw binary instr.
+            yield Settle()
+            fn_unit = yield pdecode2.e.do.fn_unit
+            asmcode = yield pdecode2.e.asmcode
+            dec_asmcode = yield pdecode2.dec.op.asmcode
+            print("asmcode", asmcode, dec_asmcode)
+            self.assertEqual(fn_unit, Function.TRAP.value)
+            alu_o = yield from set_alu_inputs(alu, pdecode2, sim)
+
+            # set valid for one cycle, propagate through pipeline...
+            yield alu.p.i_valid.eq(1)
+            yield
+            yield alu.p.i_valid.eq(0)
+
+            opname = code.split(' ')[0]
+            yield from sim.call(opname)
+            pc = sim.pc.CIA.value
+            index = pc//4
+            print("pc after %08x" % (pc))
+            msr = sim.msr.value
+            print("msr after %08x" % (msr))
+
+            vld = yield alu.n.o_valid
+            while not vld:
+                yield
+                vld = yield alu.n.o_valid
+            yield
+
+            yield from self.check_alu_outputs(alu, pdecode2, sim, code)
+            yield Settle()
+
+    def test_it(self):
+        test_data = TrapTestCase().test_data
         m = Module()
         comb = m.d.comb
         instruction = Signal(32)
 
-        pdecode = create_pdecode()
-
-        m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
+        fn_name = "TRAP"
+        opkls = TrapPipeSpec.opsubsetkls
 
-        pspec = TrapPipeSpec(id_wid=2)
+        pdecode = create_pdecode()
+        m.submodules.pdecode2 = pdecode2 = PowerDecode2(
+            pdecode, opkls, fn_name)
+        pdecode = pdecode2.dec
+
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = TrapPipeSpec(id_wid=2, parent_pspec=pps)
         m.submodules.alu = alu = TrapBasePipe(pspec)
 
         comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
-        comb += alu.p.i_valid.eq(1)
         comb += alu.n.i_ready.eq(1)
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
@@ -99,57 +161,11 @@ class TestRunner(unittest.TestCase):
         sim.add_clock(1e-6)
 
         def process():
-            for test in self.test_data:
+            for test in test_data:
                 print(test.name)
                 program = test.program
                 with self.subTest(test.name):
-                    sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
-                              test.mem, test.msr,
-                              bigendian=bigendian)
-                    gen = program.generate_instructions()
-                    instructions = list(zip(gen, program.assembly.splitlines()))
-
-                    msr = sim.msr.value
-                    pc = sim.pc.CIA.value
-                    print("starting msr, pc %08x, %08x" % (msr, pc))
-                    index = pc//4
-                    while index < len(instructions):
-                        ins, code = instructions[index]
-
-                        print("pc %08x msr %08x instr: %08x" % (pc, msr, ins))
-                        print(code)
-                        if 'XER' in sim.spr:
-                            so = 1 if sim.spr['XER'][XER_bits['SO']] else 0
-                            ov = 1 if sim.spr['XER'][XER_bits['OV']] else 0
-                            ov32 = 1 if sim.spr['XER'][XER_bits['OV32']] else 0
-                            print("before: so/ov/32", so, ov, ov32)
-
-                        # ask the decoder to decode this binary data (endian'd)
-                        yield pdecode2.dec.bigendian.eq(bigendian)  # l/big?
-                        yield pdecode2.state.msr.eq(msr)  # set MSR in pdecode2
-                        yield pdecode2.state.pc.eq(pc)  # set CIA in pdecode2
-                        yield instruction.eq(ins)          # raw binary instr.
-                        yield Settle()
-                        fn_unit = yield pdecode2.e.do.fn_unit
-                        self.assertEqual(fn_unit, Function.TRAP.value)
-                        alu_o = yield from set_alu_inputs(alu, pdecode2, sim)
-                        yield
-                        opname = code.split(' ')[0]
-                        yield from sim.call(opname)
-                        pc = sim.pc.CIA.value
-                        index = pc//4
-                        print("pc after %08x" % (pc))
-                        msr = sim.msr.value
-                        print("msr after %08x" % (msr))
-
-                        vld = yield alu.n.o_valid
-                        while not vld:
-                            yield
-                            vld = yield alu.n.o_valid
-                        yield
-
-                        yield from self.check_alu_outputs(alu, pdecode2,
-                                                          sim, code)
+                    yield from self.execute(alu, instruction, pdecode2, test)
 
         sim.add_sync_process(process)
         with sim.write_vcd("alu_simulator.vcd", "simulator.gtkw",
@@ -158,14 +174,6 @@ class TestRunner(unittest.TestCase):
 
     def check_alu_outputs(self, alu, dec2, sim, code):
 
-        rc = yield dec2.e.do.rc.data
-        cridx_ok = yield dec2.e.write_cr.ok
-        cridx = yield dec2.e.write_cr.data
-
-        print("check extra output", repr(code), cridx_ok, cridx)
-        if rc:
-            self.assertEqual(cridx, 0, code)
-
         sim_o = {}
         res = {}
 
@@ -196,10 +204,4 @@ class TestRunner(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(exit=False)
-    suite = unittest.TestSuite()
-    suite.addTest(TestRunner(TrapTestCase().test_data))
-    suite.addTest(TestRunner(TrapIlangCase().test_data))
-
-    runner = unittest.TextTestRunner()
-    runner.run(suite)
+    unittest.main()
index 521ab590be1461051ea4ae9f4b265d00cd54ec82..107bc0f4c7e8d5f5f0275f7062c0681cf531eb2c 100644 (file)
@@ -20,7 +20,7 @@ class CompTrapOpSubset(CompOpSubsetBase):
                   ('is_32bit', 1),
                   ('traptype', TT.size), # see trap main_stage.py, PowerDecoder2
                   ('trapaddr', 13),
-                  ('ldst_exc', len(LDSTException._exc_types)),
+                  ('ldst_exc', LDSTException.length), # blech
                   ]
 
         super().__init__(layout, name=name)
index ede33a1b03913307f33ce8080652d315c0403ce9..a5ed8f0d7338a0fcbb70cceb7eac417dd235c804 100644 (file)
@@ -16,6 +16,9 @@
 # highest priority interrupt currently presented (which is allowed
 # via XICS)
 #
+# Bugreports:
+#
+# * https://bugs.libre-soc.org/show_bug.cgi?id=407
 """
 from nmigen import Elaboratable, Module, Signal, Cat, Const, Record, Array, Mux
 from nmutil.iocontrol import RecordObject
@@ -72,9 +75,10 @@ def bswap(v):
 
 class XICS_ICP(Elaboratable):
 
-    def __init__(self):
-        class Spec: pass
-        spec = Spec()
+    def __init__(self, spec=None):
+        if spec is None:
+            class Spec: pass
+            spec = Spec()
         spec.addr_wid = 30
         spec.mask_wid = 4
         spec.reg_wid = 32
@@ -223,12 +227,13 @@ class Xive(RecordObject):
 
 
 class XICS_ICS(Elaboratable):
-    def __init__(self, SRC_NUM=16, PRIO_BITS=8):
+    def __init__(self, spec=None, SRC_NUM=16, PRIO_BITS=8):
         self.SRC_NUM = SRC_NUM
         self.PRIO_BITS = PRIO_BITS
         self.pri_masked = (1<<self.PRIO_BITS)-1
-        class Spec: pass
-        spec = Spec()
+        if spec is None:
+            class Spec: pass
+            spec = Spec()
         spec.addr_wid = 30
         spec.mask_wid = 4
         spec.reg_wid = 32
index b55917aafa6bbc9f16e1d97dc095e929c31aa81a..0f03df1546c8cf6ab91ef63b04713dca768a84c4 160000 (submodule)
@@ -1 +1 @@
-Subproject commit b55917aafa6bbc9f16e1d97dc095e929c31aa81a
+Subproject commit 0f03df1546c8cf6ab91ef63b04713dca768a84c4
index f84a01ccc2f3b98f57ffe16c37cb5a2c206cf24b..f249d5547330b632d0499f7917edca5dc15e259b 100644 (file)
@@ -18,6 +18,12 @@ def make_wb_layout(spec, cti=True):
     addr_wid, mask_wid, data_wid = spec.addr_wid, spec.mask_wid, spec.reg_wid
     adr_lsbs = log2_int(mask_wid) # LSBs of addr covered by mask
     badwid = spec.addr_wid-adr_lsbs    # MSBs (not covered by mask)
+    # test if microwatt compatibility is to be enabled
+    microwatt_compat = (hasattr(spec, "microwatt_compat") and
+                               (spec.microwatt_compat == True))
+    # test if fabric compatibility is to be enabled
+    fabric_compat = (hasattr(spec, "fabric_compat") and
+                               (spec.fabric_compat == True))
 
     res = [
     ("adr",   badwid  , DIR_FANOUT),
@@ -30,6 +36,9 @@ def make_wb_layout(spec, cti=True):
     ("we",            1, DIR_FANOUT),
     ("err",           1, DIR_FANIN)
     ]
+    # microwatt needs a stall signal (operates in pipeline mode)
+    if microwatt_compat or fabric_compat:
+        res.append(("stall", 1, DIR_FANIN))
     if not cti:
         return res
     return res + [
index c3f33393bde72951b27aa72664795c572913a7d0..2427a680a94ad5f7dac71b013579dba05bfea27c 100644 (file)
@@ -56,7 +56,8 @@ class Register(Elaboratable):
 
     def elaborate(self, platform):
         m = Module()
-        self.reg = reg = Signal(self.width, name="reg", reset=self.reset)
+        self.reg = reg = Signal(self.width, name="reg", reset=self.reset,
+                                attrs={'syn_ramstyle': "block_ram"})
 
         if self.synced:
             domain = m.d.sync
@@ -107,13 +108,17 @@ class RegFileArray(Elaboratable):
         and read-en signals (per port).
     """
 
-    def __init__(self, width, depth, synced=True, fwd_bus_mode=True):
+    def __init__(self, width, depth, synced=True, fwd_bus_mode=True,
+                                     resets=None):
+        if resets is None:
+            resets = [0] * depth
         self.synced = synced
         self.width = width
         self.depth = depth
         self.regs = Array(Register(width, synced=synced,
-                                   writethru=fwd_bus_mode) \
-                          for _ in range(self.depth))
+                                   writethru=fwd_bus_mode,
+                                   resetval=rst) \
+                          for rst in resets)
         self._rdports = []
         self._wrports = []
 
@@ -195,7 +200,8 @@ class RegFileMem(Elaboratable):
         self.fwd_bus_mode = fwd_bus_mode
         self.synced = synced
         self.width, self.depth = width, depth
-        self.memory = Memory(width=width, depth=depth)
+        self.memory = Memory(width=width, depth=depth,
+                             attrs={'syn_ramstyle': "block_ram"})
         self._rdports = {}
         self._wrports = {}
 
@@ -285,7 +291,9 @@ class RegFile(Elaboratable):
     def elaborate(self, platform):
         m = Module()
         bsz = int(log(self.width) / log(2))
-        regs = Array(Signal(self.width, name="reg") for _ in range(self.depth))
+        regs = Array(Signal(self.width, name="reg",
+                            attrs={'syn_ramstyle': "block_ram"}) \
+                    for _ in range(self.depth))
 
         # read ports. has write-through detection (returns data written)
         for rp in self._rdports:
index 28f8172d74774bdc8c0a95a4406a21520256f7d6..5ef301a8bf8bdbda55ad86131fd5f54ea66b5fe8 100644 (file)
@@ -70,20 +70,36 @@ class StateRegs(RegFileArray, StateRegsEnum):
     (d_rd2)
 
     """
-    def __init__(self, svp64_en=False, regreduce_en=False):
-        super().__init__(64, StateRegsEnum.N_REGS)
+    def __init__(self, svp64_en=False, regreduce_en=False, resets=None):
+        super().__init__(64, StateRegsEnum.N_REGS, resets=resets)
         wr_spec, rd_spec = self.get_port_specs()
         create_ports(self, wr_spec, rd_spec)
 
     def get_port_specs(self):
-        w_port_spec = {'nia': "nia",
+        w_port_spec = { # these 3 allow writing state by Function Units
+                        # strictly speaking this should not be allowed,
+                        # the information should be passed back to Issuer
+                        # to work out what to do
+                        'nia': "nia",
                         'msr': "msr",
                         'svstate': "svstate",
-                        'sv': "sv", # writing SVSTATE (issuer)
-                        'd_wr1': "d_wr1"} # writing PC (issuer)
-        r_port_spec = {'cia': "cia", # reading PC (issuer)
+                        'issue': "issue", # writing DEC/TB
+                        'state1': "state1", # SPR pipeline
+                        # these 3 allow writing state by Issuer
+                        'sv': "sv", # writing SVSTATE
+                        'd_wr1': "d_wr1", # writing PC
+                        'd_wr2': "d_wr2"} # writing MSR
+        r_port_spec = { # these are for reading state by Issuer but
+                        # the FUs do not read them: they are passed in
+                        # because of multi-issue / pipelining / etc.
+                        # the state could be totally different and is
+                        # only known *at* issue time, *by* the issuer
+                        'cia': "cia", # reading PC (issuer)
                         'msr': "msr", # reading MSR (issuer)
                         'sv': "sv", # reading SV (issuer)
+                        # SPR and DEC/TB FSM
+                        'issue': "issue", # reading DEC/TB
+                        'state1': "state1", # SPR pipeline
                         }
         return w_port_spec, r_port_spec
 
@@ -97,8 +113,8 @@ class IntRegs(RegFileMem): #class IntRegs(RegFileArray):
     * Array-based unary-indexed (not binary-indexed)
     * write-through capability (read on same cycle as write)
     """
-    def __init__(self, svp64_en=False, regreduce_en=False):
-        super().__init__(64, 32, fwd_bus_mode=False)
+    def __init__(self, svp64_en=False, regreduce_en=False, reg_wid=64):
+        super().__init__(reg_wid, 32, fwd_bus_mode=False)
         self.svp64_en = svp64_en
         self.regreduce_en = regreduce_en
         wr_spec, rd_spec = self.get_port_specs()
@@ -125,7 +141,7 @@ class IntRegs(RegFileMem): #class IntRegs(RegFileArray):
 class FastRegs(RegFileMem, FastRegsEnum): #RegFileArray):
     """FastRegs
 
-    FAST regfile  - CTR, LR, TAR, SRR1, SRR2, XER, TB, DEC, SVSRR0
+    FAST regfile  - CTR, LR, TAR, SRR1, SRR2, XER, SVSRR0
 
     * QTY 6of 64-bit registers
     * 3R2W
@@ -143,10 +159,9 @@ class FastRegs(RegFileMem, FastRegsEnum): #RegFileArray):
 
     def get_port_specs(self):
         w_port_spec = {'fast1': "dest1",
-                       'issue': "issue", # writing DEC/TB
                        }
         r_port_spec = {'fast1': "src1",
-                       'issue': "issue", # reading DEC/TB
+                        'dmi': "dmi" # needed for Debug (DMI)
                         }
         if not self.regreduce_en:
             r_port_spec['fast2'] = "src2"
@@ -255,7 +270,8 @@ class RegFiles:
               ('fast', FastRegs),
               ('state', StateRegs),
               ('spr', SPRRegs),]
-    def __init__(self, pspec, make_hazard_vecs=False):
+    def __init__(self, pspec, make_hazard_vecs=False,
+                      state_resets=None): # state file reset values
         # test is SVP64 is to be enabled
         svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)
 
@@ -263,10 +279,20 @@ class RegFiles:
         regreduce_en = hasattr(pspec, "regreduce") and \
                       (pspec.regreduce == True)
 
+        # get Integer File register width
+        reg_wid = 64
+        if isinstance(pspec.XLEN, int):
+            reg_wid = pspec.XLEN
+
         self.rf = {} # register file dict
         # create regfiles here, Factory style
         for (name, kls) in RegFiles.regkls:
-            rf = self.rf[name] = kls(svp64_en, regreduce_en)
+            kwargs = {'svp64_en': svp64_en, 'regreduce_en': regreduce_en}
+            if name == 'state':
+                kwargs['resets'] = state_resets
+            if name == 'int':
+                kwargs['reg_wid'] = reg_wid
+            rf = self.rf[name] = kls(**kwargs)
             # also add these as instances, self.state, self.fast, self.cr etc.
             setattr(self, name, rf)
 
@@ -303,7 +329,8 @@ class RegFiles:
 if __name__ == '__main__':
     m = Module()
     from soc.config.test.test_loadstore import TestMemPspec
-    pspec = TestMemPspec()
+    pspec = TestMemPspec(regreduce_en=True,
+                         XLEN=32) # integer reg width = 32
     rf = RegFiles(pspec, make_hazard_vecs=True)
     rf.elaborate_into(m, None)
     vl = rtlil.convert(m)
diff --git a/src/soc/regfile/sram_wrapper.py b/src/soc/regfile/sram_wrapper.py
new file mode 100644 (file)
index 0000000..e4223f5
--- /dev/null
@@ -0,0 +1,1472 @@
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Cesar Strauss <cestrauss@gmail.com>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+
+"""
+Wrapper around a single port (1R or 1W) SRAM, to make a multi-port regfile.
+
+This SRAM primitive has one cycle delay for reads, and, after a write,
+it reads the value just written. The goal is to use it to make at least an
+1W2R regfile.
+
+See https://bugs.libre-soc.org/show_bug.cgi?id=781 and
+https://bugs.libre-soc.org/show_bug.cgi?id=502
+"""
+
+import unittest
+
+from nmigen import Elaboratable, Module, Memory, Signal, Repl, Mux
+from nmigen.back import rtlil
+from nmigen.sim import Simulator
+from nmigen.asserts import Assert, Assume, Past, AnyConst
+
+from nmutil.formaltest import FHDLTestCase
+from nmutil.gtkw import write_gtkw
+
+
+class SinglePortSRAM(Elaboratable):
+    """
+    Model of a single port SRAM, which can be simulated, verified and/or
+    synthesized to an FPGA.
+
+    :param addr_width: width of the address bus
+    :param data_width: width of the data bus
+    :param we_width: number of write enable lines
+
+    .. note:: The debug read port is meant only to assist in formal proofs!
+    """
+    def __init__(self, addr_width, data_width, we_width):
+        self.addr_width = addr_width
+        self.data_width = data_width
+        self.we_width = we_width
+        # interface signals
+        self.d = Signal(data_width); """ write data"""
+        self.q = Signal(data_width); """read data"""
+        self.a = Signal(addr_width); """ read/write address"""
+        self.we = Signal(we_width); """write enable"""
+        # debug signals, only used in formal proofs
+        self.dbg_addr = Signal(addr_width); """debug: address under test"""
+        lanes = range(we_width)
+        self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+        gran = self.data_width // self.we_width
+        self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+        self.dbg_wrote = Signal(); """debug: data is valid"""
+
+    def elaborate(self, platform):
+        m = Module()
+        # backing memory
+        depth = 1 << self.addr_width
+        gran = self.data_width // self.we_width
+        mem = Memory(width=self.data_width, depth=depth)
+        # create read and write ports
+        # By connecting the same address to both ports, they behave, in fact,
+        # as a single, "half-duplex" port.
+        # The transparent attribute means that, on a write, we read the new
+        # value, on the next cycle
+        # Note that nmigen memories have a one cycle delay, for reads,
+        # by default
+        m.submodules.rdport = rdport = mem.read_port(transparent=True)
+        m.submodules.wrport = wrport = mem.write_port(granularity=gran)
+        # duplicate the address to both ports
+        m.d.comb += wrport.addr.eq(self.a)
+        m.d.comb += rdport.addr.eq(self.a)
+        # write enable
+        m.d.comb += wrport.en.eq(self.we)
+        # read and write data
+        m.d.comb += wrport.data.eq(self.d)
+        m.d.comb += self.q.eq(rdport.data)
+
+        # the following is needed for induction, where an unreachable state
+        # (memory and holding register differ) is turned into an illegal one
+        if platform == "formal":
+            # the debug port is an asynchronous read port, allowing direct
+            # access to a given memory location by the formal engine
+            m.submodules.dbgport = dbgport = mem.read_port(domain="comb")
+            # first, get the value stored in our memory location,
+            # using its debug port
+            stored = Signal(self.data_width)
+            m.d.comb += dbgport.addr.eq(self.dbg_addr)
+            m.d.comb += stored.eq(dbgport.data)
+            # now, ensure that the value stored in memory is always in sync
+            # with the holding register
+            with m.If(self.dbg_wrote):
+                m.d.sync += Assert(self.dbg_data ==
+                                   stored.word_select(self.dbg_lane, gran))
+
+        return m
+
+    def ports(self):
+        return [
+            self.d,
+            self.a,
+            self.we,
+            self.q
+        ]
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+
+class SinglePortSRAMTestCase(FHDLTestCase):
+    @staticmethod
+    def test_simple_rtlil():
+        """
+        Generate a simple SRAM. Try ``read_rtlil mem_simple.il; proc; show``
+        from a yosys prompt, to see the memory primitives, and
+        ``read_rtlil mem_simple.il; synth; show`` to see it implemented as
+        flip-flop RAM
+        """
+        dut = SinglePortSRAM(2, 4, 2)
+        create_ilang(dut, dut.ports(), "mem_simple")
+
+    @staticmethod
+    def test_blkram_rtlil():
+        """
+        Generates a bigger SRAM.
+        Try ``read_rtlil mem_blkram.il; synth_ecp5; show`` from a yosys
+        prompt, to see it implemented as block RAM
+        """
+        dut = SinglePortSRAM(10, 16, 2)
+        create_ilang(dut, dut.ports(), "mem_blkram")
+
+    def test_sram_model(self):
+        """
+        Simulate some read/write/modify operations on the SRAM model
+        """
+        dut = SinglePortSRAM(7, 32, 4)
+        sim = Simulator(dut)
+        sim.add_clock(1e-6)
+
+        def process():
+            # 1) write 0x12_34_56_78 to address 0
+            yield dut.a.eq(0)
+            yield dut.d.eq(0x12_34_56_78)
+            yield dut.we.eq(0b1111)
+            yield
+            # 2) write 0x9A_BC_DE_F0 to address 1
+            yield dut.a.eq(1)
+            yield dut.d.eq(0x9A_BC_DE_F0)
+            yield dut.we.eq(0b1111)
+            yield
+            # ... and read value just written to address 0
+            self.assertEqual((yield dut.q), 0x12_34_56_78)
+            # 3) prepare to read from address 0
+            yield dut.d.eq(0)
+            yield dut.we.eq(0b0000)
+            yield dut.a.eq(0)
+            yield
+            # ... and read value just written to address 1
+            self.assertEqual((yield dut.q), 0x9A_BC_DE_F0)
+            # 4) prepare to read from address 1
+            yield dut.a.eq(1)
+            yield
+            # ... and read value from address 0
+            self.assertEqual((yield dut.q), 0x12_34_56_78)
+            # 5) write 0x9A and 0xDE to bytes 1 and 3, leaving
+            # bytes 0 and 2 unchanged
+            yield dut.a.eq(0)
+            yield dut.d.eq(0x9A_FF_DE_FF)
+            yield dut.we.eq(0b1010)
+            yield
+            # ... and read value from address 1
+            self.assertEqual((yield dut.q), 0x9A_BC_DE_F0)
+            # 6) nothing more to do
+            yield dut.d.eq(0)
+            yield dut.we.eq(0)
+            yield
+            # ... other than confirm that bytes 1 and 3 were modified
+            # correctly
+            self.assertEqual((yield dut.q), 0x9A_34_DE_78)
+
+        sim.add_sync_process(process)
+        traces = ['rdport.clk', 'a[6:0]', 'we[3:0]', 'd[31:0]', 'q[31:0]']
+        write_gtkw('test_sram_model.gtkw', 'test_sram_model.vcd',
+                   traces, module='top')
+        sim_writer = sim.write_vcd('test_sram_model.vcd')
+        with sim_writer:
+            sim.run()
+
+    def test_model_sram_proof(self):
+        """
+        Formal proof of the single port SRAM model
+        """
+        m = Module()
+        # 128 x 32-bit, 8-bit granularity
+        m.submodules.dut = dut = SinglePortSRAM(7, 32, 4)
+        gran = len(dut.d) // len(dut.we)  # granularity
+        # choose a single random memory location to test
+        a_const = AnyConst(dut.a.shape())
+        # choose a single byte lane to test
+        lane = AnyConst(range(dut.we_width))
+        # holding data register
+        d_reg = Signal(gran)
+        # for some reason, simulated formal memory is not zeroed at reset
+        # ... so, remember whether we wrote it, at least once.
+        wrote = Signal()
+        # if our memory location and byte lane is being written
+        # ... capture the data in our holding register
+        with m.If((dut.a == a_const) & dut.we.bit_select(lane, 1)):
+            m.d.sync += d_reg.eq(dut.d.word_select(lane, gran))
+            m.d.sync += wrote.eq(1)
+        # if our memory location is being read
+        # ... and the holding register has valid data
+        # ... then its value must match the memory output, on the given lane
+        with m.If((Past(dut.a) == a_const) & wrote):
+            m.d.sync += Assert(d_reg == dut.q.word_select(lane, gran))
+
+        # pass our state to the device under test, so it can ensure that
+        # its state is in sync with ours, for induction
+        m.d.comb += [
+            dut.dbg_addr.eq(a_const),
+            dut.dbg_lane.eq(lane),
+            dut.dbg_data.eq(d_reg),
+            dut.dbg_wrote.eq(wrote),
+        ]
+
+        self.assertFormal(m, mode="prove", depth=2)
+
+
+class PhasedDualPortRegfile(Elaboratable):
+    """
+    Builds, from a pair of 1RW blocks, a pseudo 1W/1R RAM, where the
+    read port works every cycle, but the write port is only available on
+    either even (1eW/1R) or odd (1oW/1R) cycles.
+
+    :param addr_width: width of the address bus
+    :param data_width: width of the data bus
+    :param we_width: number of write enable lines
+    :param write_phase: indicates on which phase the write port will
+                        accept data
+    :param transparent: whether a simultaneous read and write returns the
+                        new value (True) or the old value (False)
+
+    .. note:: The debug read port is meant only to assist in formal proofs!
+    """
+
+    def __init__(self, addr_width, data_width, we_width, write_phase,
+                 transparent=False):
+        self.addr_width = addr_width
+        self.data_width = data_width
+        self.we_width = we_width
+        self.write_phase = write_phase
+        self.transparent = transparent
+        # interface signals
+        self.wr_addr_i = Signal(addr_width); """write port address"""
+        self.wr_data_i = Signal(data_width); """write port data"""
+        self.wr_we_i = Signal(we_width); """write port enable"""
+        self.rd_addr_i = Signal(addr_width); """read port address"""
+        self.rd_data_o = Signal(data_width); """read port data"""
+        self.phase = Signal(); """even/odd cycle indicator"""
+        # debug signals, only used in formal proofs
+        self.dbg_addr = Signal(addr_width); """debug: address under test"""
+        lanes = range(we_width)
+        self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+        gran = self.data_width // self.we_width
+        self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+        self.dbg_wrote = Signal(); """debug: data is valid"""
+
+    def elaborate(self, platform):
+        m = Module()
+        # granularity
+        # instantiate the two 1RW memory blocks
+        mem1 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+        mem2 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+        m.submodules.mem1 = mem1
+        m.submodules.mem2 = mem2
+        # wire write port to first memory, and its output to the second
+        m.d.comb += mem1.d.eq(self.wr_data_i)
+        m.d.comb += mem2.d.eq(mem1.q)
+        # holding registers for the write port of the second memory
+        last_wr_addr = Signal(self.addr_width)
+        last_wr_we = Signal(self.we_width)
+        # do the read and write address coincide?
+        same_read_write = Signal()
+        with m.If(self.phase == self.write_phase):
+            # write phase, start a write on the first memory
+            m.d.comb += mem1.a.eq(self.wr_addr_i)
+            m.d.comb += mem1.we.eq(self.wr_we_i)
+            # save write address and write select for repeating the write
+            # on the second memory, later
+            m.d.sync += last_wr_we.eq(self.wr_we_i)
+            m.d.sync += last_wr_addr.eq(self.wr_addr_i)
+            # start a read on the second memory
+            m.d.comb += mem2.a.eq(self.rd_addr_i)
+            # output previously read data from the first memory
+            m.d.comb += self.rd_data_o.eq(mem1.q)
+            if self.transparent:
+                # remember whether we are reading from the same location we are
+                # writing
+                m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+        with m.Else():
+            # read phase, write last written data on second memory
+            m.d.comb += mem2.a.eq(last_wr_addr)
+            m.d.comb += mem2.we.eq(last_wr_we)
+            # start a read on the first memory
+            m.d.comb += mem1.a.eq(self.rd_addr_i)
+            if self.transparent:
+                with m.If(same_read_write):
+                    # when transparent, and read and write addresses coincide,
+                    # output the data just written
+                    m.d.comb += self.rd_data_o.eq(mem1.q)
+                with m.Else():
+                    # otherwise, output previously read data
+                    # from the second memory
+                    m.d.comb += self.rd_data_o.eq(mem2.q)
+            else:
+                # always output the read data from the second memory,
+                # if not transparent
+                m.d.comb += self.rd_data_o.eq(mem2.q)
+
+        if platform == "formal":
+            # pass our state to the device under test, so it can ensure that
+            # its state is in sync with ours, for induction
+            m.d.comb += [
+                # pass the address and write lane under test to both memories
+                mem1.dbg_addr.eq(self.dbg_addr),
+                mem2.dbg_addr.eq(self.dbg_addr),
+                mem1.dbg_lane.eq(self.dbg_lane),
+                mem2.dbg_lane.eq(self.dbg_lane),
+                # the second memory copies its state from the first memory,
+                # after a cycle, so it has a one cycle delay
+                mem1.dbg_data.eq(self.dbg_data),
+                mem2.dbg_data.eq(Past(self.dbg_data)),
+                mem1.dbg_wrote.eq(self.dbg_wrote),
+                mem2.dbg_wrote.eq(Past(self.dbg_wrote)),
+            ]
+
+        return m
+
+    def ports(self):
+        return [
+            self.wr_addr_i,
+            self.wr_data_i,
+            self.wr_we_i,
+            self.rd_addr_i,
+            self.rd_data_o,
+            self.phase
+        ]
+
+
+class PhasedDualPortRegfileTestCase(FHDLTestCase):
+
+    def do_test_phased_dual_port_regfile(self, write_phase, transparent):
+        """
+        Simulate some read/write/modify operations on the phased write memory
+        """
+        dut = PhasedDualPortRegfile(7, 32, 4, write_phase, transparent)
+        sim = Simulator(dut)
+        sim.add_clock(1e-6)
+
+        # compare read data with previously written data
+        # and start a new read
+        def read(rd_addr_i, expected=None):
+            if expected is not None:
+                self.assertEqual((yield dut.rd_data_o), expected)
+            yield dut.rd_addr_i.eq(rd_addr_i)
+
+        # start a write, and set write phase
+        def write(wr_addr_i, wr_we_i, wr_data_i):
+            yield dut.wr_addr_i.eq(wr_addr_i)
+            yield dut.wr_we_i.eq(wr_we_i)
+            yield dut.wr_data_i.eq(wr_data_i)
+            yield dut.phase.eq(write_phase)
+
+        # disable writes, and start read phase
+        def skip_write():
+            yield dut.wr_addr_i.eq(0)
+            yield dut.wr_we_i.eq(0)
+            yield dut.wr_data_i.eq(0)
+            yield dut.phase.eq(~write_phase)
+
+        # writes a few values on the write port, and read them back
+        # ... reads can happen every cycle
+        # ... writes, only every two cycles.
+        # since reads have a one cycle delay, the expected value on
+        # each read refers to the last read performed, not the
+        # current one, which is in progress.
+        def process():
+            yield from read(0)
+            yield from write(0x42, 0b1111, 0x12345678)
+            yield
+            yield from read(0x42)
+            yield from skip_write()
+            yield
+            yield from read(0x42)
+            yield from write(0x43, 0b1111, 0x9ABCDEF0)
+            yield
+            yield from read(0x43, 0x12345678)
+            yield from skip_write()
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from write(0x43, 0b1001, 0xF0FFFF9A)
+            yield
+            yield from read(0x43, 0x9ABCDEF0)
+            yield from skip_write()
+            yield
+            yield from read(0x43, 0x12345678)
+            yield from write(0x42, 0b0110, 0xFF5634FF)
+            yield
+            yield from read(0x42, 0xF0BCDE9A)
+            yield from skip_write()
+            yield
+            yield from read(0, 0xF0BCDE9A)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0, 0x12563478)
+            yield from skip_write()
+            yield
+            # try reading and writing to the same location, simultaneously
+            yield from read(0x42)
+            yield from write(0x42, 0b0101, 0x55AA9966)
+            yield
+            # ... and read again
+            yield from read(0x42)
+            yield from skip_write()
+            yield
+            if transparent:
+                # returns the value just written
+                yield from read(0, 0x12AA3466)
+            else:
+                # returns the old value
+                yield from read(0, 0x12563478)
+            yield from write(0, 0, 0)
+            yield
+            # after a cycle, always returns the new value
+            yield from read(0, 0x12AA3466)
+            yield from skip_write()
+
+        sim.add_sync_process(process)
+        debug_file = f'test_phased_dual_port_{write_phase}'
+        if transparent:
+            debug_file += '_transparent'
+        traces = ['clk', 'phase',
+                  'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+                  'rd_addr_i[6:0]', 'rd_data_o[31:0]']
+        write_gtkw(debug_file + '.gtkw',
+                   debug_file + '.vcd',
+                   traces, module='top', zoom=-22)
+        sim_writer = sim.write_vcd(debug_file + '.vcd')
+        with sim_writer:
+            sim.run()
+
+    def test_phased_dual_port_regfile(self):
+        """test both types (odd and even write ports) of phased write memory"""
+        with self.subTest("writes happen on phase 0"):
+            self.do_test_phased_dual_port_regfile(0, False)
+        with self.subTest("writes happen on phase 1"):
+            self.do_test_phased_dual_port_regfile(1, False)
+        """test again, with a transparent read port"""
+        with self.subTest("writes happen on phase 0 (transparent reads)"):
+            self.do_test_phased_dual_port_regfile(0, True)
+        with self.subTest("writes happen on phase 1 (transparent reads)"):
+            self.do_test_phased_dual_port_regfile(1, True)
+
+    def do_test_phased_dual_port_regfile_proof(self, write_phase, transparent):
+        """
+        Formal proof of the pseudo 1W/1R regfile
+        """
+        m = Module()
+        # 128 x 32-bit, 8-bit granularity
+        dut = PhasedDualPortRegfile(7, 32, 4, write_phase, transparent)
+        m.submodules.dut = dut
+        gran = dut.data_width // dut.we_width  # granularity
+        # choose a single random memory location to test
+        a_const = AnyConst(dut.addr_width)
+        # choose a single byte lane to test
+        lane = AnyConst(range(dut.we_width))
+        # drive alternating phases
+        m.d.comb += Assume(dut.phase != Past(dut.phase))
+        # holding data register
+        d_reg = Signal(gran)
+        # for some reason, simulated formal memory is not zeroed at reset
+        # ... so, remember whether we wrote it, at least once.
+        wrote = Signal()
+        # if our memory location and byte lane is being written,
+        # capture the data in our holding register
+        with m.If((dut.wr_addr_i == a_const)
+                  & dut.wr_we_i.bit_select(lane, 1)
+                  & (dut.phase == dut.write_phase)):
+            m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+            m.d.sync += wrote.eq(1)
+        # if our memory location is being read,
+        # and the holding register has valid data,
+        # then its value must match the memory output, on the given lane
+        with m.If(Past(dut.rd_addr_i) == a_const):
+            if transparent:
+                with m.If(wrote):
+                    rd_lane = dut.rd_data_o.word_select(lane, gran)
+                    m.d.sync += Assert(d_reg == rd_lane)
+            else:
+                # with a non-transparent read port, the read value depends
+                # on whether there is a simultaneous write, or not
+                with m.If((Past(dut.wr_addr_i) == a_const)
+                          & Past(dut.phase) == dut.write_phase):
+                    # simultaneous write -> check against last written value
+                    with m.If(Past(wrote)):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(Past(d_reg) == rd_lane)
+                with m.Else():
+                    # otherwise, check against current written value
+                    with m.If(wrote):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(d_reg == rd_lane)
+
+        # pass our state to the device under test, so it can ensure that
+        # its state is in sync with ours, for induction
+        m.d.comb += [
+            # address and mask under test
+            dut.dbg_addr.eq(a_const),
+            dut.dbg_lane.eq(lane),
+            # state of our holding register
+            dut.dbg_data.eq(d_reg),
+            dut.dbg_wrote.eq(wrote),
+        ]
+
+        self.assertFormal(m, mode="prove", depth=3)
+
+    def test_phased_dual_port_regfile_proof(self):
+        """test both types (odd and even write ports) of phased write memory"""
+        with self.subTest("writes happen on phase 0"):
+            self.do_test_phased_dual_port_regfile_proof(0, False)
+        with self.subTest("writes happen on phase 1"):
+            self.do_test_phased_dual_port_regfile_proof(1, False)
+        # test again, with transparent read ports
+        with self.subTest("writes happen on phase 0 (transparent reads)"):
+            self.do_test_phased_dual_port_regfile_proof(0, True)
+        with self.subTest("writes happen on phase 1 (transparent reads)"):
+            self.do_test_phased_dual_port_regfile_proof(1, True)
+
+
+class DualPortRegfile(Elaboratable):
+    """
+    Builds, from a pair of phased 1W/1R blocks, a true 1W/1R RAM, where both
+    read and write ports work every cycle.
+    It employs a Last Value Table, that tracks to which memory each address was
+    last written.
+
+    :param addr_width: width of the address bus
+    :param data_width: width of the data bus
+    :param we_width: number of write enable lines
+    :param transparent: whether a simultaneous read and write returns the
+                        new value (True) or the old value (False)
+    """
+
+    def __init__(self, addr_width, data_width, we_width, transparent=True):
+        self.addr_width = addr_width
+        self.data_width = data_width
+        self.we_width = we_width
+        self.transparent = transparent
+        # interface signals
+        self.wr_addr_i = Signal(addr_width); """write port address"""
+        self.wr_data_i = Signal(data_width); """write port data"""
+        self.wr_we_i = Signal(we_width); """write port enable"""
+        self.rd_addr_i = Signal(addr_width); """read port address"""
+        self.rd_data_o = Signal(data_width); """read port data"""
+        # debug signals, only used in formal proofs
+        # address and write lane under test
+        self.dbg_addr = Signal(addr_width); """debug: address under test"""
+        lanes = range(we_width)
+        self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+        # upstream state, to keep in sync with ours
+        gran = self.data_width // self.we_width
+        self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+        self.dbg_wrote = Signal(); """debug: data is valid"""
+        self.dbg_wrote_phase = Signal(); """debug: the phase data was written"""
+        self.dbg_phase = Signal(); """debug: current phase"""
+
+    def elaborate(self, platform):
+        m = Module()
+        # depth and granularity
+        depth = 1 << self.addr_width
+        gran = self.data_width // self.we_width
+        # instantiate the two phased 1R/1W memory blocks
+        mem0 = PhasedDualPortRegfile(
+            self.addr_width, self.data_width, self.we_width, 0,
+            self.transparent)
+        mem1 = PhasedDualPortRegfile(
+            self.addr_width, self.data_width, self.we_width, 1,
+            self.transparent)
+        m.submodules.mem0 = mem0
+        m.submodules.mem1 = mem1
+        # instantiate the backing memory (FFRAM or LUTRAM)
+        # for the Last Value Table
+        # it should have the same number and port types of the desired
+        # memory, but just one bit per write lane
+        lvt_mem = Memory(width=self.we_width, depth=depth)
+        lvt_wr = lvt_mem.write_port(granularity=1)
+        lvt_rd = lvt_mem.read_port(transparent=self.transparent)
+        if not self.transparent:
+            # for some reason, formal proofs don't recognize the default
+            # reset value for this signal
+            m.d.comb += lvt_rd.en.eq(1)
+        m.submodules.lvt_wr = lvt_wr
+        m.submodules.lvt_rd = lvt_rd
+        # generate and wire the phases for the phased memories
+        phase = Signal()
+        m.d.sync += phase.eq(~phase)
+        m.d.comb += [
+            mem0.phase.eq(phase),
+            mem1.phase.eq(phase),
+        ]
+        m.d.comb += [
+            # wire the write ports, directly
+            mem0.wr_addr_i.eq(self.wr_addr_i),
+            mem1.wr_addr_i.eq(self.wr_addr_i),
+            mem0.wr_we_i.eq(self.wr_we_i),
+            mem1.wr_we_i.eq(self.wr_we_i),
+            mem0.wr_data_i.eq(self.wr_data_i),
+            mem1.wr_data_i.eq(self.wr_data_i),
+            # also wire the read addresses
+            mem0.rd_addr_i.eq(self.rd_addr_i),
+            mem1.rd_addr_i.eq(self.rd_addr_i),
+            # wire read and write ports to the LVT
+            lvt_wr.addr.eq(self.wr_addr_i),
+            lvt_wr.en.eq(self.wr_we_i),
+            lvt_rd.addr.eq(self.rd_addr_i),
+            # the data for the LVT is the phase on which the value was
+            # written
+            lvt_wr.data.eq(Repl(phase, self.we_width)),
+        ]
+        for i in range(self.we_width):
+            # select the right memory to assign to the output read port,
+            # in this byte lane, according to the LVT contents
+            m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+                Mux(
+                    lvt_rd.data[i],
+                    mem1.rd_data_o.word_select(i, gran),
+                    mem0.rd_data_o.word_select(i, gran)))
+
+        if platform == "formal":
+            # pass upstream state to the memories, so they can ensure that
+            # their state are in sync with upstream, for induction
+            m.d.comb += [
+                # address and write lane under test
+                mem0.dbg_addr.eq(self.dbg_addr),
+                mem1.dbg_addr.eq(self.dbg_addr),
+                mem0.dbg_lane.eq(self.dbg_lane),
+                mem1.dbg_lane.eq(self.dbg_lane),
+                # upstream state
+                mem0.dbg_data.eq(self.dbg_data),
+                mem1.dbg_data.eq(self.dbg_data),
+                # the memory, on which the write ends up, depends on which
+                # phase it was written
+                mem0.dbg_wrote.eq(self.dbg_wrote & ~self.dbg_wrote_phase),
+                mem1.dbg_wrote.eq(self.dbg_wrote & self.dbg_wrote_phase),
+            ]
+            # sync phase to upstream
+            m.d.comb += Assert(self.dbg_phase == phase)
+            # this debug port for the LVT is an asynchronous read port,
+            # allowing direct access to a given memory location
+            # by the formal engine
+            m.submodules.dbgport = dbgport = lvt_mem.read_port(domain='comb')
+            # first, get the value stored in our memory location,
+            stored = Signal(self.we_width)
+            m.d.comb += dbgport.addr.eq(self.dbg_addr)
+            m.d.comb += stored.eq(dbgport.data)
+            # now, ensure that the value stored in memory is always in sync
+            # with the expected value (which memory the value was written to)
+            with m.If(self.dbg_wrote):
+                m.d.comb += Assert(stored.bit_select(self.dbg_lane, 1)
+                                   == self.dbg_wrote_phase)
+        return m
+
+    def ports(self):
+        return [
+            self.wr_addr_i,
+            self.wr_data_i,
+            self.wr_we_i,
+            self.rd_addr_i,
+            self.rd_data_o
+        ]
+
+
+class DualPortRegfileTestCase(FHDLTestCase):
+
+    def do_test_dual_port_regfile(self, transparent):
+        """
+        Simulate some read/write/modify operations on the dual port register
+        file
+        """
+        dut = DualPortRegfile(7, 32, 4, transparent)
+        sim = Simulator(dut)
+        sim.add_clock(1e-6)
+
+        expected = None
+        last_expected = None
+
+        # compare read data with previously written data
+        # and start a new read
+        def read(rd_addr_i, next_expected=None):
+            nonlocal expected, last_expected
+            if expected is not None:
+                self.assertEqual((yield dut.rd_data_o), expected)
+            yield dut.rd_addr_i.eq(rd_addr_i)
+            # account for the read latency
+            expected = last_expected
+            last_expected = next_expected
+
+        # start a write
+        def write(wr_addr_i, wr_we_i, wr_data_i):
+            yield dut.wr_addr_i.eq(wr_addr_i)
+            yield dut.wr_we_i.eq(wr_we_i)
+            yield dut.wr_data_i.eq(wr_data_i)
+
+        def process():
+            # write a pair of values, one for each memory
+            yield from read(0)
+            yield from write(0x42, 0b1111, 0x87654321)
+            yield
+            yield from read(0x42, 0x87654321)
+            yield from write(0x43, 0b1111, 0x0FEDCBA9)
+            yield
+            # skip a beat
+            yield from read(0x43, 0x0FEDCBA9)
+            yield from write(0, 0, 0)
+            yield
+            # write again, but now they switch memories
+            yield from read(0)
+            yield from write(0x42, 0b1111, 0x12345678)
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from write(0x43, 0b1111, 0x9ABCDEF0)
+            yield
+            yield from read(0x43, 0x9ABCDEF0)
+            yield from write(0, 0, 0)
+            yield
+            # test partial writes
+            yield from read(0)
+            yield from write(0x42, 0b1001, 0x78FFFF12)
+            yield
+            yield from read(0)
+            yield from write(0x43, 0b0110, 0xFFDEABFF)
+            yield
+            yield from read(0x42, 0x78345612)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0x43, 0x9ADEABF0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from write(0, 0, 0)
+            yield
+            if transparent:
+                # returns the value just written
+                yield from read(0x42, 0x78AA5666)
+            else:
+                # returns the old value
+                yield from read(0x42, 0x78345612)
+            yield from write(0x42, 0b0101, 0x55AA9966)
+            yield
+            # after a cycle, always returns the new value
+            yield from read(0x42, 0x78AA5666)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from write(0, 0, 0)
+
+        sim.add_sync_process(process)
+        debug_file = 'test_dual_port_regfile'
+        if transparent:
+            debug_file += '_transparent'
+        traces = ['clk', 'phase',
+                  {'comment': 'write port'},
+                  'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+                  {'comment': 'read port'},
+                  'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+                  {'comment': 'LVT write port'},
+                  'phase', 'lvt_mem_w_addr[6:0]', 'lvt_mem_w_en[3:0]',
+                  'lvt_mem_w_data[3:0]',
+                  {'comment': 'LVT read port'},
+                  'lvt_mem_r_addr[6:0]', 'lvt_mem_r_data[3:0]',
+                  {'comment': 'backing memory'},
+                  'mem0.rd_data_o[31:0]',
+                  'mem1.rd_data_o[31:0]',
+                  ]
+        write_gtkw(debug_file + '.gtkw',
+                   debug_file + '.vcd',
+                   traces, module='top', zoom=-22)
+        sim_writer = sim.write_vcd(debug_file + '.vcd')
+        with sim_writer:
+            sim.run()
+
+    def test_dual_port_regfile(self):
+        with self.subTest("non-transparent reads"):
+            self.do_test_dual_port_regfile(False)
+        with self.subTest("transparent reads"):
+            self.do_test_dual_port_regfile(True)
+
+    def do_test_dual_port_regfile_proof(self, transparent=True):
+        """
+        Formal proof of the 1W/1R regfile
+        """
+        m = Module()
+        # 128 x 32-bit, 8-bit granularity
+        dut = DualPortRegfile(7, 32, 4, transparent)
+        m.submodules.dut = dut
+        gran = dut.data_width // dut.we_width  # granularity
+        # choose a single random memory location to test
+        a_const = AnyConst(dut.addr_width)
+        # choose a single byte lane to test
+        lane = AnyConst(range(dut.we_width))
+        # holding data register
+        d_reg = Signal(gran)
+        # keep track of the phase, so we can remember which memory
+        # we wrote to
+        phase = Signal()
+        m.d.sync += phase.eq(~phase)
+        # for some reason, simulated formal memory is not zeroed at reset
+        # ... so, remember whether we wrote it, at least once.
+        wrote = Signal()
+        # ... and on which phase it was written
+        wrote_phase = Signal()
+        # if our memory location and byte lane is being written,
+        # capture the data in our holding register
+        with m.If((dut.wr_addr_i == a_const)
+                  & dut.wr_we_i.bit_select(lane, 1)):
+            m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+            m.d.sync += wrote.eq(1)
+            m.d.sync += wrote_phase.eq(phase)
+        # if our memory location is being read,
+        # and the holding register has valid data,
+        # then its value must match the memory output, on the given lane
+        with m.If(Past(dut.rd_addr_i) == a_const):
+            if transparent:
+                with m.If(wrote):
+                    rd_lane = dut.rd_data_o.word_select(lane, gran)
+                    m.d.sync += Assert(d_reg == rd_lane)
+            else:
+                # with a non-transparent read port, the read value depends
+                # on whether there is a simultaneous write, or not
+                with m.If(Past(dut.wr_addr_i) == a_const):
+                    # simultaneous write -> check against last written value
+                    with m.If(wrote & Past(wrote)):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(Past(d_reg) == rd_lane)
+                with m.Else():
+                    # otherwise, check against current written value
+                    with m.If(wrote):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(d_reg == rd_lane)
+
+        m.d.comb += [
+            dut.dbg_addr.eq(a_const),
+            dut.dbg_lane.eq(lane),
+            dut.dbg_data.eq(d_reg),
+            dut.dbg_wrote.eq(wrote),
+            dut.dbg_wrote_phase.eq(wrote_phase),
+            dut.dbg_phase.eq(phase),
+        ]
+
+        self.assertFormal(m, mode="prove", depth=3)
+
+    def test_dual_port_regfile_proof(self):
+        """
+        Formal check of 1W/1R regfile (transparent and not)
+        """
+        with self.subTest("transparent reads"):
+            self.do_test_dual_port_regfile_proof(True)
+        with self.subTest("non-transparent reads"):
+            self.do_test_dual_port_regfile_proof(False)
+
+
+class PhasedReadPhasedWriteFullReadSRAM(Elaboratable):
+    """
+    Builds, from three 1RW blocks, a pseudo 1W/2R SRAM, with:
+
+    * one full read port, which works every cycle,
+    * one write port, which is only available on either even or odd cycles,
+    * an extra transparent read port, available only on the same cycles as the
+      write port
+
+    This type of SRAM is useful for a XOR-based 6x1RW implementation of
+    a 1R/1W register file.
+
+    :param addr_width: width of the address bus
+    :param data_width: width of the data bus
+    :param we_width: number of write enable lines
+    :param write_phase: indicates on which phase the write port will
+                        accept data
+    :param transparent: whether a simultaneous read and write returns the
+                        new value (True) or the old value (False) on the full
+                        read port
+
+    .. note:: The debug read port is meant only to assist in formal proofs!
+    """
+
+    def __init__(self, addr_width, data_width, we_width, write_phase,
+                 transparent=True):
+        self.addr_width = addr_width
+        self.data_width = data_width
+        self.we_width = we_width
+        self.write_phase = write_phase
+        self.transparent = transparent
+        # interface signals
+        self.wr_addr_i = Signal(addr_width); """phased write port address"""
+        self.wr_data_i = Signal(data_width); """phased write port data"""
+        self.wr_we_i = Signal(we_width); """phased write port enable"""
+        self.rd_addr_i = Signal(addr_width); """full read port address"""
+        self.rd_data_o = Signal(data_width); """full read port data"""
+        self.rdp_addr_i = Signal(addr_width); """phased read port address"""
+        self.rdp_data_o = Signal(data_width); """phased read port data"""
+        self.phase = Signal(); """even/odd cycle indicator"""
+        # debug signals, only used in formal proofs
+        self.dbg_addr = Signal(addr_width); """debug: address under test"""
+        lanes = range(we_width)
+        self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+        gran = self.data_width // self.we_width
+        self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+        self.dbg_wrote = Signal(); """debug: data is valid"""
+
+    def elaborate(self, platform):
+        m = Module()
+        # instantiate the 1RW memory blocks
+        mem1 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+        mem2 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+        mem3 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+        m.submodules.mem1 = mem1
+        m.submodules.mem2 = mem2
+        m.submodules.mem3 = mem3
+        # wire input write data to first memory, and its output to the others
+        m.d.comb += [
+            mem1.d.eq(self.wr_data_i),
+            mem2.d.eq(mem1.q),
+            mem3.d.eq(mem1.q)
+        ]
+        # holding registers for the write port of the other memories
+        last_wr_addr = Signal(self.addr_width)
+        last_wr_we = Signal(self.we_width)
+        # do read and write addresses coincide?
+        same_read_write = Signal()
+        same_phased_read_write = Signal()
+        with m.If(self.phase == self.write_phase):
+            # write phase, start a write on the first memory
+            m.d.comb += mem1.a.eq(self.wr_addr_i)
+            m.d.comb += mem1.we.eq(self.wr_we_i)
+            # save write address and write select for repeating the write
+            # on the other memories, one cycle later
+            m.d.sync += last_wr_we.eq(self.wr_we_i)
+            m.d.sync += last_wr_addr.eq(self.wr_addr_i)
+            # start a read on the other memories
+            m.d.comb += mem2.a.eq(self.rd_addr_i)
+            m.d.comb += mem3.a.eq(self.rdp_addr_i)
+            # output previously read data from the first memory
+            m.d.comb += self.rd_data_o.eq(mem1.q)
+            # remember whether we are reading from the same location as we
+            # are writing
+            m.d.sync += same_phased_read_write.eq(
+                self.rdp_addr_i == self.wr_addr_i)
+            if self.transparent:
+                m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+        with m.Else():
+            # read phase, write last written data on the other memories
+            m.d.comb += [
+                mem2.a.eq(last_wr_addr),
+                mem2.we.eq(last_wr_we),
+                mem3.a.eq(last_wr_addr),
+                mem3.we.eq(last_wr_we),
+            ]
+            # start a read on the first memory
+            m.d.comb += mem1.a.eq(self.rd_addr_i)
+            # output the read data from the second memory
+            if self.transparent:
+                with m.If(same_read_write):
+                    # when transparent, and read and write addresses coincide,
+                    # output the data just written
+                    m.d.comb += self.rd_data_o.eq(mem1.q)
+                with m.Else():
+                    # otherwise, output previously read data
+                    # from the second memory
+                    m.d.comb += self.rd_data_o.eq(mem2.q)
+            else:
+                # always output the read data from the second memory,
+                # if not transparent
+                m.d.comb += self.rd_data_o.eq(mem2.q)
+            with m.If(same_phased_read_write):
+                # if read and write addresses coincide,
+                # output the data just written
+                m.d.comb += self.rdp_data_o.eq(mem1.q)
+            with m.Else():
+                # otherwise, output previously read data
+                # from the third memory
+                m.d.comb += self.rdp_data_o.eq(mem3.q)
+
+        if platform == "formal":
+            # pass our state to the device under test, so it can ensure that
+            # its state is in sync with ours, for induction
+            m.d.comb += [
+                # pass the address and write lane under test to both memories
+                mem1.dbg_addr.eq(self.dbg_addr),
+                mem2.dbg_addr.eq(self.dbg_addr),
+                mem3.dbg_addr.eq(self.dbg_addr),
+                mem1.dbg_lane.eq(self.dbg_lane),
+                mem2.dbg_lane.eq(self.dbg_lane),
+                mem3.dbg_lane.eq(self.dbg_lane),
+                # the other memories copy their state from the first memory,
+                # after a cycle, so they have a one cycle delay
+                mem1.dbg_data.eq(self.dbg_data),
+                mem2.dbg_data.eq(Past(self.dbg_data)),
+                mem3.dbg_data.eq(Past(self.dbg_data)),
+                mem1.dbg_wrote.eq(self.dbg_wrote),
+                mem2.dbg_wrote.eq(Past(self.dbg_wrote)),
+                mem3.dbg_wrote.eq(Past(self.dbg_wrote)),
+            ]
+
+        return m
+
+
+class PhasedReadPhasedWriteFullReadSRAMTestCase(FHDLTestCase):
+
+    def do_test_case(self, write_phase, transparent):
+        """
+        Simulate some read/write/modify operations
+        """
+        dut = PhasedReadPhasedWriteFullReadSRAM(7, 32, 4, write_phase,
+                                                transparent)
+        sim = Simulator(dut)
+        sim.add_clock(1e-6)
+
+        expected = None
+        last_expected = None
+
+        # compare read data with previously written data
+        # and start a new read
+        def read(rd_addr_i, next_expected=None):
+            nonlocal expected, last_expected
+            if expected is not None:
+                self.assertEqual((yield dut.rd_data_o), expected)
+            yield dut.rd_addr_i.eq(rd_addr_i)
+            # account for the read latency
+            expected = last_expected
+            last_expected = next_expected
+
+        expected2 = None
+
+        # same as above, but for the phased read port
+        def phased_read(rdp_addr_i, next_expected2=None):
+            nonlocal expected2
+            if expected2 is not None:
+                self.assertEqual((yield dut.rdp_data_o), expected2)
+            yield dut.rdp_addr_i.eq(rdp_addr_i)
+            # account for the read latency
+            expected2 = next_expected2
+
+        # start a write
+        def write(wr_addr_i, wr_we_i, wr_data_i):
+            yield dut.wr_addr_i.eq(wr_addr_i)
+            yield dut.wr_we_i.eq(wr_we_i)
+            yield dut.wr_data_i.eq(wr_data_i)
+            yield dut.phase.eq(write_phase)
+
+        # disable writes, and start read phase
+        def skip_write():
+            yield dut.wr_addr_i.eq(0)
+            yield dut.wr_we_i.eq(0)
+            yield dut.wr_data_i.eq(0)
+            yield dut.phase.eq(~write_phase)
+            # also skip reading from the phased read port
+            yield dut.rdp_addr_i.eq(0)
+
+        # writes a few values on the write port, and read them back
+        def process():
+            yield from read(0)
+            yield from phased_read(0)
+            yield from write(0x42, 0b1111, 0x12345678)
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from skip_write()
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from phased_read(0x42, 0x12345678)
+            yield from write(0x43, 0b1111, 0x9ABCDEF0)
+            yield
+            yield from read(0x43, 0x9ABCDEF0)
+            yield from skip_write()
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from phased_read(0x42, 0x12345678)
+            yield from write(0x43, 0b1001, 0xF0FFFF9A)
+            yield
+            yield from read(0x43, 0xF0BCDE9A)
+            yield from skip_write()
+            yield
+            yield from read(0x43, 0xF0BCDE9A)
+            yield from phased_read(0x43, 0xF0BCDE9A)
+            yield from write(0x42, 0b0110, 0xFF5634FF)
+            yield
+            yield from read(0x42, 0x12563478)
+            yield from skip_write()
+            yield
+            yield from read(0)
+            yield from phased_read(0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from skip_write()
+            yield
+            # try reading and writing at the same time
+            if transparent:
+                # transparent port, return the value just written
+                yield from read(0x42, 0x12AA3466)
+            else:
+                # ... otherwise, return the old value
+                yield from read(0x42, 0x12563478)
+            # transparent port, always return the value just written
+            yield from phased_read(0x42, 0x12AA3466)
+            yield from write(0x42, 0b0101, 0x55AA9966)
+            yield
+            # after a cycle, always returns the new value
+            yield from read(0x42, 0x12AA3466)
+            yield from skip_write()
+            yield
+            yield from read(0)
+            yield from phased_read(0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from skip_write()
+
+        sim.add_sync_process(process)
+        debug_file = 'test_phased_read_write_sram_' + str(write_phase)
+        if transparent:
+            debug_file += '_transparent'
+        traces = ['clk', 'phase',
+                  {'comment': 'phased write port'},
+                  'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+                  {'comment': 'full read port'},
+                  'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+                  {'comment': 'phased read port'},
+                  'rdp_addr_i[6:0]', 'rdp_data_o[31:0]']
+        write_gtkw(debug_file + '.gtkw',
+                   debug_file + '.vcd',
+                   traces, module='top', zoom=-22)
+        sim_writer = sim.write_vcd(debug_file + '.vcd')
+        with sim_writer:
+            sim.run()
+
+    def test_case(self):
+        """test both types (odd and even write ports) of phased memory"""
+        with self.subTest("writes happen on phase 0"):
+            self.do_test_case(0, True)
+        with self.subTest("writes happen on phase 1"):
+            self.do_test_case(1, True)
+        with self.subTest("writes happen on phase 0 (non-transparent reads)"):
+            self.do_test_case(0, False)
+        with self.subTest("writes happen on phase 1 (non-transparent reads)"):
+            self.do_test_case(1, False)
+
+    def do_test_formal(self, write_phase, transparent):
+        """
+        Formal proof of the pseudo 1W/2R regfile
+        """
+        m = Module()
+        # 128 x 32-bit, 8-bit granularity
+        dut = PhasedReadPhasedWriteFullReadSRAM(7, 32, 4, write_phase,
+                                                transparent)
+        m.submodules.dut = dut
+        gran = dut.data_width // dut.we_width  # granularity
+        # choose a single random memory location to test
+        a_const = AnyConst(dut.addr_width)
+        # choose a single byte lane to test
+        lane = AnyConst(range(dut.we_width))
+        # drive alternating phases
+        m.d.comb += Assume(dut.phase != Past(dut.phase))
+        # holding data register
+        d_reg = Signal(gran)
+        # for some reason, simulated formal memory is not zeroed at reset
+        # ... so, remember whether we wrote it, at least once.
+        wrote = Signal()
+        # if our memory location and byte lane is being written,
+        # capture the data in our holding register
+        with m.If((dut.wr_addr_i == a_const)
+                  & dut.wr_we_i.bit_select(lane, 1)
+                  & (dut.phase == dut.write_phase)):
+            m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+            m.d.sync += wrote.eq(1)
+        # if our memory location is being read,
+        # and the holding register has valid data,
+        # then its value must match the memory output, on the given lane
+        with m.If(Past(dut.rd_addr_i) == a_const):
+            if transparent:
+                with m.If(wrote):
+                    rd_lane = dut.rd_data_o.word_select(lane, gran)
+                    m.d.sync += Assert(d_reg == rd_lane)
+            else:
+                # with a non-transparent read port, the read value depends
+                # on whether there is a simultaneous write, or not
+                with m.If((Past(dut.wr_addr_i) == a_const)
+                          & Past(dut.phase) == dut.write_phase):
+                    # simultaneous write -> check against last written value
+                    with m.If(Past(wrote)):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(Past(d_reg) == rd_lane)
+                with m.Else():
+                    # otherwise, check against current written value
+                    with m.If(wrote):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(d_reg == rd_lane)
+        # same for the phased read port, except it's always transparent
+        # and the port works only on the write phase
+        with m.If((Past(dut.rdp_addr_i) == a_const) & wrote
+                  & (Past(dut.phase) == dut.write_phase)):
+            rdp_lane = dut.rdp_data_o.word_select(lane, gran)
+            m.d.sync += Assert(d_reg == rdp_lane)
+
+        # pass our state to the device under test, so it can ensure that
+        # its state is in sync with ours, for induction
+        m.d.comb += [
+            # address and mask under test
+            dut.dbg_addr.eq(a_const),
+            dut.dbg_lane.eq(lane),
+            # state of our holding register
+            dut.dbg_data.eq(d_reg),
+            dut.dbg_wrote.eq(wrote),
+        ]
+
+        self.assertFormal(m, mode="prove", depth=3)
+
+    def test_formal(self):
+        """test both types (odd and even write ports) of phased write memory"""
+        with self.subTest("writes happen on phase 0"):
+            self.do_test_formal(0, False)
+        with self.subTest("writes happen on phase 1"):
+            self.do_test_formal(1, False)
+        # test again, with transparent read ports
+        with self.subTest("writes happen on phase 0 (transparent reads)"):
+            self.do_test_formal(0, True)
+        with self.subTest("writes happen on phase 1 (transparent reads)"):
+            self.do_test_formal(1, True)
+
+
+class DualPortXorRegfile(Elaboratable):
+    """
+    Builds, from a pair of phased 1W/2R blocks, a true 1W/1R RAM, where both
+    write and (non-transparent) read ports work every cycle.
+
+    It employs a XOR trick, as follows:
+
+    1) Like before, there are two memories, each reading on every cycle, and
+       writing on alternate cycles
+    2) Instead of a MUX, the read port is a direct XOR of the two memories.
+    3) Writes happens in two cycles:
+
+        First, read the current value of the *other* memory, at the write
+        location.
+
+        Then, on *this* memory, write that read value, XORed with the desired
+        value.
+
+    This recovers the desired value when read:
+    (other XOR desired) XOR other = desired
+
+    :param addr_width: width of the address bus
+    :param data_width: width of the data bus
+    :param we_width: number of write enable lines
+    :param transparent: whether a simultaneous read and write returns the
+                        new value (True) or the old value (False) on the full
+                        read port
+    """
+
+    def __init__(self, addr_width, data_width, we_width, transparent):
+        self.addr_width = addr_width
+        self.data_width = data_width
+        self.we_width = we_width
+        self.transparent = transparent
+        # interface signals
+        self.wr_addr_i = Signal(addr_width); """write port address"""
+        self.wr_data_i = Signal(data_width); """write port data"""
+        self.wr_we_i = Signal(we_width); """write port enable"""
+        self.rd_addr_i = Signal(addr_width); """read port address"""
+        self.rd_data_o = Signal(data_width); """read port data"""
+
+    def elaborate(self, platform):
+        m = Module()
+        # instantiate the two phased 1W/2R memory blocks
+        mem0 = PhasedReadPhasedWriteFullReadSRAM(
+            self.addr_width, self.data_width, self.we_width, 0, True)
+        mem1 = PhasedReadPhasedWriteFullReadSRAM(
+            self.addr_width, self.data_width, self.we_width, 1, True)
+        m.submodules.mem0 = mem0
+        m.submodules.mem1 = mem1
+        # generate and wire the phases for the phased memories
+        phase = Signal()
+        m.d.sync += phase.eq(~phase)
+        m.d.comb += [
+            mem0.phase.eq(phase),
+            mem1.phase.eq(phase),
+        ]
+        # store the write information for the next cycle
+        last_addr = Signal(self.addr_width)
+        last_we = Signal(self.we_width)
+        last_data = Signal(self.data_width)
+        m.d.sync += [
+            last_addr.eq(self.wr_addr_i),
+            last_we.eq(self.wr_we_i),
+            last_data.eq(self.wr_data_i),
+        ]
+        # read path
+        # wire read address to memories, and XOR their output
+        xor_data = Signal(self.data_width)
+        m.d.comb += [
+            mem0.rd_addr_i.eq(self.rd_addr_i),
+            mem1.rd_addr_i.eq(self.rd_addr_i),
+            xor_data.eq(mem0.rd_data_o ^ mem1.rd_data_o),
+        ]
+        if self.transparent:
+            # do the read and write addresses coincide?
+            same_read_write = Signal()
+            m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+            gran = self.data_width // self.we_width
+            for i in range(self.we_width):
+                # when simultaneously reading and writing to the same location
+                # and write lane, bypass the memory, and output the write
+                # holding register instead
+                with m.If(same_read_write & last_we[i]):
+                    m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+                        last_data.word_select(i, gran))
+                # otherwise, output the xor data
+                with m.Else():
+                    m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+                        xor_data.word_select(i, gran))
+        # when not transparent, just output the memory contents (xor data)
+        else:
+            m.d.comb += self.rd_data_o.eq(xor_data)
+        # write path
+        # 1) read the memory location which is about to be written
+        m.d.comb += [
+            mem0.rdp_addr_i.eq(self.wr_addr_i),
+            mem1.rdp_addr_i.eq(self.wr_addr_i),
+        ]
+        # 2) write the XOR of the other memory data, and the desired value
+        m.d.comb += [
+            mem0.wr_addr_i.eq(last_addr),
+            mem1.wr_addr_i.eq(last_addr),
+            mem0.wr_we_i.eq(last_we),
+            mem1.wr_we_i.eq(last_we),
+            mem0.wr_data_i.eq(last_data ^ mem1.rdp_data_o),
+            mem1.wr_data_i.eq(last_data ^ mem0.rdp_data_o),
+        ]
+        return m
+
+
+class DualPortXorRegfileTestCase(FHDLTestCase):
+
+    def do_test_case(self, transparent):
+        """
+        Simulate some read/write/modify operations on the dual port register
+        file
+        """
+        dut = DualPortXorRegfile(7, 32, 4, transparent)
+        sim = Simulator(dut)
+        sim.add_clock(1e-6)
+
+        expected = None
+        last_expected = None
+
+        # compare read data with previously written data
+        # and start a new read
+        def read(rd_addr_i, next_expected=None):
+            nonlocal expected, last_expected
+            if expected is not None:
+                self.assertEqual((yield dut.rd_data_o), expected)
+            yield dut.rd_addr_i.eq(rd_addr_i)
+            # account for the read latency
+            expected = last_expected
+            last_expected = next_expected
+
+        # start a write
+        def write(wr_addr_i, wr_we_i, wr_data_i):
+            yield dut.wr_addr_i.eq(wr_addr_i)
+            yield dut.wr_we_i.eq(wr_we_i)
+            yield dut.wr_data_i.eq(wr_data_i)
+
+        def process():
+            # write a pair of values, one for each memory
+            yield from read(0)
+            yield from write(0x42, 0b1111, 0x87654321)
+            yield
+            yield from read(0x42, 0x87654321)
+            yield from write(0x43, 0b1111, 0x0FEDCBA9)
+            yield
+            # skip a beat
+            yield from read(0x43, 0x0FEDCBA9)
+            yield from write(0, 0, 0)
+            yield
+            # write again, but now they switch memories
+            yield from read(0)
+            yield from write(0x42, 0b1111, 0x12345678)
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from write(0x43, 0b1111, 0x9ABCDEF0)
+            yield
+            yield from read(0x43, 0x9ABCDEF0)
+            yield from write(0, 0, 0)
+            yield
+            # test partial writes
+            yield from read(0)
+            yield from write(0x42, 0b1001, 0x78FFFF12)
+            yield
+            yield from read(0)
+            yield from write(0x43, 0b0110, 0xFFDEABFF)
+            yield
+            yield from read(0x42, 0x78345612)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0x43, 0x9ADEABF0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from write(0, 0, 0)
+            yield
+            # test simultaneous read and write
+            if transparent:
+                # transparent reads, returns the new value
+                yield from read(0x42, 0x78AA5666)
+            else:
+                # non-transparent read: returns the old value
+                yield from read(0x42, 0x78345612)
+            yield from write(0x42, 0b0101, 0x55AA9966)
+            yield
+            # after a cycle, returns the new value
+            yield from read(0x42, 0x78AA5666)
+            yield from write(0, 0, 0)
+            yield
+            # settle down
+            yield from read(0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from write(0, 0, 0)
+
+        sim.add_sync_process(process)
+        debug_file = 'test_dual_port_xor_regfile'
+        if transparent:
+            debug_file += '_transparent'
+        traces = ['clk', 'phase',
+                  {'comment': 'write port'},
+                  'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+                  {'comment': 'read port'},
+                  'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+                  ]
+        write_gtkw(debug_file + '.gtkw',
+                   debug_file + '.vcd',
+                   traces, module='top', zoom=-22)
+        sim_writer = sim.write_vcd(debug_file + '.vcd')
+        with sim_writer:
+            sim.run()
+
+    def test_case(self):
+        with self.subTest("non-transparent reads"):
+            self.do_test_case(False)
+        with self.subTest("transparent reads"):
+            self.do_test_case(True)
+
+
+if __name__ == "__main__":
+    unittest.main()
index 428b19f29bf5743b2a50b0d5e0a9652368d58636..9a4abacc3135e647ae4be3d9a8b7882e7ce68fe4 100644 (file)
@@ -90,8 +90,8 @@ def bitvector_remap(regfile, rfile, port):
     # 3 bits, unary: return the port
     if regfile == 'XER':
         return port
-    # 3 bits, unary: return the port
-    if regfile == 'SVSTATE':
+    # 5 bits, unary: return the port
+    if regfile == 'STATE':
         return port
     # 9 bits (9 entries), might be unary already
     if regfile == 'FAST':
@@ -149,14 +149,34 @@ class NonProductionCore(ControlBase):
 
         # link LoadStore1 into MMU
         mmu = self.fus.get_fu('mmu0')
+        ldst0 = self.fus.get_fu('ldst0')
         print ("core pspec", pspec.ldst_ifacetype)
         print ("core mmu", mmu)
         if mmu is not None:
-            print ("core lsmem.lsi", l0.cmpi.lsmem.lsi)
-            mmu.alu.set_ldst_interface(l0.cmpi.lsmem.lsi)
+            lsi = l0.cmpi.lsmem.lsi # a LoadStore1 Interface object
+            print ("core lsmem.lsi", lsi)
+            mmu.alu.set_ldst_interface(lsi)
+            # urr store I-Cache in core so it is easier to get at
+            self.icache = lsi.icache
+
+        # alternative reset values for STATE regs. these probably shouldn't
+        # be set, here, instead have them done by Issuer. which they are.
+        # as well. because core.state overrides them. sigh.
+        self.msr_at_reset = 0x0
+        self.pc_at_reset = 0x0
+        if hasattr(pspec, "msr_reset") and isinstance(pspec.msr_reset, int):
+            self.msr_at_reset = pspec.msr_reset
+        if hasattr(pspec, "pc_reset") and isinstance(pspec.pc_reset, int):
+            self.pc_at_reset = pspec.pc_reset
+        state_resets = [self.pc_at_reset,  # PC at reset
+                        self.msr_at_reset, # MSR at reset
+                        0x0,               # SVSTATE at reset
+                        0x0,               # DEC at reset
+                        0x0]               # TB at reset
 
         # register files (yes plural)
-        self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs)
+        self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs,
+                                    state_resets=state_resets)
 
         # set up input and output: unusual requirement to set data directly
         # (due to the way that the core is set up in a different domain,
@@ -194,6 +214,7 @@ class NonProductionCore(ControlBase):
                                             svp64_en=self.svp64_en,
                                             regreduce_en=self.regreduce_en)
             self.des[funame] = self.decoders[funame].do
+            print ("create decoder subset", funame, opkls, self.des[funame])
 
         # create per-Function Unit write-after-write hazard signals
         # yes, really, this should have been added in ReservationStations
@@ -205,6 +226,10 @@ class NonProductionCore(ControlBase):
         if "mmu0" in self.decoders:
             self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"]
 
+        # allow pausing of the DEC/TB FSM back in Issuer, by spotting
+        # if there is an MTSPR instruction
+        self.pause_dec_tb = Signal()
+
     # next 3 functions are Stage API Compliance
     def setup(self, m, i):
         pass
@@ -418,6 +443,20 @@ class NonProductionCore(ControlBase):
                                     # is a waw hazard. decoder has to still
                                     # be asserted in order to detect that, tho
                                     comb += fu.oper_i.eq_from(do)
+                                    if funame == 'mmu0':
+                                        # URRR this is truly dreadful.
+                                        # OP_FETCH_FAILED is a "fake" op.
+                                        # no instruction creates it.  OP_TRAP
+                                        # uses the *main* decoder: this is
+                                        # a *Satellite* decoder that reacts
+                                        # on *insn_in*... not fake ops. gaah.
+                                        main_op = self.ireg.e.do
+                                        with m.If(main_op.insn_type ==
+                                                  MicrOp.OP_FETCH_FAILED):
+                                            comb += fu.oper_i.insn_type.eq(
+                                                  MicrOp.OP_FETCH_FAILED)
+                                            comb += fu.oper_i.fn_unit.eq(
+                                                  Function.MMU)
                                     # issue when valid (and no write-hazard)
                                     comb += fu.issue_i.eq(~self.waw_hazard)
                                     # instruction ok, indicate ready
@@ -484,6 +523,14 @@ class NonProductionCore(ControlBase):
                     funame.lower().startswith('trap')):
                     with m.If(fu.busy_o):
                         comb += busy_o.eq(1)
+                # for SPR pipeline pause dec/tb FSM to avoid race condition
+                # TODO: really this should be much more sophisticated,
+                # spot MTSPR, spot that DEC/TB is what is to be updated.
+                # a job for PowerDecoder2, there
+                if funame.lower().startswith('spr'):
+                    with m.If(fu.busy_o #& fu.oper_i.insn_type == OP_MTSPR
+                        ):
+                        comb += self.pause_dec_tb.eq(1)
 
         # return both the function unit "enable" dict as well as the "busy".
         # the "busy-or-issued" can be passed in to the Read/Write port
@@ -1121,7 +1168,7 @@ class NonProductionCore(ControlBase):
 if __name__ == '__main__':
     pspec = TestMemPspec(ldst_ifacetype='testpi',
                          imem_ifacetype='',
-                         addr_wid=48,
+                         addr_wid=64,
                          allow_overlap=True,
                          mask_wid=8,
                          reg_wid=64)
index a30b4f4f4a64104775c7e8b55bf9435e4c18542f..8cc0b34b20f025aaccf4dcc0f00d116fc5b8b3dd 100644 (file)
@@ -22,9 +22,10 @@ class FetchInput:
     def __init__(self):
 
         self.pc = Signal(64)
+        self.msr = Signal(64)
 
     def eq(self, i):
-        return [self.pc.eq(i.pc),
+        return [self.pc.eq(i.pc), self.msr.eq(i.msr),
                ]
 
 
diff --git a/src/soc/simple/inorder.py b/src/soc/simple/inorder.py
new file mode 100644 (file)
index 0000000..03a101a
--- /dev/null
@@ -0,0 +1,532 @@
+"""simple core issuer
+
+not in any way intended for production use.  this runs a FSM that:
+
+* reads the Program Counter from StateRegs
+* reads an instruction from a fixed-size Test Memory
+* issues it to the Simple Core
+* waits for it to complete
+* increments the PC
+* does it all over again
+
+the purpose of this module is to verify the functional correctness
+of the Function Units in the absolute simplest and clearest possible
+way, and to at provide something that can be further incrementally
+improved.
+"""
+
+from nmigen import (Elaboratable, Module, Signal,
+                    Mux, Const, Repl, Cat)
+from nmigen.cli import rtlil
+from nmigen.cli import main
+import sys
+
+from nmutil.singlepipe import ControlBase
+from soc.simple.core_data import FetchOutput, FetchInput
+
+from openpower.consts import MSR
+from openpower.decoder.power_enums import MicrOp
+from openpower.state import CoreState
+from soc.regfile.regfiles import StateRegs
+from soc.config.test.test_loadstore import TestMemPspec
+from soc.experiment.icache import ICache
+
+from nmutil.util import rising_edge
+
+from soc.simple.issuer import TestIssuerBase
+
+def get_insn(f_instr_o, pc):
+    if f_instr_o.width == 32:
+        return f_instr_o
+    else:
+        # 64-bit: bit 2 of pc decides which word to select
+        return f_instr_o.word_select(pc[2], 32)
+
+
+# Fetch Finite State Machine.
+# WARNING: there are currently DriverConflicts but it's actually working.
+# TODO, here: everything that is global in nature, information from the
+# main TestIssuerInternal, needs to move to either ispec() or ospec().
+# not only that: TestIssuerInternal.imem can entirely move into here
+# because imem is only ever accessed inside the FetchFSM.
+class FetchFSM(ControlBase):
+    def __init__(self, allow_overlap, imem, core_rst,
+                 pdecode2, cur_state,
+                 dbg, core, svstate, nia):
+        self.allow_overlap = allow_overlap
+        self.imem = imem
+        self.core_rst = core_rst
+        self.pdecode2 = pdecode2
+        self.cur_state = cur_state
+        self.dbg = dbg
+        self.core = core
+        self.svstate = svstate
+        self.nia = nia
+
+        # set up pipeline ControlBase and allocate i/o specs
+        # (unusual: normally done by the Pipeline API)
+        super().__init__(stage=self)
+        self.p.i_data, self.n.o_data = self.new_specs(None)
+        self.i, self.o = self.p.i_data, self.n.o_data
+
+    # next 3 functions are Stage API Compliance
+    def setup(self, m, i):
+        pass
+
+    def ispec(self):
+        return FetchInput()
+
+    def ospec(self):
+        return FetchOutput()
+
+    def elaborate(self, platform):
+        """fetch FSM
+
+        this FSM performs fetch of raw instruction data, partial-decodes
+        it 32-bit at a time to detect SVP64 prefixes, and will optionally
+        read a 2nd 32-bit quantity if that occurs.
+        """
+        m = super().elaborate(platform)
+
+        dbg = self.dbg
+        core = self.core
+        pc = self.i.pc
+        msr = self.i.msr
+        svstate = self.svstate
+        nia = self.nia
+        fetch_pc_o_ready = self.p.o_ready
+        fetch_pc_i_valid = self.p.i_valid
+        fetch_insn_o_valid = self.n.o_valid
+        fetch_insn_i_ready = self.n.i_ready
+
+        comb = m.d.comb
+        sync = m.d.sync
+        pdecode2 = self.pdecode2
+        cur_state = self.cur_state
+        dec_opcode_o = pdecode2.dec.raw_opcode_in  # raw opcode
+
+        # also note instruction fetch failed
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+            flush_needed = True
+        else:
+            fetch_failed = Const(0, 1)
+            flush_needed = False
+
+        # set priv / virt mode on I-Cache, sigh
+        if isinstance(self.imem, ICache):
+            comb += self.imem.i_in.priv_mode.eq(~msr[MSR.PR])
+            comb += self.imem.i_in.virt_mode.eq(msr[MSR.DR])
+
+        with m.FSM(name='fetch_fsm'):
+
+            # allow fetch to not run at startup due to I-Cache reset not
+            # having time to settle.  power-on-reset holds dbg.core_stopped_i
+            with m.State("PRE_IDLE"):
+                with m.If(~dbg.core_stopped_i & ~dbg.core_stop_o):
+                    m.next = "IDLE"
+
+            # waiting (zzz)
+            with m.State("IDLE"):
+                with m.If(~dbg.stopping_o & ~fetch_failed):
+                    comb += fetch_pc_o_ready.eq(1)
+                with m.If(fetch_pc_i_valid & ~fetch_failed):
+                    # instruction allowed to go: start by reading the PC
+                    # capture the PC and also drop it into Insn Memory
+                    # we have joined a pair of combinatorial memory
+                    # lookups together.  this is Generally Bad.
+                    comb += self.imem.a_pc_i.eq(pc)
+                    comb += self.imem.a_i_valid.eq(1)
+                    comb += self.imem.f_i_valid.eq(1)
+                    sync += cur_state.pc.eq(pc)
+                    sync += cur_state.svstate.eq(svstate)  # and svstate
+                    sync += cur_state.msr.eq(msr)  # and msr
+
+                    m.next = "INSN_READ"  # move to "wait for bus" phase
+
+            # dummy pause to find out why simulation is not keeping up
+            with m.State("INSN_READ"):
+                if self.allow_overlap:
+                    stopping = dbg.stopping_o
+                else:
+                    stopping = Const(0)
+                with m.If(stopping):
+                    # stopping: jump back to idle
+                    m.next = "IDLE"
+                with m.Else():
+                    with m.If(self.imem.f_busy_o & ~fetch_failed):  # zzz...
+                        # busy but not fetch failed: stay in wait-read
+                        comb += self.imem.a_i_valid.eq(1)
+                        comb += self.imem.f_i_valid.eq(1)
+                    with m.Else():
+                        # not busy (or fetch failed!): instruction fetched
+                        # when fetch failed, the instruction gets ignored
+                        # by the decoder
+                        insn = get_insn(self.imem.f_instr_o, cur_state.pc)
+                        # not SVP64 - 32-bit only
+                        sync += nia.eq(cur_state.pc + 4)
+                        sync += dec_opcode_o.eq(insn)
+                        m.next = "INSN_READY"
+
+            with m.State("INSN_READY"):
+                # hand over the instruction, to be decoded
+                comb += fetch_insn_o_valid.eq(1)
+                with m.If(fetch_insn_i_ready):
+                    m.next = "IDLE"
+
+        # whatever was done above, over-ride it if core reset is held
+        with m.If(self.core_rst):
+            sync += nia.eq(0)
+
+        return m
+
+
+class TestIssuerInternalInOrder(TestIssuerBase):
+    """TestIssuer - reads instructions from TestMemory and issues them
+
+    efficiency and speed is not the main goal here: functional correctness
+    and code clarity is.  optimisations (which almost 100% interfere with
+    easy understanding) come later.
+    """
+
+    def issue_fsm(self, m, core, nia,
+                  dbg, core_rst,
+                  fetch_pc_o_ready, fetch_pc_i_valid,
+                  fetch_insn_o_valid, fetch_insn_i_ready,
+                  exec_insn_i_valid, exec_insn_o_ready,
+                  exec_pc_o_valid, exec_pc_i_ready):
+        """issue FSM
+
+        decode / issue FSM.  this interacts with the "fetch" FSM
+        through fetch_insn_ready/valid (incoming) and fetch_pc_ready/valid
+        (outgoing). also interacts with the "execute" FSM
+        through exec_insn_ready/valid (outgoing) and exec_pc_ready/valid
+        (incoming).
+        SVP64 RM prefixes have already been set up by the
+        "fetch" phase, so execute is fairly straightforward.
+        """
+
+        comb = m.d.comb
+        sync = m.d.sync
+        pdecode2 = self.pdecode2
+        cur_state = self.cur_state
+
+        # temporaries
+        dec_opcode_i = pdecode2.dec.raw_opcode_in  # raw opcode
+
+        # note if an exception happened.  in a pipelined or OoO design
+        # this needs to be accompanied by "shadowing" (or stalling)
+        exc_happened = self.core.o.exc_happened
+        # also note instruction fetch failed
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+            flush_needed = True
+            # set to fault in decoder
+            # update (highest priority) instruction fault
+            rising_fetch_failed = rising_edge(m, fetch_failed)
+            with m.If(rising_fetch_failed):
+                sync += pdecode2.instr_fault.eq(1)
+        else:
+            fetch_failed = Const(0, 1)
+            flush_needed = False
+
+        with m.FSM(name="issue_fsm"):
+
+            # sync with the "fetch" phase which is reading the instruction
+            # at this point, there is no instruction running, that
+            # could inadvertently update the PC.
+            with m.State("ISSUE_START"):
+                # reset instruction fault
+                sync += pdecode2.instr_fault.eq(0)
+                # wait on "core stop" release, before next fetch
+                # need to do this here, in case we are in a VL==0 loop
+                with m.If(~dbg.core_stop_o & ~core_rst):
+                    comb += fetch_pc_i_valid.eq(1)  # tell fetch to start
+                    with m.If(fetch_pc_o_ready):   # fetch acknowledged us
+                        m.next = "INSN_WAIT"
+                with m.Else():
+                    # tell core it's stopped, and acknowledge debug handshake
+                    comb += dbg.core_stopped_i.eq(1)
+
+            # wait for an instruction to arrive from Fetch
+            with m.State("INSN_WAIT"):
+                if self.allow_overlap:
+                    stopping = dbg.stopping_o
+                else:
+                    stopping = Const(0)
+                with m.If(stopping):
+                    # stopping: jump back to idle
+                    m.next = "ISSUE_START"
+                    if flush_needed:
+                        # request the icache to stop asserting "failed"
+                        comb += core.icache.flush_in.eq(1)
+                    # stop instruction fault
+                    sync += pdecode2.instr_fault.eq(0)
+                with m.Else():
+                    comb += fetch_insn_i_ready.eq(1)
+                    with m.If(fetch_insn_o_valid):
+                        # loop into ISSUE_START if it's a SVP64 instruction
+                        # and VL == 0.  this because VL==0 is a for-loop
+                        # from 0 to 0 i.e. always, always a NOP.
+                        m.next = "DECODE_SV"  # skip predication
+
+            # after src/dst step have been updated, we are ready
+            # to decode the instruction
+            with m.State("DECODE_SV"):
+                # decode the instruction
+                with m.If(~fetch_failed):
+                    sync += pdecode2.instr_fault.eq(0)
+                sync += core.i.e.eq(pdecode2.e)
+                sync += core.i.state.eq(cur_state)
+                sync += core.i.raw_insn_i.eq(dec_opcode_i)
+                sync += core.i.bigendian_i.eq(self.core_bigendian_i)
+                # after decoding, reset any previous exception condition,
+                # allowing it to be set again during the next execution
+                sync += pdecode2.ldst_exc.eq(0)
+
+                m.next = "INSN_EXECUTE"  # move to "execute"
+
+            # handshake with execution FSM, move to "wait" once acknowledged
+            with m.State("INSN_EXECUTE"):
+                comb += exec_insn_i_valid.eq(1)  # trigger execute
+                with m.If(exec_insn_o_ready):   # execute acknowledged us
+                    m.next = "EXECUTE_WAIT"
+
+            with m.State("EXECUTE_WAIT"):
+                # wait on "core stop" release, at instruction end
+                # need to do this here, in case we are in a VL>1 loop
+                with m.If(~dbg.core_stop_o & ~core_rst):
+                    comb += exec_pc_i_ready.eq(1)
+                    # see https://bugs.libre-soc.org/show_bug.cgi?id=636
+                    # the exception info needs to be blatted into
+                    # pdecode.ldst_exc, and the instruction "re-run".
+                    # when ldst_exc.happened is set, the PowerDecoder2
+                    # reacts very differently: it re-writes the instruction
+                    # with a "trap" (calls PowerDecoder2.trap()) which
+                    # will *overwrite* whatever was requested and jump the
+                    # PC to the exception address, as well as alter MSR.
+                    # nothing else needs to be done other than to note
+                    # the change of PC and MSR (and, later, SVSTATE)
+                    with m.If(exc_happened):
+                        mmu = core.fus.get_exc("mmu0")
+                        ldst = core.fus.get_exc("ldst0")
+                        if mmu is not None:
+                            with m.If(fetch_failed):
+                                # instruction fetch: exception is from MMU
+                                # reset instr_fault (highest priority)
+                                sync += pdecode2.ldst_exc.eq(mmu)
+                                sync += pdecode2.instr_fault.eq(0)
+                                if flush_needed:
+                                    # request icache to stop asserting "failed"
+                                    comb += core.icache.flush_in.eq(1)
+                        with m.If(~fetch_failed):
+                            # otherwise assume it was a LDST exception
+                            sync += pdecode2.ldst_exc.eq(ldst)
+
+                    with m.If(exec_pc_o_valid):
+
+                        # return directly to Decode if Execute generated an
+                        # exception.
+                        with m.If(pdecode2.ldst_exc.happened):
+                            m.next = "DECODE_SV"
+
+                        # if MSR, PC or SVSTATE were changed by the previous
+                        # instruction, go directly back to Fetch, without
+                        # updating either MSR PC or SVSTATE
+                        with m.Elif(self.msr_changed | self.pc_changed |
+                                    self.sv_changed):
+                            m.next = "ISSUE_START"
+
+                        with m.Else():
+                            # before going back to fetch, update the PC state
+                            # register with the NIA.
+                            # ok here we are not reading the branch unit.
+                            # TODO: this just blithely overwrites whatever
+                            #       pipeline updated the PC
+                            comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+                            comb += self.state_w_pc.i_data.eq(nia)
+                            m.next = "ISSUE_START"
+
+                with m.Else():
+                    comb += dbg.core_stopped_i.eq(1)
+                    if flush_needed:
+                        # request the icache to stop asserting "failed"
+                        comb += core.icache.flush_in.eq(1)
+                    # stop instruction fault
+                    sync += pdecode2.instr_fault.eq(0)
+                    if flush_needed:
+                        # request the icache to stop asserting "failed"
+                        comb += core.icache.flush_in.eq(1)
+                    # stop instruction fault
+                    sync += pdecode2.instr_fault.eq(0)
+
+    def execute_fsm(self, m, core,
+                    exec_insn_i_valid, exec_insn_o_ready,
+                    exec_pc_o_valid, exec_pc_i_ready):
+        """execute FSM
+
+        execute FSM. this interacts with the "issue" FSM
+        through exec_insn_ready/valid (incoming) and exec_pc_ready/valid
+        (outgoing). SVP64 RM prefixes have already been set up by the
+        "issue" phase, so execute is fairly straightforward.
+        """
+
+        comb = m.d.comb
+        sync = m.d.sync
+        pdecode2 = self.pdecode2
+
+        # temporaries
+        core_busy_o = core.n.o_data.busy_o  # core is busy
+        core_ivalid_i = core.p.i_valid              # instruction is valid
+
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+        else:
+            fetch_failed = Const(0, 1)
+
+        with m.FSM(name="exec_fsm"):
+
+            # waiting for instruction bus (stays there until not busy)
+            with m.State("INSN_START"):
+                comb += exec_insn_o_ready.eq(1)
+                with m.If(exec_insn_i_valid):
+                    comb += core_ivalid_i.eq(1)  # instruction is valid/issued
+                    sync += self.sv_changed.eq(0)
+                    sync += self.pc_changed.eq(0)
+                    sync += self.msr_changed.eq(0)
+                    with m.If(core.p.o_ready):  # only move if accepted
+                        m.next = "INSN_ACTIVE"  # move to "wait completion"
+
+            # instruction started: must wait till it finishes
+            with m.State("INSN_ACTIVE"):
+                # note changes to MSR, PC and SVSTATE
+                # XXX oops, really must monitor *all* State Regfile write
+                # ports looking for changes!
+                with m.If(self.state_nia.wen & (1 << StateRegs.SVSTATE)):
+                    sync += self.sv_changed.eq(1)
+                with m.If(self.state_nia.wen & (1 << StateRegs.MSR)):
+                    sync += self.msr_changed.eq(1)
+                with m.If(self.state_nia.wen & (1 << StateRegs.PC)):
+                    sync += self.pc_changed.eq(1)
+                with m.If(~core_busy_o):  # instruction done!
+                    comb += exec_pc_o_valid.eq(1)
+                    with m.If(exec_pc_i_ready):
+                        # when finished, indicate "done".
+                        # however, if there was an exception, the instruction
+                        # is *not* yet done.  this is an implementation
+                        # detail: we choose to implement exceptions by
+                        # taking the exception information from the LDST
+                        # unit, putting that *back* into the PowerDecoder2,
+                        # and *re-running the entire instruction*.
+                        # if we erroneously indicate "done" here, it is as if
+                        # there were *TWO* instructions:
+                        # 1) the failed LDST 2) a TRAP.
+                        with m.If(~pdecode2.ldst_exc.happened &
+                                  ~fetch_failed):
+                            comb += self.insn_done.eq(1)
+                        m.next = "INSN_START"  # back to fetch
+
+    def elaborate(self, platform):
+        m = super().elaborate(platform)
+        # convenience
+        comb, sync = m.d.comb, m.d.sync
+        cur_state = self.cur_state
+        pdecode2 = self.pdecode2
+        dbg = self.dbg
+        core = self.core
+
+        # set up peripherals and core
+        core_rst = self.core_rst
+
+        # indicate to outside world if any FU is still executing
+        comb += self.any_busy.eq(core.n.o_data.any_busy_o)  # any FU executing
+
+        # address of the next instruction, in the absence of a branch
+        # depends on the instruction size
+        nia = Signal(64)
+
+        # connect up debug signals
+        with m.If(core.o.core_terminate_o):
+            comb += dbg.terminate_i.eq(1)
+
+        # there are *THREE^WFOUR-if-SVP64-enabled* FSMs, fetch (32/64-bit)
+        # issue, decode/execute, now joined by "Predicate fetch/calculate".
+        # these are the handshake signals between each
+
+        # fetch FSM can run as soon as the PC is valid
+        fetch_pc_i_valid = Signal()  # Execute tells Fetch "start next read"
+        fetch_pc_o_ready = Signal()  # Fetch Tells SVSTATE "proceed"
+
+        # fetch FSM hands over the instruction to be decoded / issued
+        fetch_insn_o_valid = Signal()
+        fetch_insn_i_ready = Signal()
+
+        # issue FSM delivers the instruction to the be executed
+        exec_insn_i_valid = Signal()
+        exec_insn_o_ready = Signal()
+
+        # execute FSM, hands over the PC/SVSTATE back to the issue FSM
+        exec_pc_o_valid = Signal()
+        exec_pc_i_ready = Signal()
+
+        # the FSMs here are perhaps unusual in that they detect conditions
+        # then "hold" information, combinatorially, for the core
+        # (as opposed to using sync - which would be on a clock's delay)
+        # this includes the actual opcode, valid flags and so on.
+
+        # Fetch, then predicate fetch, then Issue, then Execute.
+        # Issue is where the VL for-loop # lives.  the ready/valid
+        # signalling is used to communicate between the four.
+
+        # set up Fetch FSM
+        fetch = FetchFSM(self.allow_overlap,
+                         self.imem, core_rst, pdecode2, cur_state,
+                         dbg, core,
+                         dbg.state.svstate, # combinatorially same
+                         nia)
+        m.submodules.fetch = fetch
+        # connect up in/out data to existing Signals
+        comb += fetch.p.i_data.pc.eq(dbg.state.pc)   # combinatorially same
+        comb += fetch.p.i_data.msr.eq(dbg.state.msr) # combinatorially same
+        # and the ready/valid signalling
+        comb += fetch_pc_o_ready.eq(fetch.p.o_ready)
+        comb += fetch.p.i_valid.eq(fetch_pc_i_valid)
+        comb += fetch_insn_o_valid.eq(fetch.n.o_valid)
+        comb += fetch.n.i_ready.eq(fetch_insn_i_ready)
+
+        self.issue_fsm(m, core, nia,
+                       dbg, core_rst,
+                       fetch_pc_o_ready, fetch_pc_i_valid,
+                       fetch_insn_o_valid, fetch_insn_i_ready,
+                       exec_insn_i_valid, exec_insn_o_ready,
+                       exec_pc_o_valid, exec_pc_i_ready)
+
+        self.execute_fsm(m, core,
+                         exec_insn_i_valid, exec_insn_o_ready,
+                         exec_pc_o_valid, exec_pc_i_ready)
+
+        return m
+
+
+# XXX TODO: update this
+
+if __name__ == '__main__':
+    units = {'alu': 1, 'cr': 1, 'branch': 1, 'trap': 1, 'logical': 1,
+             'spr': 1,
+             'div': 1,
+             'mul': 1,
+             'shiftrot': 1
+             }
+    pspec = TestMemPspec(ldst_ifacetype='bare_wb',
+                         imem_ifacetype='bare_wb',
+                         addr_wid=64,
+                         mask_wid=8,
+                         reg_wid=64,
+                         units=units)
+    dut = TestIssuer(pspec)
+    vl = main(dut, ports=dut.ports(), name="test_issuer")
+
+    if len(sys.argv) == 1:
+        vl = rtlil.convert(dut, ports=dut.external_ports(), name="test_issuer")
+        with open("test_issuer.il", "w") as f:
+            f.write(vl)
index 8b04ee0b0ccc7011577fe56b45b5de8c7954242d..15bd1760a5ab93f233d8cb7cdff813d7b0833096 100644 (file)
@@ -31,10 +31,10 @@ from openpower.decoder.power_decoder2 import PowerDecode2, SVP64PrefixDecoder
 from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
 from openpower.decoder.decode2execute1 import Data
 from openpower.decoder.power_enums import (MicrOp, SVP64PredInt, SVP64PredCR,
-                                     SVP64PredMode)
+                                           SVP64PredMode)
 from openpower.state import CoreState
-from openpower.consts import (CR, SVP64CROffs)
-from soc.experiment.testmem import TestMemory # test only for instructions
+from openpower.consts import (CR, SVP64CROffs, MSR)
+from soc.experiment.testmem import TestMemory  # test only for instructions
 from soc.regfile.regfiles import StateRegs, FastRegs
 from soc.simple.core import NonProductionCore
 from soc.config.test.test_loadstore import TestMemPspec
@@ -48,10 +48,11 @@ from soc.bus.SPBlock512W64B8W import SPBlock512W64B8W
 from soc.clock.select import ClockSelect
 from soc.clock.dummypll import DummyPLL
 from openpower.sv.svstate import SVSTATERec
-
+from soc.experiment.icache import ICache
 
 from nmutil.util import rising_edge
 
+
 def get_insn(f_instr_o, pc):
     if f_instr_o.width == 32:
         return f_instr_o
@@ -60,11 +61,12 @@ def get_insn(f_instr_o, pc):
         return f_instr_o.word_select(pc[2], 32)
 
 # gets state input or reads from state regfile
-def state_get(m, core_rst, state_i, name, regfile, regnum):
+
+
+def state_get(m, res, core_rst, state_i, name, regfile, regnum):
     comb = m.d.comb
     sync = m.d.sync
-    # read the PC
-    res = Signal(64, reset_less=True, name=name)
+    # read the {insert state variable here}
     res_ok_delay = Signal(name="%s_ok_delay" % name)
     with m.If(~core_rst):
         sync += res_ok_delay.eq(~state_i.ok)
@@ -72,12 +74,11 @@ def state_get(m, core_rst, state_i, name, regfile, regnum):
             # incoming override (start from pc_i)
             comb += res.eq(state_i.data)
         with m.Else():
-            # otherwise read StateRegs regfile for PC...
-            comb += regfile.ren.eq(1<<regnum)
+            # otherwise read StateRegs regfile for {insert state here}...
+            comb += regfile.ren.eq(1 << regnum)
         # ... but on a 1-clock delay
         with m.If(res_ok_delay):
             comb += res.eq(regfile.o_data)
-    return res
 
 
 def get_predint(m, mask, name):
@@ -155,227 +156,67 @@ def get_predcr(m, mask, name):
     return idx, invert
 
 
-# Fetch Finite State Machine.
-# WARNING: there are currently DriverConflicts but it's actually working.
-# TODO, here: everything that is global in nature, information from the
-# main TestIssuerInternal, needs to move to either ispec() or ospec().
-# not only that: TestIssuerInternal.imem can entirely move into here
-# because imem is only ever accessed inside the FetchFSM.
-class FetchFSM(ControlBase):
-    def __init__(self, allow_overlap, svp64_en, imem, core_rst,
-                       pdecode2, cur_state,
-                       dbg, core, svstate, nia, is_svp64_mode):
-        self.allow_overlap = allow_overlap
-        self.svp64_en = svp64_en
-        self.imem = imem
-        self.core_rst = core_rst
-        self.pdecode2 = pdecode2
-        self.cur_state = cur_state
-        self.dbg = dbg
-        self.core = core
-        self.svstate = svstate
-        self.nia = nia
-        self.is_svp64_mode = is_svp64_mode
-
-        # set up pipeline ControlBase and allocate i/o specs
-        # (unusual: normally done by the Pipeline API)
-        super().__init__(stage=self)
-        self.p.i_data, self.n.o_data = self.new_specs(None)
-        self.i, self.o = self.p.i_data, self.n.o_data
-
-    # next 3 functions are Stage API Compliance
-    def setup(self, m, i):
-        pass
-
-    def ispec(self):
-        return FetchInput()
-
-    def ospec(self):
-        return FetchOutput()
-
-    def elaborate(self, platform):
-        """fetch FSM
-
-        this FSM performs fetch of raw instruction data, partial-decodes
-        it 32-bit at a time to detect SVP64 prefixes, and will optionally
-        read a 2nd 32-bit quantity if that occurs.
-        """
-        m = super().elaborate(platform)
-
-        dbg = self.dbg
-        core = self.core,
-        pc = self.i.pc
-        svstate = self.svstate
-        nia = self.nia
-        is_svp64_mode = self.is_svp64_mode
-        fetch_pc_o_ready = self.p.o_ready
-        fetch_pc_i_valid = self.p.i_valid
-        fetch_insn_o_valid = self.n.o_valid
-        fetch_insn_i_ready = self.n.i_ready
-
-        comb = m.d.comb
-        sync = m.d.sync
-        pdecode2 = self.pdecode2
-        cur_state = self.cur_state
-        dec_opcode_o = pdecode2.dec.raw_opcode_in # raw opcode
-
-        msr_read = Signal(reset=1)
-
-        # don't read msr every cycle
-        staterf = self.core.regs.rf['state']
-        state_r_msr = staterf.r_ports['msr'] # MSR rd
+class TestIssuerBase(Elaboratable):
+    """TestIssuerBase - common base class for Issuers
 
-        comb += state_r_msr.ren.eq(0)
-
-        with m.FSM(name='fetch_fsm'):
-
-            # waiting (zzz)
-            with m.State("IDLE"):
-                with m.If(~dbg.stopping_o):
-                    comb += fetch_pc_o_ready.eq(1)
-                with m.If(fetch_pc_i_valid):
-                    # instruction allowed to go: start by reading the PC
-                    # capture the PC and also drop it into Insn Memory
-                    # we have joined a pair of combinatorial memory
-                    # lookups together.  this is Generally Bad.
-                    comb += self.imem.a_pc_i.eq(pc)
-                    comb += self.imem.a_i_valid.eq(1)
-                    comb += self.imem.f_i_valid.eq(1)
-                    sync += cur_state.pc.eq(pc)
-                    sync += cur_state.svstate.eq(svstate) # and svstate
-
-                    # initiate read of MSR. arrives one clock later
-                    comb += state_r_msr.ren.eq(1 << StateRegs.MSR)
-                    sync += msr_read.eq(0)
-
-                    m.next = "INSN_READ"  # move to "wait for bus" phase
-
-            # dummy pause to find out why simulation is not keeping up
-            with m.State("INSN_READ"):
-                if self.allow_overlap:
-                    stopping = dbg.stopping_o
-                else:
-                    stopping = Const(0)
-                with m.If(stopping):
-                    # stopping: jump back to idle
-                    m.next = "IDLE"
-                with m.Else():
-                    # one cycle later, msr/sv read arrives.  valid only once.
-                    with m.If(~msr_read):
-                        sync += msr_read.eq(1) # yeah don't read it again
-                        sync += cur_state.msr.eq(state_r_msr.o_data)
-                    with m.If(self.imem.f_busy_o): # zzz...
-                        # busy: stay in wait-read
-                        comb += self.imem.a_i_valid.eq(1)
-                        comb += self.imem.f_i_valid.eq(1)
-                    with m.Else():
-                        # not busy: instruction fetched
-                        insn = get_insn(self.imem.f_instr_o, cur_state.pc)
-                        if self.svp64_en:
-                            svp64 = self.svp64
-                            # decode the SVP64 prefix, if any
-                            comb += svp64.raw_opcode_in.eq(insn)
-                            comb += svp64.bigendian.eq(self.core_bigendian_i)
-                            # pass the decoded prefix (if any) to PowerDecoder2
-                            sync += pdecode2.sv_rm.eq(svp64.svp64_rm)
-                            sync += pdecode2.is_svp64_mode.eq(is_svp64_mode)
-                            # remember whether this is a prefixed instruction,
-                            # so the FSM can readily loop when VL==0
-                            sync += is_svp64_mode.eq(svp64.is_svp64_mode)
-                            # calculate the address of the following instruction
-                            insn_size = Mux(svp64.is_svp64_mode, 8, 4)
-                            sync += nia.eq(cur_state.pc + insn_size)
-                            with m.If(~svp64.is_svp64_mode):
-                                # with no prefix, store the instruction
-                                # and hand it directly to the next FSM
-                                sync += dec_opcode_o.eq(insn)
-                                m.next = "INSN_READY"
-                            with m.Else():
-                                # fetch the rest of the instruction from memory
-                                comb += self.imem.a_pc_i.eq(cur_state.pc + 4)
-                                comb += self.imem.a_i_valid.eq(1)
-                                comb += self.imem.f_i_valid.eq(1)
-                                m.next = "INSN_READ2"
-                        else:
-                            # not SVP64 - 32-bit only
-                            sync += nia.eq(cur_state.pc + 4)
-                            sync += dec_opcode_o.eq(insn)
-                            m.next = "INSN_READY"
-
-            with m.State("INSN_READ2"):
-                with m.If(self.imem.f_busy_o):  # zzz...
-                    # busy: stay in wait-read
-                    comb += self.imem.a_i_valid.eq(1)
-                    comb += self.imem.f_i_valid.eq(1)
-                with m.Else():
-                    # not busy: instruction fetched
-                    insn = get_insn(self.imem.f_instr_o, cur_state.pc+4)
-                    sync += dec_opcode_o.eq(insn)
-                    m.next = "INSN_READY"
-                    # TODO: probably can start looking at pdecode2.rm_dec
-                    # here or maybe even in INSN_READ state, if svp64_mode
-                    # detected, in order to trigger - and wait for - the
-                    # predicate reading.
-                    if self.svp64_en:
-                        pmode = pdecode2.rm_dec.predmode
-                    """
-                    if pmode != SVP64PredMode.ALWAYS.value:
-                        fire predicate loading FSM and wait before
-                        moving to INSN_READY
-                    else:
-                        sync += self.srcmask.eq(-1) # set to all 1s
-                        sync += self.dstmask.eq(-1) # set to all 1s
-                        m.next = "INSN_READY"
-                    """
-
-            with m.State("INSN_READY"):
-                # hand over the instruction, to be decoded
-                comb += fetch_insn_o_valid.eq(1)
-                with m.If(fetch_insn_i_ready):
-                    m.next = "IDLE"
+    takes care of power-on reset, peripherals, debug, DEC/TB,
+    and gets PC/MSR/SVSTATE from the State Regfile etc.
+    """
 
-        # whatever was done above, over-ride it if core reset is held
-        with m.If(self.core_rst):
-            sync += nia.eq(0)
+    def __init__(self, pspec):
 
-        return m
+        # test if microwatt compatibility is to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+        self.alt_reset = Signal(reset_less=True) # not connected yet (microwatt)
+        # test if fabric compatibility is to be enabled
+        self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+                                 (pspec.fabric_compat == True))
 
+        if self.microwatt_compat or self.fabric_compat:
 
-class TestIssuerInternal(Elaboratable):
-    """TestIssuer - reads instructions from TestMemory and issues them
+            if hasattr(pspec, "microwatt_old"):
+                self.microwatt_old = pspec.microwatt_old
+            else:
+                self.microwatt_old = True # PLEASE DO NOT ALTER THIS
 
-    efficiency and speed is not the main goal here: functional correctness
-    and code clarity is.  optimisations (which almost 100% interfere with
-    easy understanding) come later.
-    """
-    def __init__(self, pspec):
+            if hasattr(pspec, "microwatt_debug"):
+                self.microwatt_debug = pspec.microwatt_debug
+            else:
+                self.microwatt_debug = True # set to False when using an FPGA
 
         # test is SVP64 is to be enabled
         self.svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)
 
         # and if regfiles are reduced
         self.regreduce_en = (hasattr(pspec, "regreduce") and
-                                            (pspec.regreduce == True))
+                             (pspec.regreduce == True))
 
         # and if overlap requested
         self.allow_overlap = (hasattr(pspec, "allow_overlap") and
-                                            (pspec.allow_overlap == True))
+                              (pspec.allow_overlap == True))
+
+        # and get the core domain
+        self.core_domain = "coresync"
+        if (hasattr(pspec, "core_domain") and
+            isinstance(pspec.core_domain, str)):
+            self.core_domain = pspec.core_domain
 
         # JTAG interface.  add this right at the start because if it's
         # added it *modifies* the pspec, by adding enable/disable signals
         # for parts of the rest of the core
         self.jtag_en = hasattr(pspec, "debug") and pspec.debug == 'jtag'
-        self.dbg_domain = "sync" # sigh "dbgsunc" too problematic
-        #self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock
+        #self.dbg_domain = "sync"  # sigh "dbgsunc" too problematic
+        self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock
         if self.jtag_en:
-            # XXX MUST keep this up-to-date with litex, and
+            # XXX MUST keep this up-to-date with fabric, and
             # soc-cocotb-sim, and err.. all needs sorting out, argh
             subset = ['uart',
                       'mtwi',
                       'eint', 'gpio', 'mspi0',
                       # 'mspi1', - disabled for now
                       # 'pwm', 'sd0', - disabled for now
-                       'sdr']
+                      'sdr']
             self.jtag = JTAG(get_pinspecs(subset=subset),
                              domain=self.dbg_domain)
             # add signals to pspec to enable/disable icache and dcache
@@ -396,7 +237,7 @@ class TestIssuerInternal(Elaboratable):
             self.sram4k = []
             for i in range(4):
                 self.sram4k.append(SPBlock512W64B8W(name="sram4k_%d" % i,
-                                                    #features={'err'}
+                                                    # features={'err'}
                                                     ))
 
         # add interrupt controller?
@@ -405,6 +246,8 @@ class TestIssuerInternal(Elaboratable):
             self.xics_icp = XICS_ICP()
             self.xics_ics = XICS_ICS()
             self.int_level_i = self.xics_ics.int_level_i
+        else:
+            self.ext_irq = Signal()
 
         # add GPIO peripheral?
         self.gpio = hasattr(pspec, "gpio") and pspec.gpio == True
@@ -414,11 +257,11 @@ class TestIssuerInternal(Elaboratable):
 
         # main instruction core.  suitable for prototyping / demo only
         self.core = core = NonProductionCore(pspec)
-        self.core_rst = ResetSignal("coresync")
+        self.core_rst = ResetSignal(self.core_domain)
 
         # instruction decoder.  goes into Trap Record
         #pdecode = create_pdecode()
-        self.cur_state = CoreState("cur") # current state (MSR/PC/SVSTATE)
+        self.cur_state = CoreState("cur")  # current state (MSR/PC/SVSTATE)
         self.pdecode2 = PowerDecode2(None, state=self.cur_state,
                                      opkls=IssuerDecode2ToOperand,
                                      svp64_en=self.svp64_en,
@@ -426,45 +269,61 @@ class TestIssuerInternal(Elaboratable):
         pdecode = self.pdecode2.dec
 
         if self.svp64_en:
-            self.svp64 = SVP64PrefixDecoder() # for decoding SVP64 prefix
+            self.svp64 = SVP64PrefixDecoder()  # for decoding SVP64 prefix
+
+        self.update_svstate = Signal()  # set this if updating svstate
+        self.new_svstate = new_svstate = SVSTATERec("new_svstate")
 
         # Test Instruction memory
+        if hasattr(core, "icache"):
+            # XXX BLECH! use pspec to transfer the I-Cache to ConfigFetchUnit
+            # truly dreadful.  needs a huge reorg.
+            pspec.icache = core.icache
         self.imem = ConfigFetchUnit(pspec).fu
 
         # DMI interface
         self.dbg = CoreDebug()
+        self.dbg_rst_i = Signal(reset_less=True)
 
         # instruction go/monitor
         self.pc_o = Signal(64, reset_less=True)
-        self.pc_i = Data(64, "pc_i") # set "ok" to indicate "please change me"
-        self.svstate_i = Data(64, "svstate_i") # ditto
-        self.core_bigendian_i = Signal() # TODO: set based on MSR.LE
+        self.pc_i = Data(64, "pc_i")  # set "ok" to indicate "please change me"
+        self.msr_i = Data(64, "msr_i") # set "ok" to indicate "please change me"
+        self.svstate_i = Data(64, "svstate_i")  # ditto
+        self.core_bigendian_i = Signal()  # TODO: set based on MSR.LE
         self.busy_o = Signal(reset_less=True)
         self.memerr_o = Signal(reset_less=True)
 
         # STATE regfile read /write ports for PC, MSR, SVSTATE
         staterf = self.core.regs.rf['state']
-        self.state_r_pc = staterf.r_ports['cia'] # PC rd
-        self.state_w_pc = staterf.w_ports['d_wr1'] # PC wr
-        self.state_r_sv = staterf.r_ports['sv'] # SVSTATE rd
-        self.state_w_sv = staterf.w_ports['sv'] # SVSTATE wr
+        self.state_r_msr = staterf.r_ports['msr']  # MSR rd
+        self.state_r_pc = staterf.r_ports['cia']  # PC rd
+        self.state_r_sv = staterf.r_ports['sv']  # SVSTATE rd
+
+        self.state_w_msr = staterf.w_ports['d_wr2']  # MSR wr
+        self.state_w_pc = staterf.w_ports['d_wr1']  # PC wr
+        self.state_w_sv = staterf.w_ports['sv']  # SVSTATE wr
 
         # DMI interface access
         intrf = self.core.regs.rf['int']
+        fastrf = self.core.regs.rf['fast']
         crrf = self.core.regs.rf['cr']
         xerrf = self.core.regs.rf['xer']
-        self.int_r = intrf.r_ports['dmi'] # INT read
-        self.cr_r = crrf.r_ports['full_cr_dbg'] # CR read
-        self.xer_r = xerrf.r_ports['full_xer'] # XER read
+        self.int_r = intrf.r_ports['dmi']  # INT DMI read
+        self.cr_r = crrf.r_ports['full_cr_dbg']  # CR DMI read
+        self.xer_r = xerrf.r_ports['full_xer']  # XER DMI read
+        self.fast_r = fastrf.r_ports['dmi']  # FAST DMI read
 
         if self.svp64_en:
             # for predication
-            self.int_pred = intrf.r_ports['pred'] # INT predicate read
-            self.cr_pred = crrf.r_ports['cr_pred'] # CR predicate read
+            self.int_pred = intrf.r_ports['pred']  # INT predicate read
+            self.cr_pred = crrf.r_ports['cr_pred']  # CR predicate read
 
         # hack method of keeping an eye on whether branch/trap set the PC
         self.state_nia = self.core.regs.rf['state'].w_ports['nia']
         self.state_nia.wen.name = 'state_nia_wen'
+        # and whether SPR pipeline sets DEC or TB (fu/spr/main_stage.py)
+        self.state_spr = self.core.regs.rf['state'].w_ports['state1']
 
         # pulse to synchronize the simulator at instruction end
         self.insn_done = Signal()
@@ -472,10 +331,661 @@ class TestIssuerInternal(Elaboratable):
         # indicate any instruction still outstanding, in execution
         self.any_busy = Signal()
 
-        if self.svp64_en:
-            # store copies of predicate masks
-            self.srcmask = Signal(64)
-            self.dstmask = Signal(64)
+        if self.svp64_en:
+            # store copies of predicate masks
+            self.srcmask = Signal(64)
+            self.dstmask = Signal(64)
+
+        # sigh, the wishbone addresses are not wishbone-compliant
+        # in old versions of microwatt, tplaten_3d_game is a new one
+        if self.microwatt_compat or self.fabric_compat:
+            self.ibus_adr = Signal(32, name='wishbone_insn_out.adr')
+            self.dbus_adr = Signal(32, name='wishbone_data_out.adr')
+
+        # add an output of the PC and instruction, and whether it was requested
+        # this is for verilator debug purposes
+        if self.microwatt_compat or self.fabric_compat:
+            self.nia = Signal(64)
+            self.msr_o = Signal(64)
+            self.nia_req = Signal(1)
+            self.insn = Signal(32)
+            self.ldst_req = Signal(1)
+            self.ldst_addr = Signal(1)
+
+        # for pausing dec/tb during an SPR pipeline event, this
+        # ensures that an SPR write (mtspr) to TB or DEC does not
+        # get overwritten by the DEC/TB FSM
+        self.pause_dec_tb = Signal()
+
+    def setup_peripherals(self, m):
+        comb, sync = m.d.comb, m.d.sync
+
+        # okaaaay so the debug module must be in coresync clock domain
+        # but NOT its reset signal. to cope with this, set every single
+        # submodule explicitly in coresync domain, debug and JTAG
+        # in their own one but using *external* reset.
+        csd = DomainRenamer(self.core_domain)
+        dbd = DomainRenamer(self.dbg_domain)
+
+        if self.microwatt_compat or self.fabric_compat:
+            m.submodules.core = core = self.core
+        else:
+            m.submodules.core = core = csd(self.core)
+
+        # this _so_ needs sorting out.  ICache is added down inside
+        # LoadStore1 and is already a submodule of LoadStore1
+        if not isinstance(self.imem, ICache):
+            m.submodules.imem = imem = csd(self.imem)
+
+        # set up JTAG Debug Module (in correct domain)
+        m.submodules.dbg = dbg = dbd(self.dbg)
+        if self.jtag_en:
+            m.submodules.jtag = jtag = dbd(self.jtag)
+            # TODO: UART2GDB mux, here, from external pin
+            # see https://bugs.libre-soc.org/show_bug.cgi?id=499
+            sync += dbg.dmi.connect_to(jtag.dmi)
+
+        # fixup the clocks in microwatt-compat mode (but leave resets alone
+        # so that microwatt soc.vhdl can pull a reset on the core or DMI
+        # can do it, just like in TestIssuer)
+        if self.microwatt_compat or self.fabric_compat:
+            intclk = ClockSignal(self.core_domain)
+            dbgclk = ClockSignal(self.dbg_domain)
+            if self.core_domain != 'sync':
+                comb += intclk.eq(ClockSignal())
+            if self.dbg_domain != 'sync':
+                comb += dbgclk.eq(ClockSignal())
+
+        # if using old version of microwatt
+        # drop the first 3 bits of the incoming wishbone addresses
+        if self.microwatt_compat or self.fabric_compat:
+            ibus = self.imem.ibus
+            dbus = self.core.l0.cmpi.wb_bus()
+            if self.microwatt_old:
+                comb += self.ibus_adr.eq(Cat(Const(0, 3), ibus.adr))
+                comb += self.dbus_adr.eq(Cat(Const(0, 3), dbus.adr))
+            else:
+                comb += self.ibus_adr.eq(ibus.adr)
+                comb += self.dbus_adr.eq(dbus.adr)
+            if self.microwatt_debug:
+                # microwatt verilator debug purposes
+                pi = self.core.l0.cmpi.pi.pi
+                comb += self.ldst_req.eq(pi.addr_ok_o)
+                comb += self.ldst_addr.eq(pi.addr)
+
+        cur_state = self.cur_state
+
+        # 4x 4k SRAM blocks.  these simply "exist", they get routed in fabric
+        if self.sram4x4k:
+            for i, sram in enumerate(self.sram4k):
+                m.submodules["sram4k_%d" % i] = csd(sram)
+                comb += sram.enable.eq(self.wb_sram_en)
+
+        # XICS interrupt handler
+        if self.xics:
+            m.submodules.xics_icp = icp = csd(self.xics_icp)
+            m.submodules.xics_ics = ics = csd(self.xics_ics)
+            comb += icp.ics_i.eq(ics.icp_o)           # connect ICS to ICP
+            sync += cur_state.eint.eq(icp.core_irq_o)  # connect ICP to core
+        else:
+            sync += cur_state.eint.eq(self.ext_irq)  # connect externally
+
+        # GPIO test peripheral
+        if self.gpio:
+            m.submodules.simple_gpio = simple_gpio = csd(self.simple_gpio)
+
+        # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl)
+        # XXX causes fabric ECP5 test to get wrong idea about input and output
+        # (but works with verilator sim *sigh*)
+        # if self.gpio and self.xics:
+        #   comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0])
+
+        # instruction decoder
+        pdecode = create_pdecode()
+        m.submodules.dec2 = pdecode2 = csd(self.pdecode2)
+        if self.svp64_en:
+            m.submodules.svp64 = svp64 = csd(self.svp64)
+
+        # clock delay power-on reset
+        cd_por = ClockDomain(reset_less=True)
+        cd_sync = ClockDomain()
+        m.domains += cd_por, cd_sync
+        core_sync = ClockDomain(self.core_domain)
+        if self.core_domain != "sync":
+            m.domains += core_sync
+        if self.dbg_domain != "sync":
+            dbg_sync = ClockDomain(self.dbg_domain)
+            m.domains += dbg_sync
+
+        # create a delay, but remember it is in the power-on-reset clock domain!
+        ti_rst = Signal(reset_less=True)
+        delay = Signal(range(4), reset=3)
+        stop_delay = Signal(range(16), reset=5)
+        with m.If(delay != 0):
+            m.d.por += delay.eq(delay - 1) # decrement... in POR domain!
+        with m.If(stop_delay != 0):
+            m.d.por += stop_delay.eq(stop_delay - 1) # likewise
+        comb += cd_por.clk.eq(ClockSignal())
+
+        # power-on reset delay
+        core_rst = ResetSignal(self.core_domain)
+        if self.core_domain != "sync":
+            comb += ti_rst.eq(delay != 0 | dbg.core_rst_o | ResetSignal())
+            comb += core_rst.eq(ti_rst)
+        else:
+            with m.If(delay != 0 | dbg.core_rst_o):
+                comb += core_rst.eq(1)
+        with m.If(stop_delay != 0):
+            # run DMI core-stop as well but on an extra couple of cycles
+            comb += dbg.core_stopped_i.eq(1)
+
+        # connect external reset signal to DMI Reset
+        if self.dbg_domain != "sync":
+            dbg_rst = ResetSignal(self.dbg_domain)
+            comb += dbg_rst.eq(self.dbg_rst_i)
+
+        # busy/halted signals from core
+        core_busy_o = ~core.p.o_ready | core.n.o_data.busy_o  # core is busy
+        comb += self.busy_o.eq(core_busy_o)
+        comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i)
+
+        # temporary hack: says "go" immediately for both address gen and ST
+        # XXX: st.go_i is set to 1 cycle delay to reduce combinatorial chains
+        l0 = core.l0
+        ldst = core.fus.fus['ldst0']
+        st_go_edge = rising_edge(m, ldst.st.rel_o)
+        # link addr-go direct to rel
+        m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o)
+        m.d.sync += ldst.st.go_i.eq(st_go_edge)  # link store-go to rising rel
+
+    def do_dmi(self, m, dbg):
+        """deals with DMI debug requests
+
+        currently only provides read requests for the INT regfile, CR and XER
+        it will later also deal with *writing* to these regfiles.
+        """
+        comb = m.d.comb
+        sync = m.d.sync
+        dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
+        d_fast = dbg.d_fast
+        intrf = self.core.regs.rf['int']
+        fastrf = self.core.regs.rf['fast']
+
+        with m.If(d_reg.req):  # request for regfile access being made
+            # TODO: error-check this
+            # XXX should this be combinatorial?  sync better?
+            if intrf.unary:
+                comb += self.int_r.ren.eq(1 << d_reg.addr)
+            else:
+                comb += self.int_r.addr.eq(d_reg.addr)
+                comb += self.int_r.ren.eq(1)
+        d_reg_delay = Signal()
+        sync += d_reg_delay.eq(d_reg.req)
+        with m.If(d_reg_delay):
+            # data arrives one clock later
+            comb += d_reg.data.eq(self.int_r.o_data)
+            comb += d_reg.ack.eq(1)
+
+        # fast regfile
+        with m.If(d_fast.req):  # request for regfile access being made
+            if fastrf.unary:
+                comb += self.fast_r.ren.eq(1 << d_fast.addr)
+            else:
+                comb += self.fast_r.addr.eq(d_fast.addr)
+                comb += self.fast_r.ren.eq(1)
+        d_fast_delay = Signal()
+        sync += d_fast_delay.eq(d_fast.req)
+        with m.If(d_fast_delay):
+            # data arrives one clock later
+            comb += d_fast.data.eq(self.fast_r.o_data)
+            comb += d_fast.ack.eq(1)
+
+        # sigh same thing for CR debug
+        with m.If(d_cr.req):  # request for regfile access being made
+            comb += self.cr_r.ren.eq(0b11111111)  # enable all
+        d_cr_delay = Signal()
+        sync += d_cr_delay.eq(d_cr.req)
+        with m.If(d_cr_delay):
+            # data arrives one clock later
+            comb += d_cr.data.eq(self.cr_r.o_data)
+            comb += d_cr.ack.eq(1)
+
+        # aaand XER...
+        with m.If(d_xer.req):  # request for regfile access being made
+            comb += self.xer_r.ren.eq(0b111111)  # enable all
+        d_xer_delay = Signal()
+        sync += d_xer_delay.eq(d_xer.req)
+        with m.If(d_xer_delay):
+            # data arrives one clock later
+            comb += d_xer.data.eq(self.xer_r.o_data)
+            comb += d_xer.ack.eq(1)
+
+    def tb_dec_fsm(self, m, spr_dec):
+        """tb_dec_fsm
+
+        this is a FSM for updating either dec or tb.  it runs alternately
+        DEC, TB, DEC, TB.  note that SPR pipeline could have written a new
+        value to DEC, however the regfile has "passthrough" on it so this
+        *should* be ok.
+
+        see v3.0B p1097-1099 for Timer Resource and p1065 and p1076
+        """
+
+        comb, sync = m.d.comb, m.d.sync
+        state_rf = self.core.regs.rf['state']
+        state_r_dectb = state_rf.r_ports['issue']  # DEC/TB
+        state_w_dectb = state_rf.w_ports['issue']  # DEC/TB
+
+
+        with m.FSM() as fsm:
+
+            # initiates read of current DEC
+            with m.State("DEC_READ"):
+                comb += state_r_dectb.ren.eq(1<<StateRegs.DEC)
+                with m.If(~self.pause_dec_tb):
+                    m.next = "DEC_WRITE"
+
+            # waits for DEC read to arrive (1 cycle), updates with new value
+            # respects if dec/tb writing has been paused
+            with m.State("DEC_WRITE"):
+                with m.If(self.pause_dec_tb):
+                    # if paused, return to reading
+                    m.next = "DEC_READ"
+                with m.Else():
+                    new_dec = Signal(64)
+                    # TODO: MSR.LPCR 32-bit decrement mode
+                    comb += new_dec.eq(state_r_dectb.o_data - 1)
+                    comb += state_w_dectb.wen.eq(1<<StateRegs.DEC)
+                    comb += state_w_dectb.i_data.eq(new_dec)
+                    # copy to cur_state for decoder, for an interrupt
+                    sync += spr_dec.eq(new_dec)
+                    m.next = "TB_READ"
+
+            # initiates read of current TB
+            with m.State("TB_READ"):
+                comb += state_r_dectb.ren.eq(1<<StateRegs.TB)
+                with m.If(~self.pause_dec_tb):
+                    m.next = "TB_WRITE"
+
+            # waits for read TB to arrive, initiates write of current TB
+            # respects if dec/tb writing has been paused
+            with m.State("TB_WRITE"):
+                with m.If(self.pause_dec_tb):
+                    # if paused, return to reading
+                    m.next = "TB_READ"
+                with m.Else():
+                    new_tb = Signal(64)
+                    comb += new_tb.eq(state_r_dectb.o_data + 1)
+                    comb += state_w_dectb.wen.eq(1<<StateRegs.TB)
+                    comb += state_w_dectb.i_data.eq(new_tb)
+                    m.next = "DEC_READ"
+
+        return m
+
+    def elaborate(self, platform):
+        m = Module()
+        # convenience
+        comb, sync = m.d.comb, m.d.sync
+        cur_state = self.cur_state
+        pdecode2 = self.pdecode2
+        dbg = self.dbg
+
+        # set up peripherals and core
+        core_rst = self.core_rst
+        self.setup_peripherals(m)
+
+        # reset current state if core reset requested
+        with m.If(core_rst):
+            m.d.sync += self.cur_state.eq(0)
+            # and, sigh, set configured values, which are also done in regfile
+            # XXX ??? what the hell is the shift for??
+            m.d.sync += self.cur_state.pc.eq(self.core.pc_at_reset)
+            m.d.sync += self.cur_state.msr.eq(self.core.msr_at_reset)
+
+        # check halted condition: requested PC to execute matches DMI stop addr
+        # and immediately stop. address of 0xffff_ffff_ffff_ffff can never
+        # match
+        halted = Signal()
+        comb += halted.eq(dbg.stop_addr_o == dbg.state.pc)
+        with m.If(halted):
+            comb += dbg.core_stopped_i.eq(1)
+            comb += dbg.terminate_i.eq(1)
+
+        # PC and instruction from I-Memory
+        comb += self.pc_o.eq(cur_state.pc)
+        self.pc_changed = Signal()  # note write to PC
+        self.msr_changed = Signal()  # note write to MSR
+        self.sv_changed = Signal()  # note write to SVSTATE
+
+        # read state either from incoming override or from regfile
+        state = CoreState("get")  # current state (MSR/PC/SVSTATE)
+        state_get(m, state.msr, core_rst, self.msr_i,
+                       "msr",                  # read MSR
+                       self.state_r_msr, StateRegs.MSR)
+        state_get(m, state.pc, core_rst, self.pc_i,
+                       "pc",                  # read PC
+                       self.state_r_pc, StateRegs.PC)
+        state_get(m, state.svstate, core_rst, self.svstate_i,
+                            "svstate",   # read SVSTATE
+                            self.state_r_sv, StateRegs.SVSTATE)
+
+        # don't write pc every cycle
+        comb += self.state_w_pc.wen.eq(0)
+        comb += self.state_w_pc.i_data.eq(0)
+
+        # connect up debug state.  note "combinatorially same" below,
+        # this is a bit naff, passing state over in the dbg class, but
+        # because it is combinatorial it achieves the desired goal
+        comb += dbg.state.eq(state)
+
+        # this bit doesn't have to be in the FSM: connect up to read
+        # regfiles on demand from DMI
+        self.do_dmi(m, dbg)
+
+        # DEC and TB inc/dec FSM.  copy of DEC is put into CoreState,
+        # (which uses that in PowerDecoder2 to raise 0x900 exception)
+        self.tb_dec_fsm(m, cur_state.dec)
+
+        # while stopped, allow updating the MSR, PC and SVSTATE.
+        # these are mainly for debugging purposes (including DMI/JTAG)
+        with m.If(dbg.core_stopped_i):
+            with m.If(self.pc_i.ok):
+                comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+                comb += self.state_w_pc.i_data.eq(self.pc_i.data)
+                sync += self.pc_changed.eq(1)
+            with m.If(self.msr_i.ok):
+                comb += self.state_w_msr.wen.eq(1 << StateRegs.MSR)
+                comb += self.state_w_msr.i_data.eq(self.msr_i.data)
+                sync += self.msr_changed.eq(1)
+            with m.If(self.svstate_i.ok | self.update_svstate):
+                with m.If(self.svstate_i.ok): # over-ride from external source
+                    comb += self.new_svstate.eq(self.svstate_i.data)
+                comb += self.state_w_sv.wen.eq(1 << StateRegs.SVSTATE)
+                comb += self.state_w_sv.i_data.eq(self.new_svstate)
+                sync += self.sv_changed.eq(1)
+
+        # start renaming some of the ports to match microwatt
+        if self.microwatt_compat or self.fabric_compat:
+            self.core.o.core_terminate_o.name = "terminated_out"
+            # names of DMI interface
+            self.dbg.dmi.addr_i.name = 'dmi_addr'
+            self.dbg.dmi.din.name    = 'dmi_din'
+            self.dbg.dmi.dout.name   = 'dmi_dout'
+            self.dbg.dmi.req_i.name  = 'dmi_req'
+            self.dbg.dmi.we_i.name   = 'dmi_wr'
+            self.dbg.dmi.ack_o.name  = 'dmi_ack'
+            # wishbone instruction bus
+            ibus = self.imem.ibus
+            if self.microwatt_compat:
+                ibus.adr.name = 'wishbone_insn_out.adr'
+                ibus.dat_w.name = 'wishbone_insn_out.dat'
+                ibus.sel.name = 'wishbone_insn_out.sel'
+                ibus.cyc.name = 'wishbone_insn_out.cyc'
+                ibus.stb.name = 'wishbone_insn_out.stb'
+                ibus.we.name = 'wishbone_insn_out.we'
+                ibus.dat_r.name = 'wishbone_insn_in.dat'
+                ibus.ack.name = 'wishbone_insn_in.ack'
+                ibus.stall.name = 'wishbone_insn_in.stall'
+            # wishbone data bus
+            dbus = self.core.l0.cmpi.wb_bus()
+            if self.microwatt_compat:
+                dbus.adr.name = 'wishbone_data_out.adr'
+                dbus.dat_w.name = 'wishbone_data_out.dat'
+                dbus.sel.name = 'wishbone_data_out.sel'
+                dbus.cyc.name = 'wishbone_data_out.cyc'
+                dbus.stb.name = 'wishbone_data_out.stb'
+                dbus.we.name = 'wishbone_data_out.we'
+                dbus.dat_r.name = 'wishbone_data_in.dat'
+                dbus.ack.name = 'wishbone_data_in.ack'
+                dbus.stall.name = 'wishbone_data_in.stall'
+
+        return m
+
+    def __iter__(self):
+        yield from self.pc_i.ports()
+        yield from self.msr_i.ports()
+        yield self.pc_o
+        yield self.memerr_o
+        yield from self.core.ports()
+        yield from self.imem.ports()
+        yield self.core_bigendian_i
+        yield self.busy_o
+
+    def ports(self):
+        return list(self)
+
+    def external_ports(self):
+        if self.microwatt_compat or self.fabric_compat:
+            if self.fabric_compat:
+                ports = [self.core.o.core_terminate_o,
+                         self.alt_reset, # not connected yet
+                         self.nia, self.insn, self.nia_req, self.msr_o,
+                         self.ldst_req, self.ldst_addr,
+                         ClockSignal(),
+                         ResetSignal(),
+                        ]
+            else:
+                ports = [self.core.o.core_terminate_o,
+                         self.ext_irq,
+                         self.alt_reset, # not connected yet
+                         self.nia, self.insn, self.nia_req, self.msr_o,
+                         self.ldst_req, self.ldst_addr,
+                         ClockSignal(),
+                         ResetSignal(),
+                        ]
+            ports += list(self.dbg.dmi.ports())
+            # for dbus/ibus microwatt, exclude err btw and cti
+            for name, sig in self.imem.ibus.fields.items():
+                if name not in ['err', 'bte', 'cti', 'adr']:
+                    ports.append(sig)
+            for name, sig in self.core.l0.cmpi.wb_bus().fields.items():
+                if name not in ['err', 'bte', 'cti', 'adr']:
+                    ports.append(sig)
+            # microwatt non-compliant with wishbone
+            ports.append(self.ibus_adr)
+            ports.append(self.dbus_adr)
+
+            if self.microwatt_compat:
+                # Ignore the remaining ports in microwatt compat mode
+                return ports
+
+        ports = self.pc_i.ports()
+        ports = self.msr_i.ports()
+        ports += [self.pc_o, self.memerr_o, self.core_bigendian_i, self.busy_o,
+                  ]
+
+        if self.jtag_en:
+            ports += list(self.jtag.external_ports())
+        else:
+            # don't add DMI if JTAG is enabled
+            ports += list(self.dbg.dmi.ports())
+
+        ports += list(self.imem.ibus.fields.values())
+        ports += list(self.core.l0.cmpi.wb_bus().fields.values())
+
+        if self.sram4x4k:
+            for sram in self.sram4k:
+                ports += list(sram.bus.fields.values())
+
+        if self.xics:
+            ports += list(self.xics_icp.bus.fields.values())
+            ports += list(self.xics_ics.bus.fields.values())
+            ports.append(self.int_level_i)
+        else:
+            ports.append(self.ext_irq)
+
+        if self.gpio:
+            ports += list(self.simple_gpio.bus.fields.values())
+            ports.append(self.gpio_o)
+
+        return ports
+
+    def ports(self):
+        return list(self)
+
+
+class TestIssuerInternal(TestIssuerBase):
+    """TestIssuer - reads instructions from TestMemory and issues them
+
+    efficiency and speed is not the main goal here: functional correctness
+    and code clarity is.  optimisations (which almost 100% interfere with
+    easy understanding) come later.
+    """
+
+    def fetch_fsm(self, m, dbg, core, core_rst, nia, is_svp64_mode,
+                        fetch_pc_o_ready, fetch_pc_i_valid,
+                        fetch_insn_o_valid, fetch_insn_i_ready):
+        """fetch FSM
+
+        this FSM performs fetch of raw instruction data, partial-decodes
+        it 32-bit at a time to detect SVP64 prefixes, and will optionally
+        read a 2nd 32-bit quantity if that occurs.
+        """
+        comb = m.d.comb
+        sync = m.d.sync
+        pdecode2 = self.pdecode2
+        cur_state = self.cur_state
+        dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
+        pc, msr, svstate = cur_state.pc, cur_state.msr, cur_state.svstate
+
+        # also note instruction fetch failed
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+            flush_needed = True
+        else:
+            fetch_failed = Const(0, 1)
+            flush_needed = False
+
+        # set priv / virt mode on I-Cache, sigh
+        if isinstance(self.imem, ICache):
+            comb += self.imem.i_in.priv_mode.eq(~msr[MSR.PR])
+            comb += self.imem.i_in.virt_mode.eq(msr[MSR.IR]) # Instr. Redir (VM)
+
+        with m.FSM(name='fetch_fsm'):
+
+            # allow fetch to not run at startup due to I-Cache reset not
+            # having time to settle.  power-on-reset holds dbg.core_stopped_i
+            with m.State("PRE_IDLE"):
+                with m.If(~dbg.core_stopped_i & ~dbg.core_stop_o & ~core_rst):
+                    m.next = "IDLE"
+
+            # waiting (zzz)
+            with m.State("IDLE"):
+                # fetch allowed if not failed and stopped but not stepping
+                # (see dmi.py for how core_stop_o is generated)
+                with m.If(~fetch_failed & ~dbg.core_stop_o):
+                    comb += fetch_pc_o_ready.eq(1)
+                with m.If(fetch_pc_i_valid & ~pdecode2.instr_fault
+                          & ~dbg.core_stop_o):
+                    # instruction allowed to go: start by reading the PC
+                    # capture the PC and also drop it into Insn Memory
+                    # we have joined a pair of combinatorial memory
+                    # lookups together.  this is Generally Bad.
+                    comb += self.imem.a_pc_i.eq(pc)
+                    comb += self.imem.a_i_valid.eq(1)
+                    comb += self.imem.f_i_valid.eq(1)
+                    m.next = "INSN_READ"  # move to "wait for bus" phase
+
+            # dummy pause to find out why simulation is not keeping up
+            with m.State("INSN_READ"):
+                # when using "single-step" mode, checking dbg.stopping_o
+                # prevents progress.  allow fetch to proceed once started
+                stopping = Const(0)
+                #if self.allow_overlap:
+                #    stopping = dbg.stopping_o
+                with m.If(stopping):
+                    # stopping: jump back to idle
+                    m.next = "IDLE"
+                with m.Else():
+                    with m.If(self.imem.f_busy_o &
+                              ~pdecode2.instr_fault):  # zzz...
+                        # busy but not fetch failed: stay in wait-read
+                        comb += self.imem.a_pc_i.eq(pc)
+                        comb += self.imem.a_i_valid.eq(1)
+                        comb += self.imem.f_i_valid.eq(1)
+                    with m.Else():
+                        # not busy (or fetch failed!): instruction fetched
+                        # when fetch failed, the instruction gets ignored
+                        # by the decoder
+                        if hasattr(core, "icache"):
+                            # blech, icache returns actual instruction
+                            insn = self.imem.f_instr_o
+                        else:
+                            # but these return raw memory
+                            insn = get_insn(self.imem.f_instr_o, cur_state.pc)
+                        if self.svp64_en:
+                            svp64 = self.svp64
+                            # decode the SVP64 prefix, if any
+                            comb += svp64.raw_opcode_in.eq(insn)
+                            comb += svp64.bigendian.eq(self.core_bigendian_i)
+                            # pass the decoded prefix (if any) to PowerDecoder2
+                            sync += pdecode2.sv_rm.eq(svp64.svp64_rm)
+                            sync += pdecode2.is_svp64_mode.eq(is_svp64_mode)
+                            # remember whether this is a prefixed instruction,
+                            # so the FSM can readily loop when VL==0
+                            sync += is_svp64_mode.eq(svp64.is_svp64_mode)
+                            # calculate the address of the following instruction
+                            insn_size = Mux(svp64.is_svp64_mode, 8, 4)
+                            sync += nia.eq(cur_state.pc + insn_size)
+                            with m.If(~svp64.is_svp64_mode):
+                                # with no prefix, store the instruction
+                                # and hand it directly to the next FSM
+                                sync += dec_opcode_i.eq(insn)
+                                m.next = "INSN_READY"
+                            with m.Else():
+                                # fetch the rest of the instruction from memory
+                                comb += self.imem.a_pc_i.eq(cur_state.pc + 4)
+                                comb += self.imem.a_i_valid.eq(1)
+                                comb += self.imem.f_i_valid.eq(1)
+                                m.next = "INSN_READ2"
+                        else:
+                            # not SVP64 - 32-bit only
+                            sync += nia.eq(cur_state.pc + 4)
+                            sync += dec_opcode_i.eq(insn)
+                            if self.microwatt_compat or self.fabric_compat:
+                                # for verilator debug purposes
+                                comb += self.insn.eq(insn)
+                                comb += self.nia.eq(cur_state.pc)
+                                comb += self.msr_o.eq(cur_state.msr)
+                                comb += self.nia_req.eq(1)
+                            m.next = "INSN_READY"
+
+            with m.State("INSN_READ2"):
+                with m.If(self.imem.f_busy_o):  # zzz...
+                    # busy: stay in wait-read
+                    comb += self.imem.a_i_valid.eq(1)
+                    comb += self.imem.f_i_valid.eq(1)
+                with m.Else():
+                    # not busy: instruction fetched
+                    if hasattr(core, "icache"):
+                        # blech, icache returns actual instruction
+                        insn = self.imem.f_instr_o
+                    else:
+                        insn = get_insn(self.imem.f_instr_o, cur_state.pc+4)
+                    sync += dec_opcode_i.eq(insn)
+                    m.next = "INSN_READY"
+                    # TODO: probably can start looking at pdecode2.rm_dec
+                    # here or maybe even in INSN_READ state, if svp64_mode
+                    # detected, in order to trigger - and wait for - the
+                    # predicate reading.
+                    if self.svp64_en:
+                        pmode = pdecode2.rm_dec.predmode
+                    """
+                    if pmode != SVP64PredMode.ALWAYS.value:
+                        fire predicate loading FSM and wait before
+                        moving to INSN_READY
+                    else:
+                        sync += self.srcmask.eq(-1) # set to all 1s
+                        sync += self.dstmask.eq(-1) # set to all 1s
+                        m.next = "INSN_READY"
+                    """
+
+            with m.State("INSN_READY"):
+                # hand over the instruction, to be decoded
+                comb += fetch_insn_o_valid.eq(1)
+                with m.If(fetch_insn_i_ready):
+                    m.next = "IDLE"
+
 
     def fetch_predicate_fsm(self, m,
                             pred_insn_i_valid, pred_insn_o_ready,
@@ -496,7 +1006,7 @@ class TestIssuerInternal(Elaboratable):
         comb = m.d.comb
         sync = m.d.sync
         pdecode2 = self.pdecode2
-        rm_dec = pdecode2.rm_dec # SVP64RMModeDecode
+        rm_dec = pdecode2.rm_dec  # SVP64RMModeDecode
         predmode = rm_dec.predmode
         srcpred, dstpred = rm_dec.srcpred, rm_dec.dstpred
         cr_pred, int_pred = self.cr_pred, self.int_pred   # read regfiles
@@ -623,8 +1133,10 @@ class TestIssuerInternal(Elaboratable):
                     scr_bit = Signal()
                     dcr_bit = Signal()
                     comb += cr_field.eq(cr_pred.o_data)
-                    comb += scr_bit.eq(cr_field.bit_select(sidx, 1) ^ scrinvert)
-                    comb += dcr_bit.eq(cr_field.bit_select(didx, 1) ^ dcrinvert)
+                    comb += scr_bit.eq(cr_field.bit_select(sidx, 1)
+                                       ^ scrinvert)
+                    comb += dcr_bit.eq(cr_field.bit_select(didx, 1)
+                                       ^ dcrinvert)
                     # set the corresponding mask bit
                     bit_to_set = Signal.like(self.srcmask)
                     comb += bit_to_set.eq(1 << cur_cr_idx)
@@ -644,7 +1156,7 @@ class TestIssuerInternal(Elaboratable):
                 with m.If(pred_mask_i_ready):
                     m.next = "FETCH_PRED_IDLE"
 
-    def issue_fsm(self, m, core, pc_changed, sv_changed, nia,
+    def issue_fsm(self, m, core, nia,
                   dbg, core_rst, is_svp64_mode,
                   fetch_pc_o_ready, fetch_pc_i_valid,
                   fetch_insn_o_valid, fetch_insn_i_ready,
@@ -667,13 +1179,12 @@ class TestIssuerInternal(Elaboratable):
         sync = m.d.sync
         pdecode2 = self.pdecode2
         cur_state = self.cur_state
+        new_svstate = self.new_svstate
 
         # temporaries
-        dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
+        dec_opcode_i = pdecode2.dec.raw_opcode_in  # raw opcode
 
         # for updating svstate (things like srcstep etc.)
-        update_svstate = Signal() # set this (below) if updating
-        new_svstate = SVSTATERec("new_svstate")
         comb += new_svstate.eq(cur_state.svstate)
 
         # precalculate srcstep+1 and dststep+1
@@ -687,41 +1198,66 @@ class TestIssuerInternal(Elaboratable):
         # note if an exception happened.  in a pipelined or OoO design
         # this needs to be accompanied by "shadowing" (or stalling)
         exc_happened = self.core.o.exc_happened
+        # also note instruction fetch failed
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+            flush_needed = True
+            # set to fault in decoder
+            # update (highest priority) instruction fault
+            rising_fetch_failed = rising_edge(m, fetch_failed)
+            with m.If(rising_fetch_failed):
+                sync += pdecode2.instr_fault.eq(1)
+        else:
+            fetch_failed = Const(0, 1)
+            flush_needed = False
+
+        sync += fetch_pc_i_valid.eq(0)
 
         with m.FSM(name="issue_fsm"):
 
+            with m.State("PRE_IDLE"):
+                with m.If(~dbg.core_stop_o & ~core_rst):
+                    m.next = "ISSUE_START"
+
             # sync with the "fetch" phase which is reading the instruction
             # at this point, there is no instruction running, that
             # could inadvertently update the PC.
             with m.State("ISSUE_START"):
+                # reset instruction fault
+                sync += pdecode2.instr_fault.eq(0)
                 # wait on "core stop" release, before next fetch
                 # need to do this here, in case we are in a VL==0 loop
                 with m.If(~dbg.core_stop_o & ~core_rst):
-                    comb += fetch_pc_i_valid.eq(1) # tell fetch to start
+                    sync += fetch_pc_i_valid.eq(1)  # tell fetch to start
+                    sync += cur_state.pc.eq(dbg.state.pc)
+                    sync += cur_state.svstate.eq(dbg.state.svstate)
+                    sync += cur_state.msr.eq(dbg.state.msr)
                     with m.If(fetch_pc_o_ready):   # fetch acknowledged us
                         m.next = "INSN_WAIT"
                 with m.Else():
                     # tell core it's stopped, and acknowledge debug handshake
                     comb += dbg.core_stopped_i.eq(1)
-                    # while stopped, allow updating the PC and SVSTATE
-                    with m.If(self.pc_i.ok):
-                        comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
-                        comb += self.state_w_pc.i_data.eq(self.pc_i.data)
-                        sync += pc_changed.eq(1)
+                    # while stopped, allow updating SVSTATE
                     with m.If(self.svstate_i.ok):
                         comb += new_svstate.eq(self.svstate_i.data)
-                        comb += update_svstate.eq(1)
-                        sync += sv_changed.eq(1)
+                        comb += self.update_svstate.eq(1)
+                        sync += self.sv_changed.eq(1)
 
             # wait for an instruction to arrive from Fetch
             with m.State("INSN_WAIT"):
-                if self.allow_overlap:
-                    stopping = dbg.stopping_o
-                else:
-                    stopping = Const(0)
+                # when using "single-step" mode, checking dbg.stopping_o
+                # prevents progress.  allow issue to proceed once started
+                stopping = Const(0)
+                #if self.allow_overlap:
+                #    stopping = dbg.stopping_o
                 with m.If(stopping):
                     # stopping: jump back to idle
                     m.next = "ISSUE_START"
+                    if flush_needed:
+                        # request the icache to stop asserting "failed"
+                        comb += core.icache.flush_in.eq(1)
+                    # stop instruction fault
+                    sync += pdecode2.instr_fault.eq(0)
                 with m.Else():
                     comb += fetch_insn_i_ready.eq(1)
                     with m.If(fetch_insn_o_valid):
@@ -749,8 +1285,8 @@ class TestIssuerInternal(Elaboratable):
                     m.next = "MASK_WAIT"
 
             with m.State("MASK_WAIT"):
-                comb += pred_mask_i_ready.eq(1) # ready to receive the masks
-                with m.If(pred_mask_o_valid): # predication masks are ready
+                comb += pred_mask_i_ready.eq(1)  # ready to receive the masks
+                with m.If(pred_mask_o_valid):  # predication masks are ready
                     m.next = "PRED_SKIP"
 
             # skip zeros in predicate
@@ -803,7 +1339,7 @@ class TestIssuerInternal(Elaboratable):
                             comb += self.state_w_pc.i_data.eq(nia)
                             comb += new_svstate.srcstep.eq(0)
                             comb += new_svstate.dststep.eq(0)
-                            comb += update_svstate.eq(1)
+                            comb += self.update_svstate.eq(1)
                             # synchronize with the simulator
                             comb += self.insn_done.eq(1)
                             # go back to Issue
@@ -812,7 +1348,7 @@ class TestIssuerInternal(Elaboratable):
                             # update new src/dst step
                             comb += new_svstate.srcstep.eq(skip_srcstep)
                             comb += new_svstate.dststep.eq(skip_dststep)
-                            comb += update_svstate.eq(1)
+                            comb += self.update_svstate.eq(1)
                             # proceed to Decode
                             m.next = "DECODE_SV"
 
@@ -825,6 +1361,8 @@ class TestIssuerInternal(Elaboratable):
             # to decode the instruction
             with m.State("DECODE_SV"):
                 # decode the instruction
+                with m.If(~fetch_failed):
+                    sync += pdecode2.instr_fault.eq(0)
                 sync += core.i.e.eq(pdecode2.e)
                 sync += core.i.state.eq(cur_state)
                 sync += core.i.raw_insn_i.eq(dec_opcode_i)
@@ -846,96 +1384,113 @@ class TestIssuerInternal(Elaboratable):
 
             # handshake with execution FSM, move to "wait" once acknowledged
             with m.State("INSN_EXECUTE"):
-                comb += exec_insn_i_valid.eq(1) # trigger execute
-                with m.If(exec_insn_o_ready):   # execute acknowledged us
-                    m.next = "EXECUTE_WAIT"
+                # when using "single-step" mode, checking dbg.stopping_o
+                # prevents progress.  allow execute to proceed once started
+                stopping = Const(0)
+                #if self.allow_overlap:
+                #    stopping = dbg.stopping_o
+                with m.If(stopping):
+                    # stopping: jump back to idle
+                    m.next = "ISSUE_START"
+                    if flush_needed:
+                        # request the icache to stop asserting "failed"
+                        comb += core.icache.flush_in.eq(1)
+                    # stop instruction fault
+                    sync += pdecode2.instr_fault.eq(0)
+                with m.Else():
+                    comb += exec_insn_i_valid.eq(1)  # trigger execute
+                    with m.If(exec_insn_o_ready):   # execute acknowledged us
+                        m.next = "EXECUTE_WAIT"
 
             with m.State("EXECUTE_WAIT"):
-                # wait on "core stop" release, at instruction end
-                # need to do this here, in case we are in a VL>1 loop
-                with m.If(~dbg.core_stop_o & ~core_rst):
-                    comb += exec_pc_i_ready.eq(1)
-                    # see https://bugs.libre-soc.org/show_bug.cgi?id=636
-                    # the exception info needs to be blatted into
-                    # pdecode.ldst_exc, and the instruction "re-run".
-                    # when ldst_exc.happened is set, the PowerDecoder2
-                    # reacts very differently: it re-writes the instruction
-                    # with a "trap" (calls PowerDecoder2.trap()) which
-                    # will *overwrite* whatever was requested and jump the
-                    # PC to the exception address, as well as alter MSR.
-                    # nothing else needs to be done other than to note
-                    # the change of PC and MSR (and, later, SVSTATE)
-                    with m.If(exc_happened):
-                        sync += pdecode2.ldst_exc.eq(core.fus.get_exc("ldst0"))
-
-                    with m.If(exec_pc_o_valid):
-
-                        # was this the last loop iteration?
-                        is_last = Signal()
-                        cur_vl = cur_state.svstate.vl
-                        comb += is_last.eq(next_srcstep == cur_vl)
-
-                        # return directly to Decode if Execute generated an
-                        # exception.
-                        with m.If(pdecode2.ldst_exc.happened):
-                            m.next = "DECODE_SV"
-
-                        # if either PC or SVSTATE were changed by the previous
-                        # instruction, go directly back to Fetch, without
-                        # updating either PC or SVSTATE
-                        with m.Elif(pc_changed | sv_changed):
-                            m.next = "ISSUE_START"
-
-                        # also return to Fetch, when no output was a vector
-                        # (regardless of SRCSTEP and VL), or when the last
-                        # instruction was really the last one of the VL loop
-                        with m.Elif((~pdecode2.loop_continue) | is_last):
-                            # before going back to fetch, update the PC state
-                            # register with the NIA.
-                            # ok here we are not reading the branch unit.
-                            # TODO: this just blithely overwrites whatever
-                            #       pipeline updated the PC
-                            comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
-                            comb += self.state_w_pc.i_data.eq(nia)
-                            # reset SRCSTEP before returning to Fetch
-                            if self.svp64_en:
-                                with m.If(pdecode2.loop_continue):
-                                    comb += new_svstate.srcstep.eq(0)
-                                    comb += new_svstate.dststep.eq(0)
-                                    comb += update_svstate.eq(1)
-                            else:
+                comb += exec_pc_i_ready.eq(1)
+                # see https://bugs.libre-soc.org/show_bug.cgi?id=636
+                # the exception info needs to be blatted into
+                # pdecode.ldst_exc, and the instruction "re-run".
+                # when ldst_exc.happened is set, the PowerDecoder2
+                # reacts very differently: it re-writes the instruction
+                # with a "trap" (calls PowerDecoder2.trap()) which
+                # will *overwrite* whatever was requested and jump the
+                # PC to the exception address, as well as alter MSR.
+                # nothing else needs to be done other than to note
+                # the change of PC and MSR (and, later, SVSTATE)
+                with m.If(exc_happened):
+                    mmu = core.fus.get_exc("mmu0")
+                    ldst = core.fus.get_exc("ldst0")
+                    if mmu is not None:
+                        with m.If(fetch_failed):
+                            # instruction fetch: exception is from MMU
+                            # reset instr_fault (highest priority)
+                            sync += pdecode2.ldst_exc.eq(mmu)
+                            sync += pdecode2.instr_fault.eq(0)
+                            if flush_needed:
+                                # request icache to stop asserting "failed"
+                                comb += core.icache.flush_in.eq(1)
+                    with m.If(~fetch_failed):
+                        # otherwise assume it was a LDST exception
+                        sync += pdecode2.ldst_exc.eq(ldst)
+
+                with m.If(exec_pc_o_valid):
+
+                    # was this the last loop iteration?
+                    is_last = Signal()
+                    cur_vl = cur_state.svstate.vl
+                    comb += is_last.eq(next_srcstep == cur_vl)
+
+                    with m.If(pdecode2.instr_fault):
+                        # reset instruction fault, try again
+                        sync += pdecode2.instr_fault.eq(0)
+                        m.next = "ISSUE_START"
+
+                    # return directly to Decode if Execute generated an
+                    # exception.
+                    with m.Elif(pdecode2.ldst_exc.happened):
+                        m.next = "DECODE_SV"
+
+                    # if MSR, PC or SVSTATE were changed by the previous
+                    # instruction, go directly back to Fetch, without
+                    # updating either MSR PC or SVSTATE
+                    with m.Elif(self.msr_changed | self.pc_changed |
+                                self.sv_changed):
+                        m.next = "ISSUE_START"
+
+                    # also return to Fetch, when no output was a vector
+                    # (regardless of SRCSTEP and VL), or when the last
+                    # instruction was really the last one of the VL loop
+                    with m.Elif((~pdecode2.loop_continue) | is_last):
+                        # before going back to fetch, update the PC state
+                        # register with the NIA.
+                        # ok here we are not reading the branch unit.
+                        # TODO: this just blithely overwrites whatever
+                        #       pipeline updated the PC
+                        comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+                        comb += self.state_w_pc.i_data.eq(nia)
+                        # reset SRCSTEP before returning to Fetch
+                        if self.svp64_en:
+                            with m.If(pdecode2.loop_continue):
                                 comb += new_svstate.srcstep.eq(0)
                                 comb += new_svstate.dststep.eq(0)
-                                comb += update_svstate.eq(1)
-                            m.next = "ISSUE_START"
+                                comb += self.update_svstate.eq(1)
+                        else:
+                            comb += new_svstate.srcstep.eq(0)
+                            comb += new_svstate.dststep.eq(0)
+                            comb += self.update_svstate.eq(1)
+                        m.next = "ISSUE_START"
 
-                        # returning to Execute? then, first update SRCSTEP
-                        with m.Else():
-                            comb += new_svstate.srcstep.eq(next_srcstep)
-                            comb += new_svstate.dststep.eq(next_dststep)
-                            comb += update_svstate.eq(1)
-                            # return to mask skip loop
-                            m.next = "PRED_SKIP"
+                    # returning to Execute? then, first update SRCSTEP
+                    with m.Else():
+                        comb += new_svstate.srcstep.eq(next_srcstep)
+                        comb += new_svstate.dststep.eq(next_dststep)
+                        comb += self.update_svstate.eq(1)
+                        # return to mask skip loop
+                        m.next = "PRED_SKIP"
 
-                with m.Else():
-                    comb += dbg.core_stopped_i.eq(1)
-                    # while stopped, allow updating the PC and SVSTATE
-                    with m.If(self.pc_i.ok):
-                        comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
-                        comb += self.state_w_pc.i_data.eq(self.pc_i.data)
-                        sync += pc_changed.eq(1)
-                    with m.If(self.svstate_i.ok):
-                        comb += new_svstate.eq(self.svstate_i.data)
-                        comb += update_svstate.eq(1)
-                        sync += sv_changed.eq(1)
 
         # check if svstate needs updating: if so, write it to State Regfile
-        with m.If(update_svstate):
-            comb += self.state_w_sv.wen.eq(1<<StateRegs.SVSTATE)
-            comb += self.state_w_sv.i_data.eq(new_svstate)
-            sync += cur_state.svstate.eq(new_svstate) # for next clock
+        with m.If(self.update_svstate):
+            sync += cur_state.svstate.eq(self.new_svstate)  # for next clock
 
-    def execute_fsm(self, m, core, pc_changed, sv_changed,
+    def execute_fsm(self, m, core,
                     exec_insn_i_valid, exec_insn_o_ready,
                     exec_pc_o_valid, exec_pc_i_ready):
         """execute FSM
@@ -948,12 +1503,19 @@ class TestIssuerInternal(Elaboratable):
 
         comb = m.d.comb
         sync = m.d.sync
+        dbg = self.dbg
         pdecode2 = self.pdecode2
+        cur_state = self.cur_state
 
         # temporaries
-        core_busy_o = core.n.o_data.busy_o # core is busy
+        core_busy_o = core.n.o_data.busy_o  # core is busy
         core_ivalid_i = core.p.i_valid              # instruction is valid
 
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+        else:
+            fetch_failed = Const(0, 1)
+
         with m.FSM(name="exec_fsm"):
 
             # waiting for instruction bus (stays there until not busy)
@@ -961,19 +1523,35 @@ class TestIssuerInternal(Elaboratable):
                 comb += exec_insn_o_ready.eq(1)
                 with m.If(exec_insn_i_valid):
                     comb += core_ivalid_i.eq(1)  # instruction is valid/issued
-                    sync += sv_changed.eq(0)
-                    sync += pc_changed.eq(0)
-                    with m.If(core.p.o_ready): # only move if accepted
+                    sync += self.sv_changed.eq(0)
+                    sync += self.pc_changed.eq(0)
+                    sync += self.msr_changed.eq(0)
+                    with m.If(core.p.o_ready):  # only move if accepted
                         m.next = "INSN_ACTIVE"  # move to "wait completion"
 
             # instruction started: must wait till it finishes
             with m.State("INSN_ACTIVE"):
-                # note changes to PC and SVSTATE
-                with m.If(self.state_nia.wen & (1<<StateRegs.SVSTATE)):
-                    sync += sv_changed.eq(1)
-                with m.If(self.state_nia.wen & (1<<StateRegs.PC)):
-                    sync += pc_changed.eq(1)
-                with m.If(~core_busy_o): # instruction done!
+                # note changes to MSR, PC and SVSTATE
+                with m.If(self.state_nia.wen & (1 << StateRegs.SVSTATE)):
+                    sync += self.sv_changed.eq(1)
+                with m.If(self.state_nia.wen & (1 << StateRegs.MSR)):
+                    sync += self.msr_changed.eq(1)
+                with m.If(self.state_nia.wen & (1 << StateRegs.PC)):
+                    sync += self.pc_changed.eq(1)
+                # and note changes to DEC/TB, to be passed to DEC/TB FSM
+                with m.If(self.state_spr.wen & (1 << StateRegs.TB)):
+                    comb += self.pause_dec_tb.eq(1)
+                # but also zero-out the cur_state DEC so that, on
+                # the next instruction, if it is "enable interrupt"
+                # the delay between the DEC/TB FSM reading and updating
+                # cur_state.dec doesn't trigger a spurious interrupt.
+                # the DEC/TB FSM will read the regfile and update to
+                # the correct value, so having cur_state.dec set to zero
+                # for a while is no big deal.
+                with m.If(self.state_spr.wen & (1 << StateRegs.DEC)):
+                    comb += self.pause_dec_tb.eq(1)
+                    sync += cur_state.dec.eq(0) # only needs top bit clear
+                with m.If(~core_busy_o):  # instruction done!
                     comb += exec_pc_o_valid.eq(1)
                     with m.If(exec_pc_i_ready):
                         # when finished, indicate "done".
@@ -986,103 +1564,17 @@ class TestIssuerInternal(Elaboratable):
                         # if we erroneously indicate "done" here, it is as if
                         # there were *TWO* instructions:
                         # 1) the failed LDST 2) a TRAP.
-                        with m.If(~pdecode2.ldst_exc.happened):
+                        with m.If(~pdecode2.ldst_exc.happened &
+                                   ~pdecode2.instr_fault):
                             comb += self.insn_done.eq(1)
                         m.next = "INSN_START"  # back to fetch
-
-    def setup_peripherals(self, m):
-        comb, sync = m.d.comb, m.d.sync
-
-        # okaaaay so the debug module must be in coresync clock domain
-        # but NOT its reset signal. to cope with this, set every single
-        # submodule explicitly in coresync domain, debug and JTAG
-        # in their own one but using *external* reset.
-        csd = DomainRenamer("coresync")
-        dbd = DomainRenamer(self.dbg_domain)
-
-        m.submodules.core = core = csd(self.core)
-        m.submodules.imem = imem = csd(self.imem)
-        m.submodules.dbg = dbg = dbd(self.dbg)
-        if self.jtag_en:
-            m.submodules.jtag = jtag = dbd(self.jtag)
-            # TODO: UART2GDB mux, here, from external pin
-            # see https://bugs.libre-soc.org/show_bug.cgi?id=499
-            sync += dbg.dmi.connect_to(jtag.dmi)
-
-        cur_state = self.cur_state
-
-        # 4x 4k SRAM blocks.  these simply "exist", they get routed in litex
-        if self.sram4x4k:
-            for i, sram in enumerate(self.sram4k):
-                m.submodules["sram4k_%d" % i] = csd(sram)
-                comb += sram.enable.eq(self.wb_sram_en)
-
-        # XICS interrupt handler
-        if self.xics:
-            m.submodules.xics_icp = icp = csd(self.xics_icp)
-            m.submodules.xics_ics = ics = csd(self.xics_ics)
-            comb += icp.ics_i.eq(ics.icp_o)           # connect ICS to ICP
-            sync += cur_state.eint.eq(icp.core_irq_o) # connect ICP to core
-
-        # GPIO test peripheral
-        if self.gpio:
-            m.submodules.simple_gpio = simple_gpio = csd(self.simple_gpio)
-
-        # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl)
-        # XXX causes litex ECP5 test to get wrong idea about input and output
-        # (but works with verilator sim *sigh*)
-        #if self.gpio and self.xics:
-        #   comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0])
-
-        # instruction decoder
-        pdecode = create_pdecode()
-        m.submodules.dec2 = pdecode2 = csd(self.pdecode2)
-        if self.svp64_en:
-            m.submodules.svp64 = svp64 = csd(self.svp64)
-
-        # convenience
-        dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
-        intrf = self.core.regs.rf['int']
-
-        # clock delay power-on reset
-        cd_por  = ClockDomain(reset_less=True)
-        cd_sync = ClockDomain()
-        core_sync = ClockDomain("coresync")
-        m.domains += cd_por, cd_sync, core_sync
-        if self.dbg_domain != "sync":
-            dbg_sync = ClockDomain(self.dbg_domain)
-            m.domains += dbg_sync
-
-        ti_rst = Signal(reset_less=True)
-        delay = Signal(range(4), reset=3)
-        with m.If(delay != 0):
-            m.d.por += delay.eq(delay - 1)
-        comb += cd_por.clk.eq(ClockSignal())
-
-        # power-on reset delay
-        core_rst = ResetSignal("coresync")
-        comb += ti_rst.eq(delay != 0 | dbg.core_rst_o | ResetSignal())
-        comb += core_rst.eq(ti_rst)
-
-        # debug clock is same as coresync, but reset is *main external*
-        if self.dbg_domain != "sync":
-            dbg_rst = ResetSignal(self.dbg_domain)
-            comb += dbg_rst.eq(ResetSignal())
-
-        # busy/halted signals from core
-        core_busy_o = ~core.p.o_ready | core.n.o_data.busy_o # core is busy
-        comb += self.busy_o.eq(core_busy_o)
-        comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i)
-
-        # temporary hack: says "go" immediately for both address gen and ST
-        l0 = core.l0
-        ldst = core.fus.fus['ldst0']
-        st_go_edge = rising_edge(m, ldst.st.rel_o)
-        m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o) # link addr-go direct to rel
-        m.d.comb += ldst.st.go_i.eq(st_go_edge) # link store-go to rising rel
+                # terminate returns directly to INSN_START
+                with m.If(dbg.terminate_i):
+                    # comb += self.insn_done.eq(1) - no because it's not
+                    m.next = "INSN_START"  # back to fetch
 
     def elaborate(self, platform):
-        m = Module()
+        m = super().elaborate(platform)
         # convenience
         comb, sync = m.d.comb, m.d.sync
         cur_state = self.cur_state
@@ -1092,43 +1584,17 @@ class TestIssuerInternal(Elaboratable):
 
         # set up peripherals and core
         core_rst = self.core_rst
-        self.setup_peripherals(m)
-
-        # reset current state if core reset requested
-        with m.If(core_rst):
-            m.d.sync += self.cur_state.eq(0)
-
-        # PC and instruction from I-Memory
-        comb += self.pc_o.eq(cur_state.pc)
-        pc_changed = Signal() # note write to PC
-        sv_changed = Signal() # note write to SVSTATE
 
         # indicate to outside world if any FU is still executing
-        comb += self.any_busy.eq(core.n.o_data.any_busy_o) # any FU executing
-
-        # read state either from incoming override or from regfile
-        # TODO: really should be doing MSR in the same way
-        pc = state_get(m, core_rst, self.pc_i,
-                            "pc",                  # read PC
-                            self.state_r_pc, StateRegs.PC)
-        svstate = state_get(m, core_rst, self.svstate_i,
-                            "svstate",   # read SVSTATE
-                            self.state_r_sv, StateRegs.SVSTATE)
-
-        # don't write pc every cycle
-        comb += self.state_w_pc.wen.eq(0)
-        comb += self.state_w_pc.i_data.eq(0)
+        comb += self.any_busy.eq(core.n.o_data.any_busy_o)  # any FU executing
 
         # address of the next instruction, in the absence of a branch
         # depends on the instruction size
         nia = Signal(64)
 
         # connect up debug signals
-        # TODO comb += core.icache_rst_i.eq(dbg.icache_rst_o)
-        comb += dbg.terminate_i.eq(core.o.core_terminate_o)
-        comb += dbg.state.pc.eq(pc)
-        comb += dbg.state.svstate.eq(svstate)
-        comb += dbg.state.msr.eq(cur_state.msr)
+        with m.If(core.o.core_terminate_o):
+            comb += dbg.terminate_i.eq(1)
 
         # pass the prefix mode from Fetch to Issue, so the latter can loop
         # on VL==0
@@ -1139,8 +1605,8 @@ class TestIssuerInternal(Elaboratable):
         # these are the handshake signals between each
 
         # fetch FSM can run as soon as the PC is valid
-        fetch_pc_i_valid = Signal() # Execute tells Fetch "start next read"
-        fetch_pc_o_ready = Signal() # Fetch Tells SVSTATE "proceed"
+        fetch_pc_i_valid = Signal()  # Execute tells Fetch "start next read"
+        fetch_pc_o_ready = Signal()  # Fetch Tells SVSTATE "proceed"
 
         # fetch FSM hands over the instruction to be decoded / issued
         fetch_insn_o_valid = Signal()
@@ -1171,20 +1637,11 @@ class TestIssuerInternal(Elaboratable):
         # Issue is where the VL for-loop # lives.  the ready/valid
         # signalling is used to communicate between the four.
 
-        # set up Fetch FSM
-        fetch = FetchFSM(self.allow_overlap, self.svp64_en,
-                        self.imem, core_rst, pdecode2, cur_state,
-                       dbg, core, svstate, nia, is_svp64_mode)
-        m.submodules.fetch = fetch
-        # connect up in/out data to existing Signals
-        comb += fetch.p.i_data.pc.eq(pc)
-        # and the ready/valid signalling
-        comb += fetch_pc_o_ready.eq(fetch.p.o_ready)
-        comb += fetch.p.i_valid.eq(fetch_pc_i_valid)
-        comb += fetch_insn_o_valid.eq(fetch.n.o_valid)
-        comb += fetch.n.i_ready.eq(fetch_insn_i_ready)
-
-        self.issue_fsm(m, core, pc_changed, sv_changed, nia,
+        self.fetch_fsm(m, dbg, core, core_rst, nia, is_svp64_mode,
+                       fetch_pc_o_ready, fetch_pc_i_valid,
+                       fetch_insn_o_valid, fetch_insn_i_ready)
+
+        self.issue_fsm(m, core, nia,
                        dbg, core_rst, is_svp64_mode,
                        fetch_pc_o_ready, fetch_pc_i_valid,
                        fetch_insn_o_valid, fetch_insn_i_ready,
@@ -1198,175 +1655,32 @@ class TestIssuerInternal(Elaboratable):
                                      pred_insn_i_valid, pred_insn_o_ready,
                                      pred_mask_o_valid, pred_mask_i_ready)
 
-        self.execute_fsm(m, core, pc_changed, sv_changed,
+        self.execute_fsm(m, core,
                          exec_insn_i_valid, exec_insn_o_ready,
                          exec_pc_o_valid, exec_pc_i_ready)
 
-        # this bit doesn't have to be in the FSM: connect up to read
-        # regfiles on demand from DMI
-        self.do_dmi(m, dbg)
-
-        # DEC and TB inc/dec FSM.  copy of DEC is put into CoreState,
-        # (which uses that in PowerDecoder2 to raise 0x900 exception)
-        self.tb_dec_fsm(m, cur_state.dec)
-
-        return m
-
-    def do_dmi(self, m, dbg):
-        """deals with DMI debug requests
-
-        currently only provides read requests for the INT regfile, CR and XER
-        it will later also deal with *writing* to these regfiles.
-        """
-        comb = m.d.comb
-        sync = m.d.sync
-        dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
-        intrf = self.core.regs.rf['int']
-
-        with m.If(d_reg.req): # request for regfile access being made
-            # TODO: error-check this
-            # XXX should this be combinatorial?  sync better?
-            if intrf.unary:
-                comb += self.int_r.ren.eq(1<<d_reg.addr)
-            else:
-                comb += self.int_r.addr.eq(d_reg.addr)
-                comb += self.int_r.ren.eq(1)
-        d_reg_delay  = Signal()
-        sync += d_reg_delay.eq(d_reg.req)
-        with m.If(d_reg_delay):
-            # data arrives one clock later
-            comb += d_reg.data.eq(self.int_r.o_data)
-            comb += d_reg.ack.eq(1)
-
-        # sigh same thing for CR debug
-        with m.If(d_cr.req): # request for regfile access being made
-            comb += self.cr_r.ren.eq(0b11111111) # enable all
-        d_cr_delay  = Signal()
-        sync += d_cr_delay.eq(d_cr.req)
-        with m.If(d_cr_delay):
-            # data arrives one clock later
-            comb += d_cr.data.eq(self.cr_r.o_data)
-            comb += d_cr.ack.eq(1)
-
-        # aaand XER...
-        with m.If(d_xer.req): # request for regfile access being made
-            comb += self.xer_r.ren.eq(0b111111) # enable all
-        d_xer_delay  = Signal()
-        sync += d_xer_delay.eq(d_xer.req)
-        with m.If(d_xer_delay):
-            # data arrives one clock later
-            comb += d_xer.data.eq(self.xer_r.o_data)
-            comb += d_xer.ack.eq(1)
-
-    def tb_dec_fsm(self, m, spr_dec):
-        """tb_dec_fsm
-
-        this is a FSM for updating either dec or tb.  it runs alternately
-        DEC, TB, DEC, TB.  note that SPR pipeline could have written a new
-        value to DEC, however the regfile has "passthrough" on it so this
-        *should* be ok.
-
-        see v3.0B p1097-1099 for Timeer Resource and p1065 and p1076
-        """
-
-        comb, sync = m.d.comb, m.d.sync
-        fast_rf = self.core.regs.rf['fast']
-        fast_r_dectb = fast_rf.r_ports['issue'] # DEC/TB
-        fast_w_dectb = fast_rf.w_ports['issue'] # DEC/TB
-
-        with m.FSM() as fsm:
-
-            # initiates read of current DEC
-            with m.State("DEC_READ"):
-                comb += fast_r_dectb.addr.eq(FastRegs.DEC)
-                comb += fast_r_dectb.ren.eq(1)
-                m.next = "DEC_WRITE"
-
-            # waits for DEC read to arrive (1 cycle), updates with new value
-            with m.State("DEC_WRITE"):
-                new_dec = Signal(64)
-                # TODO: MSR.LPCR 32-bit decrement mode
-                comb += new_dec.eq(fast_r_dectb.o_data - 1)
-                comb += fast_w_dectb.addr.eq(FastRegs.DEC)
-                comb += fast_w_dectb.wen.eq(1)
-                comb += fast_w_dectb.i_data.eq(new_dec)
-                sync += spr_dec.eq(new_dec) # copy into cur_state for decoder
-                m.next = "TB_READ"
-
-            # initiates read of current TB
-            with m.State("TB_READ"):
-                comb += fast_r_dectb.addr.eq(FastRegs.TB)
-                comb += fast_r_dectb.ren.eq(1)
-                m.next = "TB_WRITE"
-
-            # waits for read TB to arrive, initiates write of current TB
-            with m.State("TB_WRITE"):
-                new_tb = Signal(64)
-                comb += new_tb.eq(fast_r_dectb.o_data + 1)
-                comb += fast_w_dectb.addr.eq(FastRegs.TB)
-                comb += fast_w_dectb.wen.eq(1)
-                comb += fast_w_dectb.i_data.eq(new_tb)
-                m.next = "DEC_READ"
+        # whatever was done above, over-ride it if core reset is held.
+        # set NIA to pc_at_reset
+        with m.If(core_rst):
+            sync += nia.eq(self.core.pc_at_reset)
 
         return m
 
-    def __iter__(self):
-        yield from self.pc_i.ports()
-        yield self.pc_o
-        yield self.memerr_o
-        yield from self.core.ports()
-        yield from self.imem.ports()
-        yield self.core_bigendian_i
-        yield self.busy_o
-
-    def ports(self):
-        return list(self)
-
-    def external_ports(self):
-        ports = self.pc_i.ports()
-        ports += [self.pc_o, self.memerr_o, self.core_bigendian_i, self.busy_o,
-                ]
-
-        if self.jtag_en:
-            ports += list(self.jtag.external_ports())
-        else:
-            # don't add DMI if JTAG is enabled
-            ports += list(self.dbg.dmi.ports())
-
-        ports += list(self.imem.ibus.fields.values())
-        ports += list(self.core.l0.cmpi.wb_bus().fields.values())
-
-        if self.sram4x4k:
-            for sram in self.sram4k:
-                ports += list(sram.bus.fields.values())
-
-        if self.xics:
-            ports += list(self.xics_icp.bus.fields.values())
-            ports += list(self.xics_ics.bus.fields.values())
-            ports.append(self.int_level_i)
-
-        if self.gpio:
-            ports += list(self.simple_gpio.bus.fields.values())
-            ports.append(self.gpio_o)
-
-        return ports
-
-    def ports(self):
-        return list(self)
-
 
 class TestIssuer(Elaboratable):
     def __init__(self, pspec):
         self.ti = TestIssuerInternal(pspec)
         self.pll = DummyPLL(instance=True)
 
+        self.dbg_rst_i = Signal(reset_less=True)
+
         # PLL direct clock or not
         self.pll_en = hasattr(pspec, "use_pll") and pspec.use_pll
         if self.pll_en:
             self.pll_test_o = Signal(reset_less=True)
             self.pll_vco_o = Signal(reset_less=True)
             self.clk_sel_i = Signal(2, reset_less=True)
-            self.ref_clk =  ClockSignal() # can't rename it but that's ok
+            self.ref_clk = ClockSignal()  # can't rename it but that's ok
             self.pllclk_clk = ClockSignal("pllclk")
 
     def elaborate(self, platform):
@@ -1406,29 +1720,30 @@ class TestIssuer(Elaboratable):
         # internal clock is set to selector clock-out.  has the side-effect of
         # running TestIssuer at this speed (see DomainRenamer("intclk") above)
         # debug clock runs at coresync internal clock
-        cd_coresync = ClockDomain("coresync")
-        #m.domains += cd_coresync
         if self.ti.dbg_domain != 'sync':
             cd_dbgsync = ClockDomain("dbgsync")
-            #m.domains += cd_dbgsync
-        intclk = ClockSignal("coresync")
+        intclk = ClockSignal(self.ti.core_domain)
         dbgclk = ClockSignal(self.ti.dbg_domain)
         # XXX BYPASS PLL XXX
         # XXX BYPASS PLL XXX
         # XXX BYPASS PLL XXX
         if self.pll_en:
             comb += intclk.eq(self.ref_clk)
+            assert self.ti.core_domain != 'sync', \
+                "cannot set core_domain to sync and use pll at the same time"
         else:
-            comb += intclk.eq(ClockSignal())
+            if self.ti.core_domain != 'sync':
+                comb += intclk.eq(ClockSignal())
         if self.ti.dbg_domain != 'sync':
             dbgclk = ClockSignal(self.ti.dbg_domain)
             comb += dbgclk.eq(intclk)
+        comb += self.ti.dbg_rst_i.eq(self.dbg_rst_i)
 
         return m
 
     def ports(self):
         return list(self.ti.ports()) + list(self.pll.ports()) + \
-               [ClockSignal(), ResetSignal()]
+            [ClockSignal(), ResetSignal()]
 
     def external_ports(self):
         ports = self.ti.external_ports()
@@ -1450,10 +1765,10 @@ if __name__ == '__main__':
              'div': 1,
              'mul': 1,
              'shiftrot': 1
-            }
+             }
     pspec = TestMemPspec(ldst_ifacetype='bare_wb',
                          imem_ifacetype='bare_wb',
-                         addr_wid=48,
+                         addr_wid=64,
                          mask_wid=8,
                          reg_wid=64,
                          units=units)
index 8c0f8e1f5b8cc6a1a3d3e4f5947350e880c428e5..d56c140d39791dabb42b01c76380746905db9d4a 100644 (file)
@@ -4,8 +4,9 @@
 import argparse
 from nmigen.cli import verilog
 
+from openpower.consts import MSR
 from soc.config.test.test_loadstore import TestMemPspec
-from soc.simple.issuer import TestIssuer
+from soc.simple.issuer import TestIssuer, TestIssuerInternal
 
 
 if __name__ == '__main__':
@@ -58,9 +59,69 @@ if __name__ == '__main__':
     parser.add_argument("--disable-svp64", dest='svp64', action="store_false",
                         help="disable SVP64",
                         default=False)
+    parser.add_argument("--pc-reset", default="0",
+                        help="Set PC at reset (default 0)")
+    parser.add_argument("--xlen", default=64, type=int,
+                        help="Set register width [default 64]")
+    # create a module that's directly compatible as a drop-in replacement
+    # in microwatt.v
+    parser.add_argument("--microwatt-compat", dest='mwcompat',
+                        action="store_true",
+                        help="generate microwatt-compatible interface",
+                        default=False)
+    parser.add_argument("--microwatt-compat-svp64", dest='mwcompatsvp64',
+                        action="store_true",
+                        help="generate microwatt-compatible interface + SVP64",
+                        default=False)
+    parser.add_argument("--old-microwatt-compat", dest='old_mwcompat',
+                        action="store_true",
+                        help="generate old microwatt-compatible interface",
+                        default=True)
+    parser.add_argument("--microwatt-debug", dest='mwdebug',
+                        action="store_true",
+                        help="generate old microwatt-compatible interface",
+                        default=False)
+    # create a module with Fabric compatibility
+    parser.add_argument("--fabric-compat", dest='fabriccompat',
+                        action="store_true",
+                        help="generate Fabric-compatible interface",
+                        default=False)
+    # small cache option
+    parser.add_argument("--small-cache", dest='smallcache',
+                        action="store_true",
+                        help="generate small caches",
+                        default=False)
+
+    # allow overlaps in TestIssuer
+    parser.add_argument("--allow-overlap", dest='allow_overlap',
+                        action="store_true",
+                        help="allow overlap in TestIssuer",
+                        default=False)
 
     args = parser.parse_args()
 
+    # convenience: set some defaults
+    if args.mwcompat:
+        args.pll = False
+        args.debug = 'dmi'
+        args.core = True
+        args.xics = False
+        args.gpio = False
+        args.sram4x4kblock = False
+        args.svp64 = False
+
+    # Yes, this is duplicating mwcompat, but for the sake of simplicity
+    # adding support for svp64 like this
+    if args.mwcompatsvp64:
+        args.pll = False
+        args.debug = 'dmi'
+        args.core = True
+        args.xics = False
+        args.gpio = False
+        args.sram4x4kblock = False
+        args.svp64 = True
+        args.mwcompat = True # Ensures TestMemPspec gets the expected value
+
     print(args)
 
     units = {'alu': 1,
@@ -77,14 +138,26 @@ if __name__ == '__main__':
     # decide which memory type to configure
     if args.mmu:
         ldst_ifacetype = 'mmu_cache_wb'
+        imem_ifacetype = 'mmu_cache_wb'
     else:
         ldst_ifacetype = 'bare_wb'
-    imem_ifacetype = 'bare_wb'
+        imem_ifacetype = 'bare_wb'
+
+    # default MSR
+    msr_reset = (1<<MSR.LE) | (1<<MSR.SF) # 64-bit, little-endian default
+
+    # default PC
+    if args.pc_reset.startswith("0x"):
+        pc_reset = int(args.pc_reset, 16)
+    else:
+        pc_reset = int(args.pc_reset)
 
     pspec = TestMemPspec(ldst_ifacetype=ldst_ifacetype,
                          imem_ifacetype=imem_ifacetype,
-                         addr_wid=48,
+                         addr_wid=64,
                          mask_wid=8,
+                         # pipeline and integer register file width
+                         XLEN=args.xlen,
                          # must leave at 64
                          reg_wid=64,
                          # set to 32 for instruction-memory width=32
@@ -99,10 +172,20 @@ if __name__ == '__main__':
                          sram4x4kblock=args.enable_sram4x4kblock, # add SRAMs
                          debug=args.debug,      # set to jtag or dmi
                          svp64=args.svp64,      # enable SVP64
-                         mmu=args.mmu,          # enable MMU
-                         units=units)
+                         microwatt_mmu=args.mmu,         # enable MMU
+                         microwatt_compat=args.mwcompat, # microwatt compatible
+                         microwatt_old=args.old_mwcompat, # old microwatt api
+                         microwatt_debug=args.mwdebug, # microwatt debug signals
+                         fabric_compat=args.fabriccompat, # fabric compatible (overlaps with microwatt compat)
+                         small_cache=args.smallcache, # small cache/TLB sizes
+                         allow_overlap=args.allow_overlap, # allow overlap
+                         units=units,
+                         msr_reset=msr_reset,
+                         pc_reset=pc_reset)
+    #if args.mwcompat:
+    #    pspec.core_domain = 'sync'
 
-    print("mmu", pspec.__dict__["mmu"])
+    print("mmu", pspec.__dict__["microwatt_mmu"])
     print("nocore", pspec.__dict__["nocore"])
     print("regreduce", pspec.__dict__["regreduce"])
     print("gpio", pspec.__dict__["gpio"])
@@ -111,9 +194,22 @@ if __name__ == '__main__':
     print("use_pll", pspec.__dict__["use_pll"])
     print("debug", pspec.__dict__["debug"])
     print("SVP64", pspec.__dict__["svp64"])
+    print("XLEN", pspec.__dict__["XLEN"])
+    print("MSR@reset", hex(pspec.__dict__["msr_reset"]))
+    print("PC@reset", hex(pspec.__dict__["pc_reset"]))
+    print("Microwatt compatibility", pspec.__dict__["microwatt_compat"])
+    print("Old Microwatt compatibility", pspec.__dict__["microwatt_old"])
+    print("Microwatt debug", pspec.__dict__["microwatt_debug"])
+    print("Fabric compatibility", pspec.__dict__["fabric_compat"])
+    print("Small Cache/TLB", pspec.__dict__["small_cache"])
 
-    dut = TestIssuer(pspec)
+    if args.mwcompat:
+        dut = TestIssuerInternal(pspec)
+        name = "external_core_top"
+    else:
+        dut = TestIssuer(pspec)
+        name = "test_issuer"
 
-    vl = verilog.convert(dut, ports=dut.external_ports(), name="test_issuer")
+    vl = verilog.convert(dut, ports=dut.external_ports(), name=name)
     with open(args.output_filename, "w") as f:
         f.write(vl)
index cbb093d286c5c2ba4689c4c3e14421ac5725a938..5d6bebc58d82c643ea42b6f882fd8193b199631b 100644 (file)
@@ -45,12 +45,13 @@ from soc.fu.branch.test.test_pipe_caller import BranchTestCase
 from soc.fu.ldst.test.test_pipe_caller import LDSTTestCase
 from openpower.test.general.overlap_hazards import (HazardTestCase,
                                                     RandomHazardTestCase)
-from openpower.util import spr_to_fast_reg
+from openpower.util import spr_to_fast_reg, spr_to_state_reg
 
 from openpower.consts import StateRegsEnum
 
 # list of SPRs that are controlled and managed by the MMU
-mmu_sprs = ["PRTBL", "DSISR", "DAR", "PIDR"]
+mmu_sprs = ["PRTBL", "PIDR"]
+ldst_sprs = ["DAR", "DSISR"]
 
 
 def set_mmu_spr(name, i, val, core):  # important keep pep8 formatting
@@ -60,7 +61,29 @@ def set_mmu_spr(name, i, val, core):  # important keep pep8 formatting
     yield fsm.mmu.l_in.rs.eq(val)
     yield
     yield fsm.mmu.l_in.mtspr.eq(0)
-    print("mmu_spr was updated")
+    while True:
+        done = yield fsm.mmu.l_out.done
+        if done:
+            break
+        yield
+    yield
+    print("mmu_spr %s %d was updated %x" % (name, i, val))
+
+
+def set_ldst_spr(name, i, val, core):  # important keep pep8 formatting
+    ldst = core.fus.get_fu("mmu0").alu.ldst # awkward to get at but it works
+    yield ldst.sprval_in.eq(val)
+    yield ldst.mmu_set_spr.eq(1)
+    if name == 'DAR':
+        yield ldst.mmu_set_dar.eq(1)
+        yield
+        yield ldst.mmu_set_dar.eq(0)
+    else:
+        yield ldst.mmu_set_dsisr.eq(1)
+        yield
+        yield ldst.mmu_set_dsisr.eq(0)
+    yield ldst.mmu_set_spr.eq(0)
+    print("ldst_spr %s %d was updated %x" % (name, i, val))
 
 
 def setup_regs(pdecode2, core, test):
@@ -120,6 +143,7 @@ def setup_regs(pdecode2, core, test):
     # setting both fast and slow SPRs from test data
 
     fregs = core.regs.fast
+    stateregs = core.regs.state
     sregs = core.regs.spr
     for sprname, val in test.sprs.items():
         if isinstance(val, SelectableInt):
@@ -128,17 +152,31 @@ def setup_regs(pdecode2, core, test):
             sprname = spr_dict[sprname].SPR
         if sprname == 'XER':
             continue
+        print ('set spr %s val %x' % (sprname, val))
+
         fast = spr_to_fast_reg(sprname)
-        if fast is None:
+        state = spr_to_state_reg(sprname)
+
+        if fast is None and state is None:
             # match behaviour of SPRMap in power_decoder2.py
             for i, x in enumerate(SPR):
                 if sprname == x.name:
-                    print("setting slow SPR %d (%s) to %x" %
-                          (i, sprname, val))
-                    if sprname not in mmu_sprs:
-                        yield sregs.memory._array[i].eq(val)
+                    print("setting slow SPR %d (%s/%d) to %x" %
+                          (i, sprname, x.value, val))
+                    if sprname in mmu_sprs:
+                        yield from set_mmu_spr(sprname, x.value, val, core)
+                    elif sprname in ldst_sprs:
+                        yield from set_ldst_spr(sprname, x.value, val, core)
                     else:
-                        yield from set_mmu_spr(sprname, i, val, core)
+                        yield sregs.memory._array[i].eq(val)
+        elif state is not None:
+            print("setting state reg %d (%s) to %x" %
+                  (state, sprname, val))
+            if stateregs.unary:
+                rval = stateregs.regs[state].reg
+            else:
+                rval = stateregs.memory._array[state]
+            yield rval.eq(val)
         else:
             print("setting fast reg %d (%s) to %x" %
                   (fast, sprname, val))
index b8245cbf1a5356f39a6695531ecda1e96460617b..b86e162efd2e2cfdbbf2763fbabaefc7e0f999d0 100644 (file)
@@ -30,13 +30,14 @@ from openpower.test.cr.cr_cases import CRTestCase
 from openpower.test.branch.branch_cases import BranchTestCase
 from soc.fu.spr.test.test_pipe_caller import SPRTestCase
 from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.trap.trap_cases import TrapTestCase
 from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
 from openpower.simulator.test_helloworld_sim import HelloTestCases
 
 
 if __name__ == "__main__":
     svp64 = True
-    if sys.argv[1] == 'nosvp64':
+    if len(sys.argv) > 1 and sys.argv[1] == 'nosvp64':
         svp64 = False
         del sys.argv[1]
 
@@ -46,17 +47,25 @@ if __name__ == "__main__":
         allow_overlap = True
         del sys.argv[1]
 
+    # use in-order issuer, instead of the original FSM based one
+    inorder = False
+    if len(sys.argv) >= 2 and sys.argv[1] == '--inorder':
+        inorder = True
+        del sys.argv[1]
+
     # allow list of testing to be selected by command-line
-    testing = sys.argv[1:]
-    sys.argv = sys.argv[:1]
+    testing = []
+    for i in reversed(range(1, len(sys.argv))):
+        if not sys.argv[i].startswith('-'):
+            testing.append(sys.argv.pop(i))
 
     if not testing:
         testing = ['general', 'ldst', 'cr', 'shiftrot', 'shiftrot2',
                    'logical', 'alu',
                    'branch', 'div', 'mul', 'hazard']
 
-    print ("SVP64 test mode enabled", svp64, "overlap",
-                                      allow_overlap, "testing", testing)
+    print("SVP64 test mode enabled", svp64, "overlap",
+          allow_overlap, "in-order", inorder, "testing", testing)
 
     unittest.main(exit=False)
     suite = unittest.TestSuite()
@@ -75,13 +84,14 @@ if __name__ == "__main__":
              'hazard': HazardTestCase().test_data,
              'alu': ALUTestCase().test_data,
              'branch': BranchTestCase().test_data,
+             'trap': TrapTestCase().test_data,
              'spr': SPRTestCase().test_data
-            }
+             }
 
     # walk through all tests, those requested get added
     for tname, data in tests.items():
         if tname in testing:
-            suite.addTest(TestRunner(data, svp64=svp64,
+            suite.addTest(TestRunner(data, svp64=svp64, inorder=inorder,
                                      allow_overlap=allow_overlap))
 
     runner = unittest.TextTestRunner()
diff --git a/src/soc/simple/test/test_issuer_linux_5_7.py b/src/soc/simple/test/test_issuer_linux_5_7.py
new file mode 100644 (file)
index 0000000..00a0949
--- /dev/null
@@ -0,0 +1,126 @@
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator.  it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+from soc.experiment.test import pagetables
+
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+    def case_first_vm_enabled(self):
+        lst = [
+               "std 6,0(2)",
+              ]
+
+        # set up regs
+        initial_regs = [0] * 32
+        initial_regs[2] = 0xc0000000005fc190
+        initial_regs[6] = 0x0101
+
+        # memory same as microwatt test
+        initial_mem = pagetables.microwatt_linux_5_7_boot
+
+        # set virtual and non-privileged
+        # msr: 8000000000000011
+        initial_msr = 0 << MSR.PR # must set "problem" state
+        initial_msr |= 1 << MSR.LE # little-endian
+        initial_msr |= 1 << MSR.SF # 64-bit
+        initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+        # set PRTBL to 0xe000000
+        initial_sprs = {720: 0xe000000, # PRTBL
+                        48: 1       # PIDR
+                        } 
+
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem,
+                             initial_sprs=initial_sprs,
+                             initial_msr=initial_msr)
+
+
+    def case_first_vm_enabled_2(self):
+        lst = [
+               "std 6,0(2)",
+              ]
+
+        # set up regs
+        initial_regs = [0] * 32
+        initial_regs[2] = 0xc000000000598000
+        initial_regs[6] = 0x0101
+
+        # memory same as microwatt test
+        initial_mem = pagetables.microwatt_linux_5_7_boot
+
+        # set virtual and non-privileged
+        # msr: 8000000000000011
+        initial_msr = 0 << MSR.PR # must set "problem" state
+        initial_msr |= 1 << MSR.LE # little-endian
+        initial_msr |= 1 << MSR.SF # 64-bit
+        initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+        # set PRTBL to 0xe000000
+        initial_sprs = {720: 0xe00000c, # PRTBL
+                        48: 1       # PIDR
+                        }
+
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem,
+                             initial_sprs=initial_sprs,
+                             initial_msr=initial_msr)
+
+
+if __name__ == "__main__":
+    svp64 = True
+    if len(sys.argv) == 2:
+        if sys.argv[1] == 'nosvp64':
+            svp64 = False
+        sys.argv.pop()
+
+    print ("SVP64 test mode enabled", svp64)
+
+    unittest.main(exit=False)
+    suite = unittest.TestSuite()
+
+    # MMU/DCache integration tests
+    suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+                              microwatt_mmu=True,
+                              rom=pagetables.microwatt_linux_5_7_boot))
+
+    runner = unittest.TextTestRunner()
+    runner.run(suite)
diff --git a/src/soc/simple/test/test_issuer_mmu_ifetch.py b/src/soc/simple/test/test_issuer_mmu_ifetch.py
new file mode 100644 (file)
index 0000000..81f1b32
--- /dev/null
@@ -0,0 +1,114 @@
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator.  it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+    def case_virtual_ld_st(self):
+        lst = ["stb 10,0(2)",
+               "addi 10,0, -4",
+               "stb 10,0(5)",
+               "lhz 6,0(2)",
+              ]
+
+        # set up regs
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x1000000 # hm, was going to do mtspr 720,1 with this
+        initial_regs[2] = 0x3456
+        initial_regs[3] = 0x4321
+        initial_regs[4] = 0x6543
+        initial_regs[5] = 0x3457
+        initial_regs[10] = 0xfe
+
+        # no pre-loaded memory here
+        initial_mem = {}
+
+        # set virtual and non-privileged
+        initial_msr = 0 << MSR.PR # must set "problem" state
+        #initial_msr |= 1 << MSR.DR # set "virtual" state for data
+        initial_msr |= 1 << MSR.IR # set "virtual" state for instructions
+        initial_msr |= 1 << MSR.LE # set little-endian
+
+        # set PRTBL to 0x1000000
+        initial_sprs = {720: 0x1000000} # PRTBL
+
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem,
+                             initial_sprs=initial_sprs,
+                             initial_msr=initial_msr)
+
+    def case_virtual_invalid_no_prtbl(self):
+        """virtual memory test but with no PRTBL set it is expected
+        to throw an "invalid" exception
+        """
+        lst = ["stb 10,0(2)",
+              ]
+
+        # set up regs
+        initial_regs = [0] * 32
+
+        # set virtual and non-privileged
+        initial_msr = 1 << MSR.PR # must set "problem" state
+        initial_msr |= 1 << MSR.DR # set "virtual" state for data
+        initial_msr |= 1 << MSR.IR # set "virtual" state for instructions
+
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_msr=initial_msr,
+                             stop_at_pc=0x400) # stop at this exception addr
+
+if __name__ == "__main__":
+    svp64 = True
+    if len(sys.argv) == 2:
+        if sys.argv[1] == 'nosvp64':
+            svp64 = False
+        sys.argv.pop()
+
+    print ("SVP64 test mode enabled", svp64)
+
+    unittest.main(exit=False)
+    suite = unittest.TestSuite()
+
+    # MMU/DCache integration tests
+    suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+                              microwatt_mmu=True,
+                              rom=pagetables.test1))
+
+    runner = unittest.TextTestRunner()
+    runner.run(suite)
diff --git a/src/soc/simple/test/test_issuer_mmu_microwatt.py b/src/soc/simple/test/test_issuer_mmu_microwatt.py
new file mode 100644 (file)
index 0000000..69fe9ff
--- /dev/null
@@ -0,0 +1,93 @@
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator.  it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+from soc.experiment.test import pagetables
+
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+    def case_microwatt_test_3_mmu_ld(self):
+        lst = [
+               "ld 6,0(2)",
+              ]
+
+        # set up regs
+        initial_regs = [0] * 32
+        initial_regs[2] = 0x124108
+
+        # memory same as microwatt test
+        initial_mem = pagetables.microwatt_test2
+
+        # set virtual and non-privileged
+        # msr: 8000000000000011
+        initial_msr = 0 << MSR.PR # must set "problem" state
+        initial_msr |= 1 << MSR.LE # little-endian
+        initial_msr |= 1 << MSR.SF # 64-bit
+        initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+        # set PRTBL to 0x12000
+        initial_sprs = {720: 0x12000, # PRTBL
+                        48: 1       # PIDR
+                        } 
+
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem,
+                             initial_sprs=initial_sprs,
+                             initial_msr=initial_msr)
+
+
+if __name__ == "__main__":
+    svp64 = True
+    if len(sys.argv) == 2:
+        if sys.argv[1] == 'nosvp64':
+            svp64 = False
+        sys.argv.pop()
+
+    print ("SVP64 test mode enabled", svp64)
+
+    unittest.main(exit=False)
+    suite = unittest.TestSuite()
+
+    # MMU/DCache integration tests
+    suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+                              microwatt_mmu=True,
+                              rom=pagetables.microwatt_test2))
+
+    runner = unittest.TextTestRunner()
+    runner.run(suite)
index 1dd89c533d19d5f9f28c9d190fb48ce30000bf29..29d6acf63a05e1713c5b1c6b086c85b76af46970 100644 (file)
@@ -20,6 +20,8 @@ from soc.fu.compunits.test.test_compunit import (check_sim_memory,
 
 from soc.simple.test.test_runner import setup_i_memory
 
+from pathlib import Path
+
 import sys
 sys.setrecursionlimit(10**6)
 
@@ -36,6 +38,8 @@ class BinaryTestCase(FHDLTestCase):
         with Program("1.bin", bigendian) as program:
             self.run_tst_program(program)
 
+    @unittest.skipUnless(Path("hello_world.bin").exists(),
+                         "missing hello_world.bin")
     def test_binary(self):
         with Program("hello_world.bin", bigendian) as program:
             self.run_tst_program(program)
@@ -62,7 +66,7 @@ class TestRunner(FHDLTestCase):
 
         pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
                              imem_ifacetype='test_bare_wb',
-                             addr_wid=48,
+                             addr_wid=64,
                              mask_wid=8,
                              reg_wid=64,
                              imem_test_depth=32768,
index d33e8fbf6b299ca48ee135e5f1eee4b02126157f..5d6e57da2b1e82025a1604b52aa918927dac4d8a 100644 (file)
@@ -8,6 +8,7 @@ related bugs:
 from nmigen import Module, Signal
 from nmigen.hdl.xfrm import ResetInserter
 from copy import copy
+from pprint import pprint
 
 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
 # Also, check out the cxxsim nmigen branch, and latest yosys from git
@@ -18,6 +19,7 @@ from openpower.decoder.isa.all import ISA
 from openpower.endian import bigendian
 
 from soc.simple.issuer import TestIssuerInternal
+from soc.simple.inorder import TestIssuerInternalInOrder
 
 from soc.simple.test.test_core import (setup_regs, check_regs, check_mem,
                                        wait_for_busy_clear,
@@ -26,20 +28,59 @@ from soc.fu.compunits.test.test_compunit import (setup_tst_memory,
                                                  check_sim_memory)
 from soc.debug.dmi import DBGCore, DBGCtrl, DBGStat
 from nmutil.util import wrap
-from soc.experiment.test.test_mmu_dcache import wb_get
 from openpower.test.state import TestState, StateRunner
 from openpower.test.runner import TestRunnerBase
 
 
-def setup_i_memory(imem, startaddr, instructions):
+def insert_into_rom(startaddr, instructions, rom):
+    print("insn before, init rom", len(instructions))
+    pprint(rom)
+
+    startaddr //= 4  # instructions are 32-bit
+
+    # 64 bit
+    mask = ((1 << 64)-1)
+    for ins in instructions:
+        if isinstance(ins, tuple):
+            insn, code = ins
+        else:
+            insn, code = ins, ''
+        insn = insn & 0xffffffff
+        msbs = (startaddr >> 1) & mask
+        lsb = 1 if (startaddr & 1) else 0
+        print ("insn", hex(insn), hex(msbs), hex(lsb))
+
+        val = rom.get(msbs<<3, 0)
+        if insn != 0:
+            print("before set", hex(4*startaddr),
+                  hex(msbs), hex(val), hex(insn))
+        val = (val | (insn << (lsb*32)))
+        val = val & mask
+        rom[msbs<<3] = val
+        if insn != 0:
+            print("after  set", hex(4*startaddr), hex(msbs), hex(val))
+            print("instr: %06x 0x%x %s %08x" % (4*startaddr, insn, code, val))
+        startaddr += 1
+        startaddr = startaddr & mask
+
+    print ("after insn insert")
+    pprint(rom)
+
+
+def setup_i_memory(imem, startaddr, instructions, rom):
     mem = imem
     print("insn before, init mem", mem.depth, mem.width, mem,
           len(instructions))
-    for i in range(mem.depth):
-        yield mem._array[i].eq(0)
-    yield Settle()
+
+    if not rom:
+        # initialise mem array to zero
+        for i in range(mem.depth):
+            yield mem._array[i].eq(0)
+        yield Settle()
+
     startaddr //= 4  # instructions are 32-bit
     if mem.width == 32:
+        assert rom is None, "cannot do 32-bit from wb_get ROM yet"
         mask = ((1 << 32)-1)
         for ins in instructions:
             if isinstance(ins, tuple):
@@ -64,15 +105,22 @@ def setup_i_memory(imem, startaddr, instructions):
             insn, code = ins, ''
         insn = insn & 0xffffffff
         msbs = (startaddr >> 1) & mask
-        val = yield mem._array[msbs]
+        lsb = 1 if (startaddr & 1) else 0
+
+        if rom: # must put the value into the wb_get area
+            val = rom[msbs<<1]
+        else:
+            val = yield mem._array[msbs]
         if insn != 0:
             print("before set", hex(4*startaddr),
                   hex(msbs), hex(val), hex(insn))
-        lsb = 1 if (startaddr & 1) else 0
         val = (val | (insn << (lsb*32)))
         val = val & mask
-        yield mem._array[msbs].eq(val)
-        yield Settle()
+        if rom: # must put the value into the wb_get area
+            rom[msbs<<1] = val
+        else:
+            yield mem._array[msbs].eq(val)
+            yield Settle()
         if insn != 0:
             print("after  set", hex(4*startaddr), hex(msbs), hex(val))
             print("instr: %06x 0x%x %s %08x" % (4*startaddr, insn, code, val))
@@ -121,17 +169,22 @@ class HDLRunner(StateRunner):
     """HDLRunner:  Implements methods for the setup, preparation, and
     running of tests using nmigen HDL simulation.
     """
+
     def __init__(self, dut, m, pspec):
         super().__init__("hdl", HDLRunner)
 
         self.dut = dut
+        self.pspec = pspec
         self.pc_i = Signal(32)
         self.svstate_i = Signal(64)
 
         #hard_reset = Signal(reset_less=True)
-        self.issuer = TestIssuerInternal(pspec)
+        if pspec.inorder:
+            self.issuer = TestIssuerInternalInOrder(pspec)
+        else:
+            self.issuer = TestIssuerInternal(pspec)
         # use DMI RESET command instead, this does actually work though
-        #issuer = ResetInserter({'coresync': hard_reset,
+        # issuer = ResetInserter({'coresync': hard_reset,
         #                        'sync': hard_reset})(issuer)
         m.submodules.issuer = self.issuer
         self.dmi = self.issuer.dbg.dmi
@@ -142,6 +195,7 @@ class HDLRunner(StateRunner):
 
     def prepare_for_test(self, test):
         self.test = test
+        #print ("preparing for test name", test.name)
 
         # set up bigendian (TODO: don't do this, use MSR)
         yield self.issuer.core_bigendian_i.eq(bigendian)
@@ -151,16 +205,34 @@ class HDLRunner(StateRunner):
         yield
         yield
         yield
+        #print ("end of test preparation", test.name)
 
     def setup_during_test(self):
-        yield from set_dmi(self.dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
+        # first run a manual hard-reset of the debug interface.
+        # core is counting down on a 3-clock delay at this point
+        yield self.issuer.dbg_rst_i.eq(1)
+        yield
+        yield self.issuer.dbg_rst_i.eq(0)
+
+        # now run a DMI-interface reset.  because DMI is running
+        # in dbgsync domain its reset is *NOT* connected to
+        # core reset (hence the dbg_rst_i blip, above)
+        yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
         yield
+        #print("test setup")
 
     def run_test(self, instructions):
         """run_hdl_state - runs a TestIssuer nmigen HDL simulation
         """
 
-        imem = self.issuer.imem._get_memory()
+        #print("starting test")
+
+        if self.dut.rom is None:
+            imem = self.issuer.imem._get_memory()
+            #print("got memory", imem)
+        else:
+            print("skipping memory get due to rom")
+            pprint(self.dut.rom)
         core = self.issuer.core
         dmi = self.issuer.dbg.dmi
         pdecode2 = self.issuer.pdecode2
@@ -172,9 +244,16 @@ class HDLRunner(StateRunner):
         pc = 0  # start address
         counter = 0  # test to pause/start
 
-        yield from setup_i_memory(imem, pc, instructions)
-        yield from setup_tst_memory(l0, self.test.mem)
+        # XXX for now, when ROM (run under wb_get) is detected,
+        # skip setup of memories.  must be done a different way
+        if self.dut.rom is None:
+            yield from setup_i_memory(imem, pc, instructions, self.dut.rom)
+            yield from setup_tst_memory(l0, self.test.mem)
+        else:
+            insert_into_rom(pc, instructions, self.dut.default_mem)
+        print("about to setup regs")
         yield from setup_regs(pdecode2, core, self.test)
+        #print("setup mem and regs done")
 
         # set PC and SVSTATE
         yield self.pc_i.eq(pc)
@@ -190,6 +269,15 @@ class HDLRunner(StateRunner):
 
         print("instructions", instructions)
 
+        # before starting the simulation, set the core stop address to be
+        # just after the last instruction. if a load of an instruction is
+        # requested at this address, the core is immediately put into "halt"
+        # XXX: keep an eye out for in-order problems
+        hard_stop_addr = self.test.stop_at_pc
+        if hard_stop_addr is None:
+            hard_stop_addr = len(instructions)*4
+        yield from set_dmi(dmi, DBGCore.STOPADDR, hard_stop_addr)
+
         # run the loop of the instructions on the current test
         index = (yield self.issuer.cur_state.pc) // 4
         while index < len(instructions):
@@ -202,16 +290,17 @@ class HDLRunner(StateRunner):
                 # start the core
                 yield
                 yield from set_dmi(dmi, DBGCore.CTRL,
-                                1<<DBGCtrl.START)
-                yield self.issuer.pc_i.ok.eq(0) # no change PC after this
-                yield self.issuer.svstate_i.ok.eq(0) # ditto
+                                   1 << DBGCtrl.START)
+                yield self.issuer.pc_i.ok.eq(0)  # no change PC after this
+                yield self.issuer.svstate_i.ok.eq(0)  # ditto
                 yield
                 yield
 
             counter = counter + 1
 
             # wait until executed
-            while not (yield self.issuer.insn_done):
+            while not ((yield self.issuer.insn_done) or
+                       (yield self.issuer.dbg.terminated_o)):
                 yield
 
             # okaaay long story: in overlap mode, PC is updated one cycle
@@ -228,22 +317,29 @@ class HDLRunner(StateRunner):
             if index < len(instructions):
                 # Get HDL mem and state
                 state = yield from TestState("hdl", core, self.dut,
-                                            code)
+                                             code)
                 hdl_states.append(state)
 
             if index >= len(instructions):
-                print ("index over, send dmi stop")
+                print("index over, send dmi stop")
                 # stop at end
-                yield from set_dmi(dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
+                yield from set_dmi(dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
                 yield
                 yield
+                # hmm really should use DMI status check here but hey it's quick
+                while True:
+                    stopped = yield self.issuer.dbg.core_stop_o
+                    if stopped:
+                        break
+                    yield
+                break
 
             terminated = yield self.issuer.dbg.terminated_o
             print("terminated(2)", terminated)
             if terminated:
                 break
 
-        if self.dut.allow_overlap:
+        if self.dut.allow_overlap: # or not self.dut.rom: ??
             # wait until all settled
             # XXX really this should be in DMI, which should in turn
             # use issuer.any_busy to not send back "stopped" signal
@@ -253,13 +349,13 @@ class HDLRunner(StateRunner):
         if self.dut.allow_overlap:
             # get last state, at end of run
             state = yield from TestState("hdl", core, self.dut,
-                                        code)
+                                         code)
             hdl_states.append(state)
 
         return hdl_states
 
     def end_test(self):
-        yield from set_dmi(self.dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
+        yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
         yield
         yield
 
@@ -277,27 +373,30 @@ class HDLRunner(StateRunner):
         xer = yield from get_dmi(self.dmi, DBGCore.XER)
         print("after test %s XER value %x" % (self.test.name, xer))
 
+        # get MSR
+        msr = yield from get_dmi(self.dmi, DBGCore.MSR)
+        print("after test %s MSR value %x" % (self.test.name, msr))
+
         # test of dmi reg get
         for int_reg in range(32):
             yield from set_dmi(self.dmi, DBGCore.GSPR_IDX, int_reg)
             value = yield from get_dmi(self.dmi, DBGCore.GSPR_DATA)
 
             print("after test %s reg %2d value %x" %
-            (self.test.name, int_reg, value))
+                  (self.test.name, int_reg, value))
 
         # pull a reset
-        yield from set_dmi(self.dmi, DBGCore.CTRL, 1<<DBGCtrl.RESET)
+        yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.RESET)
         yield
 
 
 class TestRunner(TestRunnerBase):
     def __init__(self, tst_data, microwatt_mmu=False, rom=None,
-                        svp64=True, run_hdl=True, run_sim=True,
-                        allow_overlap=False):
+                 svp64=True, inorder=False, run_hdl=True, run_sim=True,
+                 allow_overlap=False):
         if run_hdl:
             run_hdl = HDLRunner
         super().__init__(tst_data, microwatt_mmu=microwatt_mmu,
-                        rom=rom,
-                        svp64=svp64, run_hdl=run_hdl, run_sim=run_sim,
-                        allow_overlap=allow_overlap)
-
+                         rom=rom, inorder=inorder,
+                         svp64=svp64, run_hdl=run_hdl, run_sim=run_sim,
+                         allow_overlap=allow_overlap)
index 7da358ea7af1c51e7ce46e8635d39ff3a2f7a0a9..cc62c5da4b4cceb3990c13e1476da72f1f96a8bf 100644 (file)
@@ -19,6 +19,13 @@ class HDLState(State):
         super().__init__()
         self.core = core
 
+    def get_fpregs(self):
+        if False:
+            yield
+        self.fpregs = []
+        for i in range(32):
+            self.fpregs.append(0)
+
     def get_intregs(self):
         self.intregs = []
         for i in range(32):
@@ -54,9 +61,12 @@ class HDLState(State):
         log("class hdl pc", hex(self.pc))
 
     def get_mem(self):
+        self.mem = {}
         # get the underlying HDL-simulated memory from the L0CacheBuffer
+        if hasattr(self.core, "icache"):
+            # err temporarily ignore memory
+            return # XXX have to work out how to deal with wb_get
         hdlmem = get_l0_mem(self.core.l0)
-        self.mem = {}
         for i in range(hdlmem.depth):
             value = yield hdlmem._array[i] # should not really do this
             self.mem[i*8] = value