Allow the formal engine to perform a same-cycle result in the ALU master
authorCesar Strauss <cestrauss@gmail.com>
Sun, 5 Nov 2023 14:18:40 +0000 (11:18 -0300)
committerCesar Strauss <cestrauss@gmail.com>
Sun, 5 Nov 2023 14:18:40 +0000 (11:18 -0300)
This adds an exception to holding o_valid low, when the ALU is idle.
If a write to the ALU just occurred, allow o_valid to become high, in
the same cycle.

341 files changed:
.gitignore
.gitlab-ci.yml
Makefile
conf.py
flake.lock [new file with mode: 0644]
flake.nix [new file with mode: 0644]
mkpinmux.sh
nix/bigfloat.nix [new file with mode: 0644]
nix/c4m-jtag.nix [new file with mode: 0644]
nix/ecp5-program.nix [new file with mode: 0644]
nix/ecp5.nix [new file with mode: 0644]
nix/ieee754fpu.nix [new file with mode: 0644]
nix/litex.toml [new file with mode: 0644]
nix/ls180.nix [new file with mode: 0644]
nix/modgrammar.nix [new file with mode: 0644]
nix/nmutil.nix [new file with mode: 0644]
nix/openpower-isa.nix [new file with mode: 0644]
nix/pinmux.nix [new file with mode: 0644]
nix/soc.nix [new file with mode: 0644]
nix/verilog.nix [new file with mode: 0644]
pinmux
pyproject.toml [new file with mode: 0644]
setup.py
src/soc/bus/external_core.py [new file with mode: 0644]
src/soc/bus/opencores_ethmac.py [new file with mode: 0644]
src/soc/bus/sdr_ctrl.py [new file with mode: 0644]
src/soc/bus/sram.py
src/soc/bus/syscon.py [new file with mode: 0644]
src/soc/bus/tercel.py [new file with mode: 0644]
src/soc/bus/test/wb_rw.py
src/soc/bus/uart_16550.py [new file with mode: 0644]
src/soc/bus/wb_async.py [new file with mode: 0644]
src/soc/bus/wb_downconvert.py
src/soc/config/ifetch.py
src/soc/config/pinouts.py
src/soc/config/test/test_fetch.py
src/soc/config/test/test_loadstore.py
src/soc/config/test/test_pi2ls.py
src/soc/debug/.gitignore [new file with mode: 0644]
src/soc/debug/dmi.py
src/soc/debug/jtagutils.py
src/soc/debug/test/test_jtag_tap.py
src/soc/debug/test/test_jtag_tap_srv.py
src/soc/experiment/alu_fsm.py
src/soc/experiment/alu_hier.py
src/soc/experiment/cache_ram.py
src/soc/experiment/compalu.py
src/soc/experiment/compalu_multi.py
src/soc/experiment/compldst_multi.py
src/soc/experiment/cscore.py
src/soc/experiment/dcache.py
src/soc/experiment/formal/proof_alu_fsm.py
src/soc/experiment/formal/proof_compalu_multi.py [new file with mode: 0644]
src/soc/experiment/icache.py
src/soc/experiment/imem.py
src/soc/experiment/l0_cache.py
src/soc/experiment/lsmem.py
src/soc/experiment/mmu.py
src/soc/experiment/pi2ls.py
src/soc/experiment/pimem.py
src/soc/experiment/plru.py
src/soc/experiment/radix_walk_example.txt
src/soc/experiment/score6600.py
src/soc/experiment/score6600_multi.py
src/soc/experiment/sim.py
src/soc/experiment/test/pagetables.py [new file with mode: 0644]
src/soc/experiment/test/test_compalu_multi.py
src/soc/experiment/test/test_compldst_multi.py
src/soc/experiment/test/test_compldst_multi_mmu.py [new file with mode: 0644]
src/soc/experiment/test/test_compldst_multi_mmu_fsm.py [new file with mode: 0644]
src/soc/experiment/test/test_dcache.py
src/soc/experiment/test/test_dcache_tlb.py
src/soc/experiment/test/test_dcbz_pi.py [new file with mode: 0644]
src/soc/experiment/test/test_l0_cache_buffer2.py
src/soc/experiment/test/test_ldst_pi.py
src/soc/experiment/test/test_ldst_pi_misalign.py
src/soc/experiment/test/test_loadstore1.py [new file with mode: 0644]
src/soc/experiment/test/test_mmu_dcache.py
src/soc/experiment/test/test_mmu_dcache_pi.py
src/soc/experiment/test/test_wishbone.py [new file with mode: 0644]
src/soc/fu/alu/formal/proof_input_stage.py
src/soc/fu/alu/formal/proof_main_stage.py
src/soc/fu/alu/formal/proof_output_stage.py
src/soc/fu/alu/main_stage.py
src/soc/fu/alu/output_stage.py
src/soc/fu/alu/pipe_data.py
src/soc/fu/alu/pipeline.py
src/soc/fu/alu/test/test_pipe_caller.py
src/soc/fu/branch/formal/proof_input_stage.py
src/soc/fu/branch/formal/proof_main_stage.py
src/soc/fu/branch/pipe_data.py
src/soc/fu/branch/pipeline.py
src/soc/fu/branch/test/test_pipe_caller.py
src/soc/fu/common_output_stage.py
src/soc/fu/compunits/compunits.py
src/soc/fu/compunits/formal/proof_fu.py
src/soc/fu/compunits/formal/test_compunit.py
src/soc/fu/compunits/test/test_compunit.py
src/soc/fu/compunits/test/test_div_compunit.py
src/soc/fu/cr/formal/proof_main_stage.py
src/soc/fu/cr/pipe_data.py
src/soc/fu/cr/test/test_pipe_caller.py
src/soc/fu/div/core_stages.py
src/soc/fu/div/experiment/__init__.py [new file with mode: 0644]
src/soc/fu/div/experiment/goldschmidt_div_sqrt.py [new file with mode: 0644]
src/soc/fu/div/experiment/test/__init__.py [new file with mode: 0644]
src/soc/fu/div/experiment/test/test_goldschmidt_div_sqrt.py [new file with mode: 0644]
src/soc/fu/div/fsm.py
src/soc/fu/div/output_stage.py
src/soc/fu/div/pipe_data.py
src/soc/fu/div/pipeline.py
src/soc/fu/div/setup_stage.py
src/soc/fu/div/test/helper.py
src/soc/fu/div/test/test_pipe_ilang.py
src/soc/fu/ldst/ldst_input_record.py
src/soc/fu/ldst/loadstore.py
src/soc/fu/ldst/pipe_data.py
src/soc/fu/logical/bpermd.py
src/soc/fu/logical/formal/proof_input_stage.py
src/soc/fu/logical/formal/proof_main_stage.py
src/soc/fu/logical/main_stage.py
src/soc/fu/logical/output_stage.py
src/soc/fu/logical/pipe_data.py
src/soc/fu/logical/pipeline.py
src/soc/fu/logical/popcount.py
src/soc/fu/logical/test/test_pipe_caller.py
src/soc/fu/mmu/fsm.py
src/soc/fu/mmu/mmu_input_record.py
src/soc/fu/mmu/pipe_data.py
src/soc/fu/mmu/test/test_issuer_mmu_data_path.py
src/soc/fu/mmu/test/test_non_production_core.py
src/soc/fu/mmu/test/test_pipe_caller.py
src/soc/fu/mul/formal/proof_main_stage.py
src/soc/fu/mul/main_stage.py
src/soc/fu/mul/pipe_data.py
src/soc/fu/mul/post_stage.py
src/soc/fu/mul/pre_stage.py
src/soc/fu/mul/test/helper.py
src/soc/fu/mul/test/test_pipe_caller_long.py
src/soc/fu/mul/test/test_pipe_ilang.py
src/soc/fu/pipe_data.py
src/soc/fu/regspec.py
src/soc/fu/shift_rot/formal/proof_main_stage.py
src/soc/fu/shift_rot/main_stage.py
src/soc/fu/shift_rot/pipe_data.py
src/soc/fu/shift_rot/pipeline.py
src/soc/fu/shift_rot/rotator.py
src/soc/fu/shift_rot/test/test_maskgen.py
src/soc/fu/shift_rot/test/test_pipe_caller.py
src/soc/fu/spr/formal/proof_main_stage.py
src/soc/fu/spr/main_stage.py
src/soc/fu/spr/pipe_data.py
src/soc/fu/spr/test/test_pipe_caller.py
src/soc/fu/trap/formal/proof_main_stage.py
src/soc/fu/trap/main_stage.py
src/soc/fu/trap/pipe_data.py
src/soc/fu/trap/test/test_pipe_caller.py
src/soc/fu/trap/trap_input_record.py
src/soc/interrupts/xics.py
src/soc/litex/florent
src/soc/minerva/units/fetch.py
src/soc/minerva/units/loadstore.py
src/soc/minerva/wishbone.py
src/soc/regfile/regfile.py
src/soc/regfile/regfiles.py
src/soc/regfile/sram_wrapper.py [new file with mode: 0644]
src/soc/regfile/virtual_port.py
src/soc/scoreboard/addr_match.py
src/soc/scoreboard/addr_split.py
src/soc/scoreboard/dependence_cell.py
src/soc/scoreboard/fn_unit.py
src/soc/scoreboard/fu_fu_matrix.py
src/soc/scoreboard/fu_mem_matrix.py
src/soc/scoreboard/fu_reg_matrix.py
src/soc/scoreboard/fu_wr_pending.py
src/soc/scoreboard/group_picker.py
src/soc/scoreboard/instruction_q.py
src/soc/scoreboard/ldst_matrix.py
src/soc/scoreboard/mdm.py
src/soc/scoreboard/mem_dependence_cell.py
src/soc/scoreboard/mem_fu_matrix.py
src/soc/scoreboard/memfu.py
src/soc/scoreboard/reg_select.py
src/soc/scoreboard/shadow.py
src/soc/scoreboard/test_iq.py
src/soc/scoreboard/test_mem2_fu_matrix.py
src/soc/scoreboard/test_mem_fu_matrix.py
src/soc/simple/core.py
src/soc/simple/core_data.py [new file with mode: 0644]
src/soc/simple/inorder.py [new file with mode: 0644]
src/soc/simple/issuer.py
src/soc/simple/issuer_verilog.py
src/soc/simple/test/test_core.py
src/soc/simple/test/test_issuer.py
src/soc/simple/test/test_issuer_dcache.py [new file with mode: 0644]
src/soc/simple/test/test_issuer_linux_5_7.py [new file with mode: 0644]
src/soc/simple/test/test_issuer_mmu.py
src/soc/simple/test/test_issuer_mmu_ifetch.py [new file with mode: 0644]
src/soc/simple/test/test_issuer_mmu_microwatt.py [new file with mode: 0644]
src/soc/simple/test/test_microwatt.py
src/soc/simple/test/test_runner.py
src/soc/simple/test/teststate.py [new file with mode: 0644]
src/unused/TLB/.gitignore [deleted file]
src/unused/TLB/AddressEncoder.py [deleted file]
src/unused/TLB/Cam.py [deleted file]
src/unused/TLB/CamEntry.py [deleted file]
src/unused/TLB/LFSR.py [deleted file]
src/unused/TLB/LFSR.pyi [deleted file]
src/unused/TLB/Makefile [deleted file]
src/unused/TLB/MemorySet.py [deleted file]
src/unused/TLB/PermissionValidator.py [deleted file]
src/unused/TLB/PteEntry.py [deleted file]
src/unused/TLB/SetAssociativeCache.py [deleted file]
src/unused/TLB/TLB.py [deleted file]
src/unused/TLB/__init__.py [deleted file]
src/unused/TLB/ariane/TreePLRU.cpp [deleted file]
src/unused/TLB/ariane/__init__.py [deleted file]
src/unused/TLB/ariane/exceptcause.py [deleted file]
src/unused/TLB/ariane/miss_handler.py [deleted file]
src/unused/TLB/ariane/mmu.py [deleted file]
src/unused/TLB/ariane/p_lru.txt [deleted file]
src/unused/TLB/ariane/plru.py [deleted file]
src/unused/TLB/ariane/ptw.py [deleted file]
src/unused/TLB/ariane/test/__init__.py [deleted file]
src/unused/TLB/ariane/test/test_plru.py [deleted file]
src/unused/TLB/ariane/test/test_ptw.py [deleted file]
src/unused/TLB/ariane/test/test_tlb.py [deleted file]
src/unused/TLB/ariane/test/test_tlb_content.py [deleted file]
src/unused/TLB/ariane/tlb.py [deleted file]
src/unused/TLB/ariane/tlb_content.py [deleted file]
src/unused/TLB/test/__init__.py [deleted file]
src/unused/TLB/test/test_LFSR2.py [deleted file]
src/unused/TLB/test/test_address_encoder.py [deleted file]
src/unused/TLB/test/test_cam.py [deleted file]
src/unused/TLB/test/test_cam_entry.py [deleted file]
src/unused/TLB/test/test_permission_validator.py [deleted file]
src/unused/TLB/test/test_pte_entry.py [deleted file]
src/unused/TLB/test/test_set_associative_cache.py [deleted file]
src/unused/TLB/test/test_tlb.py [deleted file]
src/unused/__init__.py [deleted file]
src/unused/experiment/l0_cache.py [deleted file]
src/unused/iommu/__init__.py [deleted file]
src/unused/iommu/axi_rab/__init__.py [deleted file]
src/unused/iommu/axi_rab/axi4_ar_buffer.py [deleted file]
src/unused/iommu/axi_rab/axi4_ar_sender.py [deleted file]
src/unused/iommu/axi_rab/axi4_aw_buffer.py [deleted file]
src/unused/iommu/axi_rab/axi4_aw_sender.py [deleted file]
src/unused/iommu/axi_rab/axi4_b_buffer.py [deleted file]
src/unused/iommu/axi_rab/axi4_b_sender.py [deleted file]
src/unused/iommu/axi_rab/axi4_r_buffer.py [deleted file]
src/unused/iommu/axi_rab/axi4_r_sender.py [deleted file]
src/unused/iommu/axi_rab/axi4_w_buffer.py [deleted file]
src/unused/iommu/axi_rab/axi4_w_sender.py [deleted file]
src/unused/iommu/axi_rab/axi_buffer_rab.py [deleted file]
src/unused/iommu/axi_rab/axi_buffer_rab_bram.py [deleted file]
src/unused/iommu/axi_rab/axi_rab_cfg.py [deleted file]
src/unused/iommu/axi_rab/axi_rab_top.py [deleted file]
src/unused/iommu/axi_rab/check_ram.py [deleted file]
src/unused/iommu/axi_rab/coreconfig.py [deleted file]
src/unused/iommu/axi_rab/fsm.py [deleted file]
src/unused/iommu/axi_rab/l2_tlb.py [deleted file]
src/unused/iommu/axi_rab/rab_core.py [deleted file]
src/unused/iommu/axi_rab/rab_slice.py [deleted file]
src/unused/iommu/axi_rab/ram_tp_no_change.py [deleted file]
src/unused/iommu/axi_rab/ram_tp_write_first.py [deleted file]
src/unused/iommu/axi_rab/slice_top.py [deleted file]
src/unused/iommu/axi_rab/test/__init__.py [deleted file]
src/unused/iommu/axi_rab/test/test_ram_tp_no_change.py [deleted file]
src/unused/iommu/axi_rab/test/test_slice_top.py [deleted file]
src/unused/simulator/__init__.py [deleted file]
src/unused/simulator/internalop_sim.py [deleted file]
src/unused/simulator/test_sim.py [deleted file]
unused_please_ignore_completely/TLB/.gitignore [new file with mode: 0644]
unused_please_ignore_completely/TLB/AddressEncoder.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/Cam.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/CamEntry.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/LFSR.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/LFSR.pyi [new file with mode: 0644]
unused_please_ignore_completely/TLB/Makefile [new file with mode: 0644]
unused_please_ignore_completely/TLB/MemorySet.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/PermissionValidator.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/PteEntry.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/SetAssociativeCache.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/TLB.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/TreePLRU.cpp [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/exceptcause.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/miss_handler.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/mmu.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/p_lru.txt [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/plru.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/ptw.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/test/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/test/test_plru.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/test/test_ptw.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/test/test_tlb.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/test/test_tlb_content.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/tlb.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/tlb_content.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_LFSR2.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_address_encoder.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_cam.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_cam_entry.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_permission_validator.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_pte_entry.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_set_associative_cache.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_tlb.py [new file with mode: 0644]
unused_please_ignore_completely/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/experiment/l0_cache.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_ar_buffer.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_ar_sender.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_aw_buffer.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_aw_sender.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_b_buffer.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_b_sender.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_r_buffer.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_r_sender.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_w_buffer.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_w_sender.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi_buffer_rab.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi_buffer_rab_bram.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi_rab_cfg.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi_rab_top.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/check_ram.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/coreconfig.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/fsm.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/l2_tlb.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/rab_core.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/rab_slice.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/ram_tp_no_change.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/ram_tp_write_first.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/slice_top.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/test/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/test/test_ram_tp_no_change.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/test/test_slice_top.py [new file with mode: 0644]
unused_please_ignore_completely/simulator/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/simulator/internalop_sim.py [new file with mode: 0644]

index d48dc7ff1f94bf7d776c91bb0774f1be6d66c98f..916979dfa880dc64cffd5b66439e5c1cf0602c8b 100644 (file)
@@ -10,9 +10,10 @@ Waveforms
 *.il
 **/*.gtkw
 .eggs
-
+formal_test_temp
 .vscode/*
 build
 gen
 .noseids
 nosetests.xml
+test-out
index 5f359a060a4ae697c18535b1d6379f52556b1f22..c57c2d547bf19eb76421deb3cb99e33e9d883a32 100644 (file)
@@ -5,10 +5,12 @@ cache:
         - ccache
         - .cache/pip
         - apt-cache
+    when: 'always'
 
 variables:
     PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
     GIT_SUBMODULE_STRATEGY: recursive
+    GIT_DEPTH: "500"
 
 build:
     stage: build
@@ -18,11 +20,29 @@ build:
         - apt-get -o dir::cache::archives="$(pwd)/apt-cache" update
         - >-
             apt-get -o dir::cache::archives="$(pwd)/apt-cache" -y install
-            build-essential git python3-dev python3-pip
-            python3-setuptools python3-wheel pkg-config tcl-dev
-            libreadline-dev bison flex libffi-dev ccache python3-venv
-            binutils-powerpc64-linux-gnu binutils-powerpc64le-linux-gnu
-            autoconf gperf libgmp-dev libmpfr-dev libssl-dev curl
+            build-essential
+            git
+            python3-dev
+            python3-pip
+            python3-setuptools
+            python3-setuptools-scm
+            python3-wheel
+            pkg-config
+            tcl-dev
+            libreadline-dev
+            bison
+            flex
+            libffi-dev
+            ccache
+            python3-venv
+            binutils-powerpc64-linux-gnu
+            binutils-powerpc64le-linux-gnu
+            autoconf
+            gperf
+            libgmp-dev
+            libmpfr-dev
+            libssl-dev
+            curl
         - export PATH="/usr/lib/ccache:$PATH"
         - export CCACHE_BASEDIR="$PWD"
         - export CCACHE_DIR="$PWD/ccache"
@@ -31,43 +51,65 @@ build:
         - ccache --show-stats || true
         - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
         - source $HOME/.cargo/env
-    after_script:
-        - export CCACHE_DIR="$PWD/ccache"
-        - ccache --show-stats
     script:
-        - python3 -m venv .env
+        - python3 -m venv --system-site-packages .env
         - . .env/bin/activate
-        - pip install nose
+        - pip install pytest-xdist==3.3.1 pytest==7.3.1
+
+        - git clone --depth 1 -b v0.1.1 https://github.com/cocotb/cocotb-bus.git cocotb-bus
+        - pushd cocotb-bus
+        - pip install . --no-deps
+        - popd
+
+        - git clone --depth 1 -b v1.5.2 https://github.com/cocotb/cocotb.git cocotb
+        - pushd cocotb
+        - pip install .
+        - popd
+
+        - git clone --depth 1 https://git.libre-soc.org/git/pytest-output-to-files.git pytest-output-to-files
+        - pushd pytest-output-to-files
+        - git rev-parse HEAD
+        - python3 setup.py develop
+        - popd
 
-        - git clone --depth 1 https://github.com/SRI-CSL/yices2.git yices2
+        - git clone --depth 1 -b Yices-2.6.4 https://github.com/SRI-CSL/yices2.git yices2
         - pushd yices2
         - autoconf
         - ./configure
-        - make -j$(nproc) > /dev/null
+        - make -j$(nproc)
         - make install
         - popd
 
-        - git clone --depth 1 https://github.com/YosysHQ/yosys.git yosys
+        - git clone --depth 1 -b yosys-0.17 https://github.com/YosysHQ/yosys.git yosys
         - pushd yosys
         - make config-gcc
-        - make -j$(nproc) > /dev/null
+        - make -j$(nproc)
         - make install
         - popd
         - yosys -V
 
-        - git clone --depth 1 https://github.com/YosysHQ/SymbiYosys.git SymbiYosys
+        - git clone https://github.com/YosysHQ/SymbiYosys.git SymbiYosys
         - pushd SymbiYosys
-        - make install > /dev/null
+        - git checkout d10e472edf4ea9be3aa6347b264ba575fbea933a
+        - make install
         - popd
 
-        - git clone --depth 1 https://github.com/nmigen/nmigen.git nmigen
+        - git clone --depth 1 https://gitlab.com/nmigen/nmigen.git nmigen
         - pushd nmigen
-        - python setup.py develop
+        - git rev-parse HEAD
+        - python3 setup.py develop
+        - popd
+
+        - git clone --depth 1 https://git.libre-soc.org/git/mdis.git mdis
+        - pushd mdis
+        - git rev-parse HEAD
+        - python3 setup.py develop
         - popd
 
         - git clone --depth 1 https://git.libre-soc.org/git/nmutil.git nmutil
         - pushd nmutil
-        - python setup.py develop
+        - git rev-parse HEAD
+        - python3 setup.py develop
         - popd
 
         - git clone --depth 1 https://git.libre-soc.org/git/nmigen-soc.git nmigen-soc
@@ -84,9 +126,7 @@ build:
         - git clone --depth 1 https://git.libre-soc.org/git/openpower-isa.git openpower-isa
         - pushd openpower-isa
         - python3 setup.py develop
-        - make -j$(nproc) svanalysis > /dev/null
-        - make -j$(nproc) pyfnwriter > /dev/null 2>&1
-        - make -j$(nproc) pywriter > /dev/null 2>&1
+        - if ! out="$(make 2>&1)"; then echo "$out"; exit 1; fi
         - popd
 
         - git clone --depth 1 https://git.libre-soc.org/git/c4m-jtag.git c4m-jtag
@@ -95,8 +135,9 @@ build:
         - popd
 
         - IEEE754FPU_PATH="$(pwd)"/ieee754fpu
-        - git clone --depth 1 --recursive https://github.com/billzorn/sfpy.git sfpy
+        - git clone --depth 1 --recursive -b v0.6.0 https://github.com/billzorn/sfpy.git sfpy
         - pushd sfpy
+        - git apply "$IEEE754FPU_PATH"/sfpy.patch
         - pushd berkeley-softfloat-3
         - git apply "$IEEE754FPU_PATH"/berkeley-softfloat.patch
         - popd
@@ -104,14 +145,14 @@ build:
         - git apply ../softposit_sfpy_build.patch
         - git apply "$IEEE754FPU_PATH"/SoftPosit.patch
         - popd
-        - pip install --upgrade -r requirements.txt
+        - pip install -r requirements.txt
         - make lib -j$(nproc)
         - make cython -j$(nproc)
         - make wheel -j$(nproc)
-        - pip install dist/sfpy*.whl
+        - pip install --force-reinstall dist/sfpy*.whl
         - popd
 
-        - cargo install maturin
+        - python3 -m pip install 'maturin>=0.11,<0.12'
         - git clone --depth 1 https://git.libre-soc.org/git/power-instruction-analyzer.git pia
         - pushd pia
         - maturin build --cargo-extra-args=--features=python-extension
@@ -119,4 +160,4 @@ build:
         - popd
 
         - python setup.py develop
-        - nosetests -v --processes=-1 --process-timeout=120
+        - SILENCELOG='!*,default' pytest -v --maxfail=20
index 909a463a0d605f97d2c931c5e65015ca1d1b1406..15670cf8b3babf7f8e0991cd3e5100fecb68d273 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -11,15 +11,15 @@ mkpinmux:
        cp pinmux/ls180/ls180_pins.py src/soc/debug
        cp pinmux/ls180/ls180_pins.py src/soc/litex/florent/libresoc
 
-install: gitupdate develop mkpinmux svanalysis
+install: gitupdate develop mkpinmux
 
 # this is now actually part of openpower-isa repository
 pywriter:
-       pywriter
+       echo "pywriter is part of openpower-isa, run that instead"
 
 # this is now actually part of openpower-isa repository
 svanalysis:
-       svanalysis
+       echo "sv_analysis is part of openpower-isa, run that instead"
 
 develop:
        python3 setup.py develop # yes, develop, not install
@@ -56,6 +56,43 @@ ls180_4k_verilog:
                --enable-xics --enable-sram4x4kblock --disable-svp64 \
                        src/soc/litex/florent/libresoc/libresoc.v
 
+# build microwatt "external core", note that the TLB set size is set to 16
+# for I/D-Cache which needs a corresponding alteration of the device-tree
+# entries for linux
+microwatt_external_core:
+       python3 src/soc/simple/issuer_verilog.py --microwatt-compat --enable-mmu \
+            external_core_top.v
+
+# build microwatt "external core" with fixed 64-bit width SVP64
+# note that the TLB set size is set to 16
+# for I/D-Cache which needs a corresponding alteration of the device-tree
+# entries for linux
+microwatt_external_core_svp64:
+       python3 src/soc/simple/issuer_verilog.py --microwatt-compat-svp64 --enable-mmu \
+            external_core_top.v
+
+microwatt_external_core_spi:
+       python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+            --small-cache \
+            --enable-mmu \
+            --pc-reset 0x10000000 \
+            external_core_top.v
+
+# microwatt-compatible core with smaller cache size (quick. VERSA_ECP5. just)
+microwatt_external_core_bram:
+       python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+            --small-cache \
+            --enable-mmu \
+            --pc-reset 0xFF000000 \
+            external_core_top.v
+
+# microwatt-compatible core with larger cache size (experiment on arty)
+microwatt_external_core_bram_arty:
+       python3 src/soc/simple/issuer_verilog.py --microwatt-compat \
+            --enable-mmu \
+            --pc-reset 0xFF000000 \
+            external_core_top.v
+
 # build the litex libresoc SoC without 4k SRAMs
 ls180_verilog_build: ls180_verilog
        make -C soc/soc/litex/florent ls180
diff --git a/conf.py b/conf.py
index 12b29a4fb10659843b17c069255fcc3199cc77ba..d752f59ef042ac4b8d5f42bcebdf50308f0ee585 100644 (file)
--- a/conf.py
+++ b/conf.py
@@ -47,7 +47,7 @@ extensions = [
     'sphinx.ext.coverage',
     'recommonmark',
     #'symbolator_sphinx',
-    'sphinxcontrib_verilog_diagrams',
+    #'sphinxcontrib_verilog_diagrams', # XXX now spinxcontrib-hdl-diagrams
     'sphinx_rtd_theme',
     #'sphinx_tabs.tabs',
 ]
diff --git a/flake.lock b/flake.lock
new file mode 100644 (file)
index 0000000..8193dbc
--- /dev/null
@@ -0,0 +1,131 @@
+{
+  "nodes": {
+    "c4m-jtag": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1619101523,
+        "narHash": "sha256-y1OY8URcE1lnu5L7IDFcJ8zT8sqlrfMP9VPNmVvACGk=",
+        "ref": "master",
+        "rev": "c2bf4810f9f91ced7fcda777b92b86ab353da288",
+        "revCount": 146,
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/c4m-jtag.git"
+      },
+      "original": {
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/c4m-jtag.git"
+      }
+    },
+    "migen": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1631614362,
+        "narHash": "sha256-BgYf4e7O/rbS5P1ZpDlcgCEUh2h2vK3FyHADdzyaMg0=",
+        "owner": "m-labs",
+        "repo": "migen",
+        "rev": "7bc4eb1387b39159a74c1dbd1b820728e0bfbbaa",
+        "type": "github"
+      },
+      "original": {
+        "owner": "m-labs",
+        "repo": "migen",
+        "type": "github"
+      }
+    },
+    "nix-litex": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1632150297,
+        "narHash": "sha256-ghlAJBZxLVkQB+9tXEOBOF1FfdT5Pn4292khF4iKCNA=",
+        "ref": "main",
+        "rev": "5ab6984eb1efad0c91d808c9b7b79e00e50ccc05",
+        "revCount": 31,
+        "type": "git",
+        "url": "https://git.sr.ht/~lschuermann/nix-litex"
+      },
+      "original": {
+        "ref": "main",
+        "type": "git",
+        "url": "https://git.sr.ht/~lschuermann/nix-litex"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1631723418,
+        "narHash": "sha256-Sbey1S81fXUKcEHVCMwlXMju/IoCQxMwP1PPkVYpGrc=",
+        "owner": "L-as",
+        "repo": "nixpkgs",
+        "rev": "8bfc1026477692b933df6eeec27bd494cac3e436",
+        "type": "github"
+      },
+      "original": {
+        "owner": "L-as",
+        "ref": "libresoc",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "nmigen": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1618220900,
+        "narHash": "sha256-Ol2SMZLUTikZWDLmK7F5lZuKBfGO71WmisATPNMTpHQ=",
+        "ref": "master",
+        "rev": "d824795c2c7cb43dcbc8ed8fac6d309d77284913",
+        "revCount": 1056,
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/nmigen.git"
+      },
+      "original": {
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/nmigen.git"
+      }
+    },
+    "nmigen-soc": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1601572554,
+        "narHash": "sha256-v9SH+KuIPydXCr363RUsMg9/tabuu+GjKPJOKq2Jze0=",
+        "ref": "master",
+        "rev": "692017c7eaf21ff37302790c4422db6bd08667be",
+        "revCount": 48,
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/nmigen-soc.git"
+      },
+      "original": {
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/nmigen-soc.git"
+      }
+    },
+    "root": {
+      "inputs": {
+        "c4m-jtag": "c4m-jtag",
+        "migen": "migen",
+        "nix-litex": "nix-litex",
+        "nixpkgs": "nixpkgs",
+        "nmigen": "nmigen",
+        "nmigen-soc": "nmigen-soc",
+        "yosys": "yosys"
+      }
+    },
+    "yosys": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1617979565,
+        "narHash": "sha256-M8ppe+lL/pgd2sXh7bM6/sbk1099KKECeWA5mXtqE6Y=",
+        "owner": "YosysHQ",
+        "repo": "yosys",
+        "rev": "a58571d0fe8971cb7d3a619a31b2c21be6d75bac",
+        "type": "github"
+      },
+      "original": {
+        "owner": "YosysHQ",
+        "repo": "yosys",
+        "rev": "a58571d0fe8971cb7d3a619a31b2c21be6d75bac",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644 (file)
index 0000000..90a976c
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,98 @@
+{
+  description = "FOSS CPU/GPU/VPU/SoC all in one, see https://libre-soc.org/";
+
+  inputs.nixpkgs.url = "github:L-as/nixpkgs?ref=libresoc"; # for alliance and migen
+  inputs.c4m-jtag.url = "git+https://git.libre-soc.org/git/c4m-jtag.git";
+  inputs.c4m-jtag.flake = false;
+  inputs.nmigen.url = "git+https://git.libre-soc.org/git/nmigen.git";
+  inputs.nmigen.flake = false;
+  inputs.nmigen-soc.url = "git+https://git.libre-soc.org/git/nmigen-soc.git";
+  inputs.nmigen-soc.flake = false;
+  inputs.migen.url = "github:m-labs/migen";
+  inputs.migen.flake = false;
+  inputs.yosys.url = "github:YosysHQ/yosys?rev=a58571d0fe8971cb7d3a619a31b2c21be6d75bac";
+  inputs.yosys.flake = false;
+  # submodules needed
+  inputs.nix-litex.url = "git+https://git.sr.ht/~lschuermann/nix-litex?ref=main";
+  inputs.nix-litex.flake = false;
+
+  outputs = { self, nixpkgs, c4m-jtag, nmigen, nmigen-soc, nix-litex, migen, yosys }:
+    let
+      getv = x: builtins.substring 0 8 x.lastModifiedDate;
+
+      supportedSystems = [ "x86_64-linux" "x86_64-darwin" "aarch64-linux" "aarch64-darwin" ];
+
+      forAllSystems = nixpkgs.lib.genAttrs supportedSystems;
+
+      litex = pkgs: import "${nix-litex}/pkgs" {
+        inherit pkgs;
+        pkgMetas = builtins.fromTOML (builtins.readFile ./nix/litex.toml);
+        skipChecks = true; # FIXME: remove once checks work
+      };
+
+      nixpkgsFor = forAllSystems (system: import nixpkgs { inherit system; overlays = [ self.overlay ]; });
+
+      lib = nixpkgs.lib;
+    in
+    {
+      overlay = final: prev: {
+        python37 = prev.python37.override {
+          packageOverrides = lib.composeExtensions (litex final).pythonOverlay (pfinal: pprev: {
+            libresoc-ieee754fpu = pfinal.callPackage ./nix/ieee754fpu.nix {};
+            libresoc-openpower-isa = pfinal.callPackage ./nix/openpower-isa.nix {};
+            c4m-jtag = pfinal.callPackage (import ./nix/c4m-jtag.nix { src = c4m-jtag; version = getv c4m-jtag; }) {};
+            bigfloat = pfinal.callPackage ./nix/bigfloat.nix {};
+            modgrammar = pfinal.callPackage ./nix/modgrammar.nix {};
+            libresoc-nmutil = pfinal.callPackage ./nix/nmutil.nix {};
+            libresoc-soc = pfinal.callPackage (import ./nix/soc.nix { version = getv self; }) {};
+
+            nmigen-soc = pprev.nmigen-soc.overrideAttrs (_: {
+              doCheck = false;
+              src = nmigen-soc;
+              setuptoolsCheckPhase = "true";
+            });
+
+            nmigen = pprev.nmigen.overrideAttrs (_: {
+              src = nmigen;
+            });
+
+            migen = pprev.migen.overrideAttrs (_: {
+              src = migen;
+            });
+          });
+        };
+
+        yosys = prev.yosys.overrideAttrs (_: {
+          version = "0.9+4052";
+          src = yosys;
+        });
+
+        libresoc-verilog = final.callPackage (import ./nix/verilog.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+        libresoc-ls180 = final.callPackage (import ./nix/ls180.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+        libresoc-ecp5 = final.callPackage (import ./nix/ecp5.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+        libresoc-ecp5-program = final.callPackage (import ./nix/ecp5-program.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+        libresoc-pinmux = final.callPackage (import ./nix/pinmux.nix { version = getv self; }) {};
+      };
+
+      apps = forAllSystems (system: {
+        ecp5 = {
+          type = "app";
+          program = "${nixpkgsFor.${system}.libresoc-ecp5-program}";
+        };
+      });
+      defaultApp = forAllSystems (system: self.apps.${system}.ecp5);
+
+      packages = forAllSystems (system: {
+        soc = nixpkgsFor.${system}.python37Packages.libresoc-soc;
+        verilog = nixpkgsFor.${system}.libresoc-verilog;
+        pinmux = nixpkgsFor.${system}.libresoc-pinmux;
+        ls180 = nixpkgsFor.${system}.libresoc-ls180;
+        ecp5 = nixpkgsFor.${system}.libresoc-ecp5;
+        ecp5-program = nixpkgsFor.${system}.libresoc-ecp5-program;
+        openpower-isa = nixpkgsFor.${system}.python37Packages.libresoc-openpower-isa;
+        debugNixpkgs = nixpkgsFor.${system};
+      });
+
+      defaultPackage = forAllSystems (system: self.packages.${system}.verilog);
+    };
+}
index b122611c5764140fec7bfa6876d366322043f3a9..c98e48044dfcf9019930997720ac5b431be7ac53 100755 (executable)
@@ -1,3 +1,5 @@
 #!/bin/sh
 cd pinmux
 python2 src/pinmux_generator.py -v -s ls180 -o ls180
+# temporary - return to older version of pinmux
+#python2 src/pinmux_generator.py -v -s ngi_router -o ngi_router
diff --git a/nix/bigfloat.nix b/nix/bigfloat.nix
new file mode 100644 (file)
index 0000000..4355ef0
--- /dev/null
@@ -0,0 +1,21 @@
+{ lib, buildPythonPackage, fetchPypi, gmp, mpfr, six }:
+
+buildPythonPackage rec {
+  pname = "bigfloat";
+  version = "0.4.0";
+
+  buildInputs = [ gmp mpfr ];
+  propagatedBuildInputs = [ six ];
+
+  src = fetchPypi {
+    inherit pname version;
+    sha256 = "WLlr3ocqylmJ0T2C66Os8qoblOIhF91yoWulkRsMDLg=";
+  };
+
+  doCheck = false;
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/bigfloat/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/c4m-jtag.nix b/nix/c4m-jtag.nix
new file mode 100644 (file)
index 0000000..cf301c6
--- /dev/null
@@ -0,0 +1,24 @@
+{ version, src }:
+
+{ lib, python, buildPythonPackage, nmigen-soc, nmigen, modgrammar, setuptools-scm }:
+
+buildPythonPackage {
+  pname = "c4m-jtag";
+  inherit src version;
+
+  nativeBuildInputs = [ setuptools-scm ];
+  propagatedBuildInputs = [ nmigen-soc nmigen modgrammar ];
+
+  doCheck = false;
+
+  pythonImportsCheck = [ "c4m.nmigen.jtag.tap" ];
+
+  prePatch = ''
+    export SETUPTOOLS_SCM_PRETEND_VERSION=${version}
+  '';
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/libresoc-openpower-isa/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/ecp5-program.nix b/nix/ecp5-program.nix
new file mode 100644 (file)
index 0000000..4d696b2
--- /dev/null
@@ -0,0 +1,24 @@
+{ version }:
+
+{ writeShellScript, openocd, python3Packages, libresoc-ecp5, nextpnr, trellis }:
+
+let
+  pythonWithEnv = python3Packages.python.withPackages (ps: with ps; [
+    requests migen libresoc-soc litex-boards litex litedram liteeth liteiclink litescope litesdcard
+  ]);
+in
+writeShellScript "program-ecp5-libresoc" ''
+  export PATH="${openocd}/bin:${pythonWithEnv}/bin:${trellis}/bin:${nextpnr}/bin:$PATH"
+
+  dir="$(mktemp -d)"
+  pushd "$dir"
+  echo "$dir"
+
+  export PYTHONPATH="${../src/soc/litex/florent}:$PYTHONPATH"
+
+  python ${../src/soc/litex/florent/versa_ecp5.py} --sys-clk-freq=55e6 --load-from ${libresoc-ecp5}
+
+  popd
+  rm -rf "$dir"
+  exit 0
+''
diff --git a/nix/ecp5.nix b/nix/ecp5.nix
new file mode 100644 (file)
index 0000000..1c82ee4
--- /dev/null
@@ -0,0 +1,40 @@
+{ version }:
+
+{ stdenv, python3Packages, yosys, libresoc-verilog, libresoc-pinmux, pkgsCross
+, nextpnr, trellis }:
+
+stdenv.mkDerivation {
+  pname = "libresoc-versa-ecp5.v";
+  inherit version;
+
+  src = ../src/soc/litex/florent;
+
+  nativeBuildInputs =
+    (with python3Packages; [
+    python libresoc-soc litex-boards litex litedram liteeth liteiclink litescope litesdcard
+    ])
+    ++ [ trellis nextpnr pkgsCross.powernv.buildPackages.gcc ];
+
+  postPatch = ''
+    patchShebangs --build .
+  '';
+
+  configurePhase = "true";
+
+  buildPhase = ''
+    runHook preBuild
+    export PINMUX="$(mktemp -d)"
+    ln -s ${libresoc-pinmux} "$PINMUX/ls180"
+    cp ${libresoc-verilog} libresoc/libresoc.v
+    ./versa_ecp5.py --sys-clk-freq=55e6 --build
+    runHook postBuild
+  '';
+
+  installPhase = ''
+    runHook preInstall
+    mv /build/florent/build/versa_ecp5/gateware/versa_ecp5.svf $out
+    runHook postInstall
+  '';
+
+  fixupPhase = "true";
+}
diff --git a/nix/ieee754fpu.nix b/nix/ieee754fpu.nix
new file mode 100644 (file)
index 0000000..e520437
--- /dev/null
@@ -0,0 +1,27 @@
+{ lib, buildPythonPackage, libresoc-nmutil, bigfloat, fetchgit }:
+
+buildPythonPackage {
+  pname = "libresoc-ieee754fpu";
+  version = "unstable-2021-06-05";
+
+  src = fetchgit {
+    url = "https://git.libre-soc.org/git/ieee754fpu.git";
+    rev = "c62fa3a7ee95832587d7725729dcdb9a002ae015";
+    sha256 = "wbr1vGFzUlUtBT6IcRsykADYeksiVoq/LacU/dbRQ0o=";
+  };
+
+  propagatedBuildInputs = [ libresoc-nmutil bigfloat ];
+
+  doCheck = false;
+
+  prePatch = ''
+    touch ./src/ieee754/part/__init__.py
+  '';
+
+  pythonImportsCheck = [ "ieee754.part" ];
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/libresoc-ieee754fpu/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/litex.toml b/nix/litex.toml
new file mode 100644 (file)
index 0000000..89317f0
--- /dev/null
@@ -0,0 +1,89 @@
+[litex]
+github_user = "enjoy-digital"
+github_repo = "litex"
+git_revision = "42d8fc226a4f4e8dfef104257a95f98eb9b10da7"
+github_archive_nix_hash = "16zb7mci2a09jc5bbr4342pn95iyl84705n566alpx696xk2l0zr"
+
+[litex-boards]
+github_user = "litex-hub"
+github_repo = "litex-boards"
+git_revision = "1781be166aee867421e0d943f6a62c3397524563"
+github_archive_nix_hash = "0ar41ibs6si03iyhcjn3blw1rkdsazn5rsa95ph8v061kg2yjbjh"
+
+[liteeth]
+github_user = "enjoy-digital"
+github_repo = "liteeth"
+git_revision = "64b85e621e740b9b7a9bdb03749758c703fea6e1"
+github_archive_nix_hash = "1gbscl36n6mgaz1y1b27nzhykrhrccl6ls5vp7dd6divpqdf328i"
+
+[litedram]
+github_user = "enjoy-digital"
+github_repo = "litedram"
+git_revision = "ac825e51124e926c67455292cd2b949954fc6f65"
+github_archive_nix_hash = "1acs4kgbsv8pgml1q7709afh46f8mpy8b1nw0p9n8a1zih8ang1r"
+
+[litehyperbus]
+github_user = "litex-hub"
+github_repo = "litehyperbus"
+git_revision = "c4b64d2c992cedf3e03ffdf87f389feb5ddfff52"
+github_archive_nix_hash = "1iwjwzz4wa9zzm6yqa7rkag9igmsawp8wpmkj6fqia20b7xjglnb"
+
+[liteiclink]
+github_user = "enjoy-digital"
+github_repo = "liteiclink"
+git_revision = "efd200fa9e625144131a310fc09fd1fecf1682e6"
+github_archive_nix_hash = "0g643ryfzc6iq0p80rhq116n5w6mh4fv4yg4adyy5i1vy2grlg8s"
+
+[litepcie]
+github_user = "enjoy-digital"
+github_repo = "litepcie"
+git_revision = "0718fd135fc30e0a3598eaf66ce2fcb54b62193c"
+github_archive_nix_hash = "1m3i4hv49438ik4qhdp7rx9nan5rddrqp7nzvya9xfbh7lfc59hl"
+
+[litescope]
+github_user = "enjoy-digital"
+github_repo = "litescope"
+git_revision = "2739d5a069386c8e834c7f660dce9f93dc2b4598"
+github_archive_nix_hash = "08r7dzlmlfs9pmfz4xkf61sal5zy3caby88bcb4993c43nzpw8a3"
+
+[litesdcard]
+github_user = "enjoy-digital"
+github_repo = "litesdcard"
+git_revision = "edee2467fcabc62c4b34e3daa2271a71e52ba09f"
+github_archive_nix_hash = "0n5x9cx61xij0hc61slabxa05pzmw8i5fyg54ydmxi2fl2p5p0rs"
+
+[litespi]
+github_user = "litex-hub"
+github_repo = "litespi"
+git_revision = "c0730ebdb3c976618bf24e9ec04911e7c9934adf"
+github_archive_nix_hash = "015irjdpii514aj4av02pglvvq0wgxkplyy09435crzy9j5i5v04"
+
+[pythondata-misc-tapcfg]
+github_user = "litex-hub"
+github_repo = "pythondata-misc-tapcfg"
+git_revision = "25c97a4a9ff9af85248028fe01e2c65b2e3640ee"
+github_archive_nix_hash = "0zr6d5giqzsjmqpfyf1b25r0y70bj09xjbfinfxcdc6s8cwwwz71"
+
+[pythondata-software-compiler_rt]
+github_user = "litex-hub"
+github_repo = "pythondata-software-compiler_rt"
+git_revision = "7cfcaed2e726027fd622650b58dd77e47c495ee0"
+github_archive_nix_hash = "0b65dj95418j4pjqqkqjq5npnn1ih1789ba9575kxcljgj7r8xb7"
+
+[pythondata-cpu-serv]
+github_user = "litex-hub"
+github_repo = "pythondata-cpu-serv"
+git_revision = "915cdf793395ab48cc52c0225660eb6eeff41133"
+github_archive_nix_hash = "1ndkjhh7r521cc9g63pmjvgvv9sa3s8n2mkdli91nr7ns3q3lxmk"
+
+[litevideo]
+github_user = "enjoy-digital"
+github_repo = "litevideo"
+git_revision = "41f30143075ece3fff5c33a332ed067d1837cbb3"
+github_archive_nix_hash = "06vw4rn8xby8is13275bmkrxlwp3wlznbdqfay78a5m8bp73kypy"
+
+[valentyusb-hw_cdc_eptri]
+github_user = "litex-hub"
+github_repo = "valentyusb"
+git_revision = "a0526ad053c394306ad7a585a7ddd463831ad09d"
+github_archive_nix_hash = "0nad2x5j5rnjyciwm0abxhzng8nrv06ri8g9qdi39zk8n9zy7cmf"
diff --git a/nix/ls180.nix b/nix/ls180.nix
new file mode 100644 (file)
index 0000000..028fbcb
--- /dev/null
@@ -0,0 +1,44 @@
+{ version }:
+
+{ stdenv, python3Packages, yosys, libresoc-verilog, libresoc-pinmux, pkgsCross }:
+
+stdenv.mkDerivation {
+  pname = "libresoc-ls1804k";
+  inherit version;
+
+  src = ../src/soc/litex/florent;
+
+  nativeBuildInputs =
+    (with python3Packages; [
+    python libresoc-soc litex litedram liteeth liteiclink litescope litesdcard
+    ])
+    ++ [ pkgsCross.powernv.buildPackages.gcc ];
+
+  postPatch = ''
+    patchShebangs --build .
+  '';
+
+  configurePhase = "true";
+
+  buildPhase = ''
+    runHook preBuild
+    export PINMUX="$(mktemp -d)"
+    ln -s ${libresoc-pinmux} "$PINMUX/ls180"
+    cp ${libresoc-verilog} libresoc/libresoc.v
+    ./ls180soc.py --build --platform=ls180sram4k --num-srams=2 --srams4k
+    runHook postBuild
+  '';
+
+  installPhase = ''
+    runHook preInstall
+    mkdir $out
+    mv build/ls180sram4k/gateware/ls180sram4k.v $out/ls180.v
+    mv build/ls180sram4k/gateware/mem.init $out
+    mv build/ls180sram4k/gateware/mem_1.init $out
+    mv libresoc/libresoc.v $out
+    mv libresoc/SPBlock_512W64B8W.v $out
+    runHook postInstall
+  '';
+
+  fixupPhase = "true";
+}
diff --git a/nix/modgrammar.nix b/nix/modgrammar.nix
new file mode 100644 (file)
index 0000000..ce0f348
--- /dev/null
@@ -0,0 +1,20 @@
+{ lib, buildPythonPackage, fetchFromGitHub }:
+
+buildPythonPackage rec {
+  pname = "modgrammar";
+  version = "unstable-2020-09-20";
+
+  src = fetchFromGitHub {
+    owner = "bloerwald";
+    repo = "modgrammar";
+    rev = "d363ad5a86584e560a8b03cbe11c0168d7610691";
+    sha256 = "SO2qjfEVaJfgbA5HLJYwXlaeUzt5EFoljYQ2SsdDCbc=";
+  };
+
+  doCheck = false;
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/modgrammar/";
+    # license = licenses.bsd; # FIXME: Which BSD?
+  };
+}
diff --git a/nix/nmutil.nix b/nix/nmutil.nix
new file mode 100644 (file)
index 0000000..3489e77
--- /dev/null
@@ -0,0 +1,21 @@
+{ lib, buildPythonPackage, bigfloat, fetchgit, pyvcd }:
+
+buildPythonPackage {
+  pname = "libresoc-nmutil";
+  version = "unstable-2021-08-24";
+
+  propagatedBuildInputs = [ pyvcd ];
+
+  src = fetchgit {
+    url = "https://git.libre-soc.org/git/nmutil.git";
+    rev = "efda080db6978d249a23003bec734f1cc07de329";
+    sha256 = "nTgUiZc4CC0VoUND29kHSIyMlP9IB3oZfehutoNK07w=";
+  };
+
+  doCheck = false;
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/libresoc-ieee754fpu/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/openpower-isa.nix b/nix/openpower-isa.nix
new file mode 100644 (file)
index 0000000..5aee8b1
--- /dev/null
@@ -0,0 +1,31 @@
+{ lib, python, buildPythonPackage, fetchgit, libresoc-nmutil, astor, nmigen, ply, pygdbmi }:
+
+buildPythonPackage {
+  pname = "libresoc-openpower-isa";
+  version = "unstable-2021-09-04";
+
+  src = fetchgit {
+    url = "https://git.libre-soc.org/git/openpower-isa.git";
+    rev = "6e43a194f3d07ed5a8daa297187a32746c4c4d3c";
+    sha256 = "0EekUouTQruTXGO5jlPJtqh0DOudghILy0nca5eaZz8=";
+  };
+
+  propagatedBuildInputs = [ libresoc-nmutil astor nmigen ply pygdbmi ];
+
+  doCheck = false;
+
+  prePatch = ''
+    touch ./src/openpower/sv/__init__.py # TODO: fix upstream
+  '';
+
+  postInstall = ''
+    cp -rT ./openpower $out/${python.sitePackages}/../openpower/
+  '';
+
+  pythonImportsCheck = [ "openpower.decoder.power_decoder2" "openpower" ];
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/libresoc-openpower-isa/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/pinmux.nix b/nix/pinmux.nix
new file mode 100644 (file)
index 0000000..fc9ca7e
--- /dev/null
@@ -0,0 +1,28 @@
+{ version }:
+
+{ stdenv, python2 }:
+
+stdenv.mkDerivation {
+  pname = "libresoc-pinmux";
+  inherit version;
+
+  src = ../pinmux;
+
+  nativeBuildInputs = [ python2 ];
+
+  configurePhase = "true";
+
+  buildPhase = ''
+    runHook preBuild
+    python src/pinmux_generator.py -v -s ls180 -o ls180
+    runHook postBuild
+  '';
+
+  installPhase = ''
+    runHook preInstall
+    mv ls180 $out
+    runHook postInstall
+  '';
+
+  fixupPhase = "true";
+}
diff --git a/nix/soc.nix b/nix/soc.nix
new file mode 100644 (file)
index 0000000..a4ed136
--- /dev/null
@@ -0,0 +1,38 @@
+{ version }:
+
+{ lib, buildPythonPackage, yosys, runCommand, c4m-jtag, nmigen-soc
+, libresoc-ieee754fpu, libresoc-openpower-isa, python }:
+
+let
+  # If we use ../. as source, then any change to
+  # any unrelated Nix file would cause a rebuild,
+  # since the build would have access to it.
+  src = runCommand "libresoc-soc-source" {} ''
+    mkdir $out
+    cp -r ${../src} -T $out/src
+    cp -r ${../setup.py} -T $out/setup.py
+    cp -r ${../README.md} -T $out/README.md
+    cp -r ${../NEWS.txt} -T $out/NEWS.txt
+  '';
+in
+buildPythonPackage {
+  pname = "libresoc-soc";
+  inherit version src;
+
+  propagatedBuildInputs = [
+    c4m-jtag nmigen-soc python libresoc-ieee754fpu libresoc-openpower-isa yosys
+  ];
+
+  doCheck = false;
+
+  prePatch = ''
+    rm -r src/soc/litex
+  '';
+
+  pythonImportsCheck = [ "soc" ];
+
+  meta = with lib; {
+    homepage = "https://libre-soc.org/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/verilog.nix b/nix/verilog.nix
new file mode 100644 (file)
index 0000000..600b693
--- /dev/null
@@ -0,0 +1,20 @@
+{ version }:
+
+{ runCommand, python3Packages, libresoc-pinmux }:
+
+let script = ''
+  mkdir pinmux
+  ln -s ${libresoc-pinmux} pinmux/ls180
+  export PINMUX="$(realpath ./pinmux)"
+  python3 -m soc.simple.issuer_verilog \
+    --debug=jtag --enable-core --enable-pll \
+    --enable-xics --enable-sram4x4kblock --disable-svp64 \
+    $out
+''; in
+runCommand "libresoc.v" {
+  inherit version;
+
+  nativeBuildInputs = (with python3Packages; [
+    libresoc-soc
+  ]) ++ [ libresoc-pinmux ];
+} script
diff --git a/pinmux b/pinmux
index 096caad8418250693c93ccf90047750704adcaa7..7cbf0e2a54448f549243cd602ebafd10de8d32f0 160000 (submodule)
--- a/pinmux
+++ b/pinmux
@@ -1 +1 @@
-Subproject commit 096caad8418250693c93ccf90047750704adcaa7
+Subproject commit 7cbf0e2a54448f549243cd602ebafd10de8d32f0
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644 (file)
index 0000000..e21c7c4
--- /dev/null
@@ -0,0 +1,11 @@
+[tool.pytest.ini_options]
+minversion = "6.0"
+python_classes = ""
+python_functions = ""
+testpaths = ["src/soc"]
+required_plugins = ["pytest-xdist>=1.0.0", "pytest-output-to-files>=0.1.0"]
+addopts = [
+    "-n",
+    "auto",
+    "--shorten-output-dir=test-out",
+]
index 14cd4c6e9508c1a6b18bd5ace84897b9c9db6bb1..ddbdf8b4402fc7316616bf30bdad22b4386d4a88 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -8,16 +8,45 @@ NEWS = open(os.path.join(here, 'NEWS.txt')).read()
 
 version = '0.0.1'
 
+# the only reason this is added is because it's become a part of python 3.9.
+# the project standard is python 3.7 however in future that will be updated.
+# for now, cached_property is RELUCTANTLY added but a *copy* is added so
+# that the generation of HDL is not critically dependent on random crap
+# off the internet. you're spending USD 16 *MILLION* on masks, you better
+# be absolutely paranoid-level certain you know where every piece of the
+# chain creating the HDL comes from.
+cprop = "git+https://git.libre-soc.org/git/cached-property.git@1.5.2" \
+        "#egg=cached-property-1.5.2"
+
 # using pip3 for ongoing development is a royal pain.  seriously not
 # recommended.  therefore a number of these dependencies have been
 # commented out.  *they are still required* - they will need installing
 # manually.
 
+# XXX UNDER NO CIRCUMSTANCES ADD ARBITRARY DEPENDENCIES HERE. XXX
+# as this is HDL, not software, every dependency added is
+# a serious maintenance and reproducible-build problem.
+# dropping USD 16 million on 7nm Mask Charges when the
+# HDL can be compromised - accidentally or deliberately -
+# by pip3 going out and randomly downloading complete
+# shite is not going to do anyone any favours.
+
+# TODO: make *all* of these be from libre-soc git repo only
+# (which means updating the nmigen-soc one to mirror gitlab)
+
 install_requires = [
     #    'sfpy',    # needs manual patching
     'libresoc-ieee754fpu',   # uploaded (successfully, whew) to pip
     'libresoc-openpower-isa',  # uploaded (successfully, whew) to pip
     # 'nmigen-soc', # install manually from git.libre-soc.org
+
+    # git url needed for having `pip3 install -e .` install from libre-soc git
+    "cached-property@"+cprop,
+]
+
+# git url needed for having `setup.py develop` install from libre-soc git
+dependency_links = [
+    cprop,
 ]
 
 test_requires = [
@@ -34,7 +63,8 @@ setup(
     long_description_content_type='text/markdown',
     classifiers=[
         "Topic :: Software Development",
-        "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)",
+        "License :: OSI Approved :: " \
+            "GNU Lesser General Public License v3 or later (LGPLv3+)",
         "Programming Language :: Python :: 3",
         "Operating System :: OS Independent",
     ],
@@ -48,6 +78,7 @@ setup(
     include_package_data=True,
     zip_safe=False,
     install_requires=install_requires,
+    dependency_links=dependency_links,
     tests_require=test_requires,
     test_suite='nose.collector',
 )
diff --git a/src/soc/bus/external_core.py b/src/soc/bus/external_core.py
new file mode 100644 (file)
index 0000000..102e66c
--- /dev/null
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the external_core_top.v verilog module
+# which allows for faster development iteration (oh and microwatt or
+# other core to be dropped into a peripheral fabric)
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal, Const)
+from nmigen.cli import rtlil, verilog
+
+from soc.debug.dmi import DMIInterface
+from nmigen_soc.wishbone.bus import Interface
+import os
+
+__all__ = ["ExternalCore"]
+
+
+class ExternalCore(Elaboratable):
+    """External Core verilog wrapper for microwatt and libre-soc
+   (actually, anything prepared to map to the Signals defined below)
+   remember to call ExternalCore.add_verilog_source
+    """
+
+    def __init__(self, ibus=None, dbus=None, features=None, name=None):
+
+        # set up the icache wishbone bus
+        if features is None:
+            features = frozenset(("stall",))
+        if ibus is None:
+            ibus = Interface(addr_width=32,
+                            data_width=64,
+                            features=features,
+                            granularity=8,
+                            name="core_ibus")
+        if dbus is None:
+            dbus = Interface(addr_width=32,
+                            data_width=64,
+                            features=features,
+                            granularity=8,
+                            name="core_dbus")
+        self.dmi = DMIInterface(name="dmi")
+        self.ibus = ibus
+        self.dbus = dbus
+
+        assert len(self.ibus.dat_r) == 64, "bus width must be 64"
+        assert len(self.dbus.dat_r) == 64, "bus width must be 64"
+
+        # IRQ for data buffer receive/xmit
+        self.irq = Signal() 
+
+        # debug monitoring signals
+        self.nia = Signal(64)
+        self.nia_req = Signal()
+        self.msr = Signal(64)
+        self.ldst_addr = Signal(64)
+        self.ldst_req = Signal()
+
+        # alternative reset and termination indicator
+        self.alt_reset = Signal()
+        self.terminated_o = Signal()
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in ['external_core_top.v',
+                     ]:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # create definition of external core here, so that
+        # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        ibus, dbus, dmi = self.ibus, self.dbus, self.dmi
+
+        # sigh, microwatt wishbone address is borked, it contains the 3 LSBs
+        ibus_adr = Signal(32)
+        dbus_adr = Signal(32)
+        m.d.comb += ibus.adr.eq(ibus_adr[3:])
+        m.d.comb += dbus.adr.eq(dbus_adr[3:])
+
+        kwargs = {
+            # clock/reset signals
+            'i_clk': ClockSignal(),
+            'i_rst': ResetSignal(),
+            # DMI interface
+            'i_dmi_addr': dmi.addr_i,
+            'i_dmi_req': dmi.req_i,
+            'i_dmi_wr': dmi.we_i,
+            'i_dmi_din': dmi.din,
+            'o_dmi_dout': dmi.dout,
+            'o_dmi_ack': dmi.ack_o,
+            # debug/monitor signals
+            'o_nia': self.nia,
+            'o_nia_req': self.nia_req,
+            'o_msr_o': self.msr,
+            'o_ldst_addr': self.ldst_addr,
+            'o_ldst_req': self.ldst_req,
+            'i_alt_reset': self.alt_reset,
+            'o_terminated_out': self.terminated_o,
+            # wishbone instruction bus
+            'o_wishbone_insn_out.adr': ibus_adr,
+            'o_wishbone_insn_out.dat': ibus.dat_w,
+            'o_wishbone_insn_out.sel': ibus.sel,
+            'o_wishbone_insn_out.cyc': ibus.cyc,
+            'o_wishbone_insn_out.stb': ibus.stb,
+            'o_wishbone_insn_out.we': ibus.we,
+            'i_wishbone_insn_in.dat': ibus.dat_r,
+            'i_wishbone_insn_in.ack': ibus.ack,
+            'i_wishbone_insn_in.stall': ibus.stall,
+            # wishbone data bus
+            'o_wishbone_data_out.adr': dbus_adr,
+            'o_wishbone_data_out.dat': dbus.dat_w,
+            'o_wishbone_data_out.sel': dbus.sel,
+            'o_wishbone_data_out.cyc': dbus.cyc,
+            'o_wishbone_data_out.stb': dbus.stb,
+            'o_wishbone_data_out.we': dbus.we,
+            'i_wishbone_data_in.dat': dbus.dat_r,
+            'i_wishbone_data_in.ack': dbus.ack,
+            'i_wishbone_data_in.stall': dbus.stall,
+            # external interrupt request
+            'i_ext_irq': self.irq,
+        }
+        core = Instance("external_core_top", **kwargs)
+        m.submodules['core_top'] = core
+
+        return m
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    core = ExternalCore(name="core")
+    create_ilang(core, [
+                        core.ibus.cyc, core.ibus.stb, core.ibus.ack,
+                        core.ibus.dat_r, core.ibus.dat_w, core.ibus.adr,
+                        core.ibus.we, core.ibus.sel, core.ibus.stall,
+                        core.dbus.cyc, core.dbus.stb, core.dbus.ack,
+                        core.dbus.dat_r, core.dbus.dat_w, core.dbus.adr,
+                        core.dbus.we, core.dbus.sel,
+                        core.irq, core.alt_reset, core.terminated_o,
+                        core.msr, core.nia, core.nia_req,
+                        core.ldst_addr, core.ldst_req,
+                        core.dmi.addr_i, core.dmi.req_i, core.dmi.we_i,
+                        core.dmi.din, core.dmi.dout, core.dmi.ack_o,
+                       ], "core_0")
+
diff --git a/src/soc/bus/opencores_ethmac.py b/src/soc/bus/opencores_ethmac.py
new file mode 100644 (file)
index 0000000..cad4919
--- /dev/null
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2020-2022 Raptor Engineering LLC <support@raptorengineering.com>
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog 10/100 MAC
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+import os
+
+__all__ = ["EthMAC"]
+
+
+class EthMAC(Elaboratable):
+    """Ethernet MAC from opencores, nmigen wrapper.
+    remember to call EthMAC.add_verilog_source
+    """
+
+    def __init__(self, master_bus=None, slave_bus=None, name=None,
+                       irq=None, pins=None):
+        if name is not None:
+            # convention: give the name in the format "name_number"
+            self.idx = int(name.split("_")[-1])
+        else:
+            self.idx = 0
+            name = "eth_0"
+        self.granularity = 8
+        self.data_width = 32
+        self.dsize = log2_int(self.data_width//self.granularity)
+
+        # set up the wishbone busses
+        features = frozenset()
+        if master_bus is None:
+            master_bus = Interface(addr_width=30,
+                            data_width=32,
+                            features=features,
+                            granularity=8,
+                            name=name+"_wb_%d_0" % self.idx)
+        if slave_bus is None:
+            slave_bus = Interface(addr_width=12,
+                            data_width=32,
+                            features=features,
+                            granularity=8,
+                            name=name+"_wb_%d_1" % self.idx)
+        self.master_bus = master_bus
+        self.slave_bus = slave_bus
+        if irq is None:
+            irq = Signal()
+        self.irq = irq
+
+        slave_mmap = MemoryMap(addr_width=12+self.dsize,
+                        data_width=self.granularity)
+
+        self.slave_bus.memory_map = slave_mmap
+
+        # RMII TX signals
+        self.mtx_clk = Signal()
+        self.mtxd = Signal(4)
+        self.mtxen = Signal()
+        self.mtxerr = Signal()
+
+        # RMII RX signals
+        self.mrx_clk = Signal()
+        self.mrxd = Signal(4)
+        self.mrxdv = Signal()
+        self.mrxerr = Signal()
+
+        # RMII common signals
+        self.mcoll = Signal()
+        self.mcrs = Signal()
+
+        # RMII management interface signals
+        self.mdc = Signal()
+        self.md_in = Signal()
+        self.md_out = Signal()
+        self.md_direction = Signal()
+
+        # pins resource
+        self.pins = pins
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in ['eth_clockgen.v', 'eth_cop.v', 'eth_crc.v',
+                    'eth_fifo.v', 'eth_maccontrol.v', 'ethmac_defines.v',
+                    'eth_macstatus.v', 'ethmac.v', 'eth_miim.v',
+                    'eth_outputcontrol.v', 'eth_random.v',
+                    'eth_receivecontrol.v', 'eth_registers.v',
+                    'eth_register.v', 'eth_rxaddrcheck.v',
+                    'eth_rxcounters.v', 'eth_rxethmac.v',
+                    'eth_rxstatem.v', 'eth_shiftreg.v',
+                    'eth_spram_256x32.v', 'eth_top.v',
+                    'eth_transmitcontrol.v', 'eth_txcounters.v',
+                    'eth_txethmac.v', 'eth_txstatem.v', 'eth_wishbone.v',
+                    'timescale.v']:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        idx = self.idx
+
+        # Calculate arbiter bus address
+        wb_master_bus_adr = Signal(32)
+        # arbiter address is in words, ethernet master address is in bytes
+        comb += self.master_bus.adr.eq(wb_master_bus_adr >> 2)
+
+        # create definition of external verilog EthMAC code here, so that
+        # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        ethmac = Instance("eth_top",
+                            # Clock/reset (use DomainRenamer if needed)
+                            i_wb_clk_i=ClockSignal(),
+                            i_wb_rst_i=ResetSignal(),
+
+                            # Master Wishbone bus signals
+                            o_m_wb_adr_o=wb_master_bus_adr,
+                            i_m_wb_dat_i=self.master_bus.dat_r,
+                            o_m_wb_sel_o=self.master_bus.sel,
+                            o_m_wb_dat_o=self.master_bus.dat_w,
+                            o_m_wb_we_o=self.master_bus.we,
+                            o_m_wb_stb_o=self.master_bus.stb,
+                            o_m_wb_cyc_o=self.master_bus.cyc,
+                            i_m_wb_ack_i=self.master_bus.ack,
+
+                            # Slave Wishbone bus signals
+                            i_wb_adr_i=self.slave_bus.adr,
+                            i_wb_dat_i=self.slave_bus.dat_w,
+                            i_wb_sel_i=self.slave_bus.sel,
+                            o_wb_dat_o=self.slave_bus.dat_r,
+                            i_wb_we_i=self.slave_bus.we,
+                            i_wb_stb_i=self.slave_bus.stb,
+                            i_wb_cyc_i=self.slave_bus.cyc,
+                            o_wb_ack_o=self.slave_bus.ack,
+
+                            o_int_o=self.irq,
+
+                            # RMII TX
+                            i_mtx_clk_pad_i=self.mtx_clk,
+                            o_mtxd_pad_o=self.mtxd,
+                            o_mtxen_pad_o=self.mtxen,
+                            o_mtxerr_pad_o=self.mtxerr,
+
+                            # RMII RX
+                            i_mrx_clk_pad_i=self.mrx_clk,
+                            i_mrxd_pad_i=self.mrxd,
+                            i_mrxdv_pad_i=self.mrxdv,
+                            i_mrxerr_pad_i=self.mrxerr,
+
+                            # RMII common
+                            i_mcoll_pad_i=self.mcoll,
+                            i_mcrs_pad_i=self.mcrs,
+
+                            # Management Interface
+                            o_mdc_pad_o=self.mdc,
+                            i_md_pad_i=self.md_in,
+                            o_md_pad_o=self.md_out,
+                            o_md_padoe_o=self.md_direction
+                            );
+
+        m.submodules['ethmac_%d' % self.idx] = ethmac
+
+        if self.pins is not None:
+            comb += self.mtx_clk.eq(self.pins.mtx_clk.i)
+            comb += self.pins.mtxd.o.eq(self.mtxd)
+            comb += self.pins.mtxen.o.eq(self.mtxen)
+            comb += self.pins.mtxerr.o.eq(self.mtxerr)
+
+            comb += self.mrx_clk.eq(self.pins.mrx_clk.i)
+            comb += self.mrxd.eq(self.pins.mrxd.i)
+            comb += self.mrxdv.eq(self.pins.mrxdv.i)
+            comb += self.mrxerr.eq(self.pins.mrxerr.i)
+            comb += self.mcoll.eq(self.pins.mcoll.i)
+            comb += self.mcrs.eq(self.pins.mcrs.i)
+
+            comb += self.pins.mdc.o.eq(self.mdc)
+
+            comb += self.pins.md.o.eq(self.md_out)
+            comb += self.pins.md.oe.eq(self.md_direction)
+            comb += self.md_in.eq(self.pins.md.i)
+        return m
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+if __name__ == "__main__":
+    ethmac = EthMAC(name="eth_0")
+    create_ilang(ethmac, [ethmac.master_bus.cyc, ethmac.master_bus.stb,
+                        ethmac.master_bus.ack, ethmac.master_bus.dat_r,
+                        ethmac.master_bus.dat_w, ethmac.master_bus.adr,
+                        ethmac.master_bus.we, ethmac.master_bus.sel,
+                        ethmac.slave_bus.cyc, ethmac.slave_bus.stb,
+                        ethmac.slave_bus.ack,
+                        ethmac.slave_bus.dat_r, ethmac.slave_bus.dat_w,
+                        ethmac.slave_bus.adr,
+                        ethmac.slave_bus.we, ethmac.slave_bus.sel,
+                        ethmac.mtx_clk, ethmac.mtxd, ethmac.mtxen,
+                        ethmac.mtxerr, ethmac.mrx_clk, ethmac.mrxd,
+                        ethmac.mrxdv, ethmac.mrxerr, ethmac.mcoll,
+                        ethmac.mcrs, ethmac.mdc, ethmac.md_in,
+                        ethmac.md_out, ethmac.md_direction
+                       ], "eth_0")
+
diff --git a/src/soc/bus/sdr_ctrl.py b/src/soc/bus/sdr_ctrl.py
new file mode 100644 (file)
index 0000000..4b6799f
--- /dev/null
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog uart16550 module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal, Record)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen.cli import rtlil, verilog
+import os
+
+__all__ = ["SDRAM", "SDRAMConfig"]
+
+        """
+        class MT48LC16M16(SDRModule):
+            # geometry
+            nbanks = 4
+            nrows  = 8192
+            ncols  = 512
+            # timings
+            technology_timings = _TechnologyTimings(tREFI=64e6/8192,
+                                                    tWTR=(2, None),
+                                                    tCCD=(1, None),
+                                                    tRRD=(None, 15))
+            speedgrade_timings = {"default": _SpeedgradeTimings(tRP=20,
+                                                    tRCD=20,
+                                                    tWR=15,
+                                                    tRFC=(None, 66),
+                                                    tFAW=None,
+                                                    tRAS=44)}
+            # for MT48LC16M16-75 part
+            comb += self.cfg.sdr_en.eq(1)
+            comb += self.cfg.sdr_mode_reg.eq(0x033)
+            comb += self.cfg.req_depth.eq(3)    # max 
+            comb += self.cfg.sdr_tras_d.eq(44)  # Active to precharge delay
+            comb += self.cfg.sdr_trp_d.eq(20)   # Precharge to active delay
+            comb += self.cfg.sdr_trcd_d.eq(20)  # Active to R/W delay
+            comb += self.cfg.sdr_cas.eq(3)      # CAS latency
+            comb += self.cfg.sdr_trcar_d.eq(66) # tRFC auto-refresh period
+            comb += self.cfg.sdr_twr_d.eq(15) # clock + 7.5ns
+            comb += self.cfg.sdr_rfsh.eq(0x100)
+            comb += self.cfg.sdr_rfmax.eq(6)
+        """
+
+
+class SDRAMConfig(Record):
+    def __init__(self, refresh_timer_sz, refresh_row_count, name=None):
+        super().__init__(name=name, layout=[
+        # configuration parameters, these need to match the SDRAM IC datasheet
+                        ('req_depth', 2),       # max request accepted
+                        ('sdr_en', 1),          # Enable SDRAM controller
+                        ('sdr_mode_reg', 13),
+                        ('sdr_tras_d', 4),      # Active to precharge delay
+                        ('sdr_trp_d', 4),       # Precharge to active delay
+                        ('sdr_trcd_d', 4),      # Active to R/W delay
+                        ('sdr_cas', 3),         # SDRAM CAS Latency
+                        ('sdr_trcar_d', 4),     # Auto-refresh period
+                        ('sdr_twr_d', 4),       # Write recovery delay
+                        ('sdr_rfsh', refresh_timer_sz),
+                        ('sdr_rfmax', refresh_row_count)
+                    ])
+
+
+class SDRAM(Elaboratable):
+    """SDRAM controller from opencores, nmigen wrapper.  remember to call
+       SDRAM.add_verilog_source.
+
+    * the SDRAM IC will be accessible over the Wishbone Bus
+    * sdr_* signals must be wired to the IC
+    * cfg parameters must match those listed in the SDRAM IC's datasheet
+    """
+
+    def __init__(self, bus=None, features=None, name=None,
+                       data_width=32, addr_width=26,
+                       sdr_data_width=16,
+                       cfg=None,
+                       pins=None):
+        if name is not None:
+            name = "sdram"
+        self.data_width = data_width
+        self.sdr_data_width = sdr_data_width
+        self.addr_width = addr_width
+        self.refresh_timer_sz = 12
+        self.refresh_row_count = 3
+
+        # set up the wishbone bus
+        if features is None:
+            features = frozenset({'cti'})
+        if bus is None:
+            bus = Interface(addr_width=addr_width,
+                            data_width=data_width,
+                            features=features,
+                            granularity=8,
+                            name=name)
+        self.bus = bus
+        assert len(self.bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+
+        byte_width = sdr_data_width // 8 # for individual byte masks/enables
+
+        # SDRAM signals
+        self.sdram_clk     = Signal()           # sdram phy clock
+        self.sdram_resetn  = Signal(reset_less=True) # sdram reset (low)
+        self.sdr_cs_n      = Signal()           # chip select
+        self.sdr_cke       = Signal()           # clock-enable
+        self.sdr_ras_n     = Signal()           # read-address strobe
+        self.sdr_cas_n     = Signal()           # cas
+        self.sdr_we_n      = Signal()           # write-enable
+        self.sdr_dqm       = Signal(byte_width) # data mask
+        self.sdr_ba        = Signal(2)          # bank enable
+        self.sdr_addr      = Signal(13)         # sdram address, 13 bits
+        # these combine to create a bi-direction inout, sdr_dq
+        # note, each bit of sdr_den_n covers a *byte* of sdr_din/sdr_dout
+        self.sdr_den_n     = Signal(byte_width)
+        self.sdr_din       = Signal(data_width)
+        self.sdr_dout      = Signal(data_width)
+
+        # configuration parameters, these need to match the SDRAM IC datasheet
+        self.sdr_init_done       = Signal()  # Indicate SDRAM init Done
+        if cfg is None:
+            cfg = SDRAMConfig(self.refresh_timer_sz,
+                                   self.refresh_row_count, name="sdr_cfg")
+
+        # config and pins resource
+        self.pins = pins
+        self.cfg = cfg
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in [ './core/sdrc_bank_ctl.v', './core/sdrc_bank_fsm.v',
+                        './core/sdrc_bs_convert.v', './core/sdrc_core.v',
+                        './core/sdrc_req_gen.v', './core/sdrc_xfr_ctl.v',
+                        './core/sdrc_define.v',
+                        './lib/async_fifo.v', './lib/sync_fifo.v',
+                        './top/sdrc_top.v', './wb2sdrc/wb2sdrc.v',
+                     ]:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # create definition of external verilog 16550 uart here, so that                # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        bus = self.bus
+
+        params = {
+            # clock/reset (use DomainRenamer if needed)
+            'i_wb_clk_i' : ClockSignal(),
+            'i_wb_rst_i' : ResetSignal(),
+
+            # wishbone bus signals
+            'i_wb_adr_i' : bus.adr,
+            'i_wb_dat_i' : bus.dat_w,
+            'i_wb_sel_i' : bus.sel,
+            'o_wb_dat_o' : bus.dat_r,
+            'i_wb_we_i' : bus.we,
+            'i_wb_stb_i' : bus.stb,
+            'i_wb_cyc_i' : bus.cyc,
+            'o_wb_ack_o' : bus.ack,
+
+            # SDRAM signals
+            'i_sdram_clk'      :  self.sdram_clk,
+            'i_sdram_resetn'   :  self.sdram_resetn,
+            'o_sdr_cs_n'       :  self.sdr_cs_n,
+            'o_sdr_cke'        :  self.sdr_cke,
+            'o_sdr_ras_n'      :  self.sdr_ras_n,
+            'o_sdr_cas_n'      :  self.sdr_cas_n,
+            'o_sdr_we_n'       :  self.sdr_we_n,
+            'o_sdr_dqm'        :  self.sdr_dqm,
+            'o_sdr_ba'         :  self.sdr_ba,
+            'o_sdr_addr'       :  self.sdr_addr,
+            'o_sdr_den_n'      : self.sdr_den_n,
+            'i_sdr_din'        : self.sdr_din,
+            'o_sdr_dout'       : self.sdr_dout,
+
+            # configuration parameters (from the SDRAM IC datasheet)
+            'o_sdr_init_done'      : self.sdr_init_done       ,
+            'i_cfg_req_depth'      : self.cfg.req_depth       ,
+            'i_cfg_sdr_en'         : self.cfg.sdr_en          ,
+            'i_cfg_sdr_mode_reg'   : self.cfg.sdr_mode_reg    ,
+            'i_cfg_sdr_tras_d'     : self.cfg.sdr_tras_d      ,
+            'i_cfg_sdr_trp_d'      : self.cfg.sdr_trp_d       ,
+            'i_cfg_sdr_trcd_d'     : self.cfg.sdr_trcd_d      ,
+            'i_cfg_sdr_cas'        : self.cfg.sdr_cas         ,
+            'i_cfg_sdr_trcar_d'    : self.cfg.sdr_trcar_d     ,
+            'i_cfg_sdr_twr_d'      : self.cfg.sdr_twr_d       ,
+            'i_cfg_sdr_rfsh'       : self.cfg.sdr_rfsh        ,
+            'i_cfg_sdr_rfmax'      : self.cfg.sdr_rfmax,
+
+            # verilog parameters
+            'p_APP_AW'   : self.addr_width,    # Application Address Width
+            'p_APP_DW'   : self.data_width,    # Application Data Width
+            'p_APP_BW'   : self.addr_width//8, # Application Byte Width
+            'p_APP_RW'   : 9,                  # Application Request Width
+            'p_SDR_DW'   : self.sdr_data_width,    # SDR Data Width
+            'p_SDR_BW'   : self.sdr_data_width//8, # SDR Byte Width
+            'p_dw'       : self.data_width,    # data width
+            'p_tw'       : 8,   # tag id width
+            'p_bl'       : 9,   # burst_length_width
+        }
+        m.submodules['sdrc_top'] = Instance("sdrc_top", **params)
+
+        return m
+
+        if self.pins is not None:
+            comb += self.pins.tx.eq(self.tx_o)
+            comb += self.rx_i.eq(self.pins.rx)
+
+        return m
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    sdram = SDRAM(name="sdram", data_width=8)
+    create_ilang(sdram, [sdram.bus.cyc, sdram.bus.stb, sdram.bus.ack,
+                         sdram.bus.dat_r, sdram.bus.dat_w, sdram.bus.adr,
+                         sdram.bus.we, sdram.bus.sel,
+                         sdram.sdram_clk, sdram.sdram_resetn,
+                         sdram.sdr_cs_n, sdram.sdr_cke,
+                         sdram.sdr_ras_n, sdram.sdr_cas_n, sdram.sdr_we_n,
+                         sdram.sdr_dqm, sdram.sdr_ba, sdram.sdr_addr,
+                         sdram.sdr_den_n, sdram.sdr_din, sdram.sdr_dout,
+                         sdram.sdr_init_done, sdram.cfg.req_depth,
+                         sdram.cfg.sdr_en, sdram.cfg.sdr_mode_reg,
+                         sdram.cfg.sdr_tras_d, sdram.cfg.sdr_trp_d,
+                         sdram.cfg.sdr_trcd_d, sdram.cfg.sdr_cas,
+                         sdram.cfg.sdr_trcar_d, sdram.cfg.sdr_twr_d,
+                         sdram.cfg.sdr_rfsh, sdram.cfg.sdr_rfmax,
+                       ], "sdram")
+
index 9819302ff80e2ed2492efc6406bbe055dcecd901..f025211417ff28d42456b7b4f75f0b236cd6ba3b 100644 (file)
@@ -60,7 +60,7 @@ class SRAM(Elaboratable):
                             data_width=self.memory.width,
                             granularity=granularity,
                             features=features,
-                            alignment=0,
+                            #alignment=0,
                             name=None)
         self.bus = bus
         self.granularity = bus.granularity
diff --git a/src/soc/bus/syscon.py b/src/soc/bus/syscon.py
new file mode 100644 (file)
index 0000000..f3dcfc0
--- /dev/null
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Copyright (C) 2022 Raptor Engineering, LLC <support@raptorengineering.com>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a System Console peripheral compatible with microwatt
+# https://github.com/antonblanchard/microwatt/blob/master/syscon.vhdl
+
+from nmigen import (Elaboratable, Cat, Module, Signal)
+from nmigen.cli import rtlil, verilog
+
+from lambdasoc.periph import Peripheral
+
+__all__ = ["MicrowattSYSCON"]
+
+
+class MicrowattSYSCON(Peripheral, Elaboratable):
+    """Microwatt-compatible (Sys)tem (Con)figuration module
+    """
+
+    def __init__(self, *, sys_clk_freq=100e6,
+                          core_clk_freq=100e6,
+                          mem_clk_freq=100e6,
+                          spi_offset=None,
+                          dram_addr=None,
+                          has_uart=True,
+                          uart_is_16550=True
+                          ):
+        super().__init__(name="syscon")
+        self.sys_clk_freq = sys_clk_freq
+        self.core_clk_freq = core_clk_freq
+        self.mem_clk_freq = mem_clk_freq
+        self.has_uart = has_uart
+        self.spi_offset = spi_offset
+        self.dram_addr = dram_addr
+        self.uart_is_16550 = uart_is_16550
+
+        # System control ports
+        self.dram_at_0 = Signal()
+        self.core_reset = Signal()
+        self.soc_reset = Signal()
+
+        # set up a CSR Bank and associated bridge. has to be in this order
+        # (declare bank, declare bridge) for some unknown reason.
+        # (r)ead regs will have a r_stb and r_data Record entry
+        # (w)rite regs will have a w_stb and w_data Record entry
+        bank = self.csr_bank()
+        self._reg_sig_r       = bank.csr(64, "r") # signature
+        self._reg_info_r      = bank.csr(64, "r") # info
+        self._bram_info_r     = bank.csr(64, "r") # bram info
+        self._dram_info_r     = bank.csr(64, "r") # dram info
+        self._clk_info_r      = bank.csr(64, "r") # nest clock frequency
+        self._ctrl_info_r     = bank.csr(64, "rw") # control info
+        self._dram_init_r     = bank.csr(64, "r") # dram initialisation info
+        self._spiflash_info_r = bank.csr(64, "r") # spi flash info
+        self._uart0_info_r    = bank.csr(64, "r") # UART0 info (baud etc.)
+        self._uart1_info_r    = bank.csr(64, "r") # UART1 info (baud etc.)
+        self._bram_bootaddr_r = bank.csr(64, "r") # BRAM boot address
+        self._core_clk_info_r = bank.csr(64, "r") # core clock frequency
+        self._mem_clk_info_r  = bank.csr(64, "r") # memory clock frequency
+
+        # bridge the above-created CSRs over wishbone.  ordering and size
+        # above mattered, the bridge automatically packs them together
+        # as memory-addressable "things" for us
+        self._bridge = self.bridge(data_width=32, granularity=8, alignment=3)
+        self.bus = self._bridge.bus
+
+    def elaborate(self, platform):
+        m = Module()
+        comb, sync = m.d.comb, m.d.comb
+        m.submodules.bridge = self._bridge
+
+        # enter data into the CSRs. r_data can be left live all the time,
+        # w_data obviously has to be set only when w_stb triggers.
+
+        # identifying signature
+        comb += self._reg_sig_r.r_data.eq(0xf00daa5500010001)
+
+        # nest clock rate (hz)
+        comb += self._clk_info_r.r_data.eq(int(self.sys_clk_freq)) # in hz
+
+        # core clock rate (hz)
+        comb += self._core_clk_info_r.r_data.eq(int(self.core_clk_freq)) # in hz
+
+        # memory clock rate (hz)
+        comb += self._mem_clk_info_r.r_data.eq(int(self.mem_clk_freq)) # in hz
+
+        # detect peripherals
+        has_spi = self.spi_offset is not None
+        has_dram = self.dram_addr is not None
+
+        # uart peripheral clock rate, currently assumed to be system clock
+        # 0 ..31  : UART clock freq (in HZ)
+        #     32  : UART is 16550 (otherwise pp)
+        comb += self._uart0_info_r.r_data[0:32].eq(int(self.sys_clk_freq))
+        comb += self._uart0_info_r.r_data[32].eq(1)
+
+        # Reg Info, defines what peripherals and characteristics are present
+        comb += self._reg_info_r.r_data[0].eq(self.has_uart) # has UART0
+        comb += self._reg_info_r.r_data[1].eq(has_dram)      # has DDR DRAM
+        comb += self._reg_info_r.r_data[3].eq(has_spi)       # has SPI Flash
+        comb += self._reg_info_r.r_data[5].eq(1)             # Large SYSCON
+
+        # system control
+        sysctrl = Cat(self.dram_at_0, self.core_reset, self.soc_reset)
+        with m.If(self._ctrl_info_r.w_stb):
+            sync += sysctrl.eq(self._ctrl_info_r.w_data)
+        comb += self._ctrl_info_r.r_data.eq(sysctrl)
+
+        # SPI Flash Address
+        comb += self._spiflash_info_r.r_data.eq(self.spi_offset or 0)
+
+        return m
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    from nmigen_soc import wishbone
+    class QuickDemo(Elaboratable):
+        def elaborate(self, platform):
+            m = Module()
+            arbiter = wishbone.Arbiter(addr_width=30, data_width=32,
+                                       granularity=8)
+            decoder = wishbone.Decoder(addr_width=30, data_width=32,
+                                       granularity=8)
+            m.submodules.syscon = syscon = MicrowattSYSCON()
+            m.submodules.decoder = decoder
+            m.submodules.arbiter = arbiter
+            decoder.add(syscon.bus, addr=0xc0000000)
+            m.d.comb += arbiter.bus.connect(decoder.bus)
+            return m
+    m = QuickDemo()
+    create_ilang(m, None, "syscondemo")
+
diff --git a/src/soc/bus/tercel.py b/src/soc/bus/tercel.py
new file mode 100644 (file)
index 0000000..54ba925
--- /dev/null
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2020-2022 Raptor Engineering LLC <support@raptorengineering.com>
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog tercel module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal, Const)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+from nmutil.byterev import byte_reverse
+import os
+
+__all__ = ["Tercel"]
+
+
+class Tercel(Elaboratable):
+    """Tercel SPI controller from Raptor Engineering, nmigen wrapper.
+    remember to call Tercel.add_verilog_source
+    """
+
+    def __init__(self, bus=None, cfg_bus=None, features=None, name=None,
+                       data_width=32, spi_region_addr_width=28, pins=None,
+                       clk_freq=None,
+                       lattice_ecp5_usrmclk=False,
+                       adr_offset=0): # address offset (bytes)
+        if name is not None:
+            # convention: give the name in the format "name_number"
+            self.idx = int(name.split("_")[-1])
+        else:
+            self.idx = 0
+            name = "spi_0"
+        self.granularity = 8
+        self.data_width = data_width
+        self.dsize = log2_int(self.data_width//self.granularity)
+        self.adr_offset = adr_offset
+        self.lattice_ecp5_usrmclk = lattice_ecp5_usrmclk
+
+        # TODO, sort this out.
+        assert clk_freq is not None
+        clk_freq = round(clk_freq)
+        self.clk_freq = Const(clk_freq, 32) #clk_freq.bit_length())
+
+        # set up the wishbone busses
+        if features is None:
+            #features = frozenset({'err'}) # sigh
+            features = frozenset()
+        if bus is None:
+            bus = Interface(addr_width=spi_region_addr_width,
+                            data_width=data_width,
+                            features=features,
+                            granularity=8,
+                            name=name+"_wb_%d_0" % self.idx)
+        if cfg_bus is None:
+            cfg_bus = Interface(addr_width=6,
+                            data_width=data_width,
+                            features=features,
+                            granularity=8,
+                            name=name+"_wb_%d_1" % self.idx)
+        self.bus = bus
+        assert len(self.bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+        self.cfg_bus = cfg_bus
+        assert len(self.cfg_bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+
+        mmap = MemoryMap(addr_width=spi_region_addr_width+self.dsize,
+                        data_width=self.granularity)
+        cfg_mmap = MemoryMap(addr_width=6+self.dsize,
+                        data_width=self.granularity)
+
+        self.bus.memory_map = mmap
+        self.cfg_bus.memory_map = cfg_mmap
+
+        # QSPI signals
+        self.dq_out = Signal(4)       # Data
+        self.dq_direction = Signal(4)
+        self.dq_in = Signal(4)
+        self.cs_n_out = Signal()      # Slave select
+        self.spi_clk = Signal()       # Clock
+        self.dbg_port = Signal(8)     # debug info
+
+        # pins resource
+        self.pins = pins
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in ['wishbone_spi_master.v', 'phy.v']:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        pins, bus, cfg_bus = self.pins, self.bus, self.cfg_bus
+
+        # Calculate SPI flash address
+        spi_bus_adr = Signal(30)
+        # wb address is in words, offset is in bytes
+        comb += spi_bus_adr.eq(bus.adr - (self.adr_offset >> 2))
+
+        # urrr.... byte-reverse the config bus and data bus read/write
+        cdat_w = Signal.like(cfg_bus.dat_w)
+        cdat_r = Signal.like(cfg_bus.dat_r)
+        dat_w = Signal.like(bus.dat_w)
+        dat_r = Signal.like(bus.dat_r)
+        comb += cdat_w.eq(byte_reverse(m, "rv_cdat_w", cfg_bus.dat_w, 4))
+        comb += cfg_bus.dat_r.eq(byte_reverse(m, "rv_cdat_r", cdat_r, 4))
+        comb += dat_w.eq(byte_reverse(m, "rv_dat_w", bus.dat_w, 4))
+        comb += bus.dat_r.eq(byte_reverse(m, "rv_dat_r", dat_r, 4))
+
+        # create definition of external verilog Tercel code here, so that
+        # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        idx, bus = self.idx, self.bus
+        tercel = Instance("tercel_core",
+                            # System parameters
+                            i_sys_clk_freq = self.clk_freq,
+
+                            # Clock/reset (use DomainRenamer if needed)
+                            i_peripheral_clock=ClockSignal(),
+                            i_peripheral_reset=ResetSignal(),
+
+                            # SPI region Wishbone bus signals
+                            i_wishbone_adr=spi_bus_adr,
+                            i_wishbone_dat_w=dat_w,
+                            i_wishbone_sel=bus.sel,
+                            o_wishbone_dat_r=dat_r,
+                            i_wishbone_we=bus.we,
+                            i_wishbone_stb=bus.stb,
+                            i_wishbone_cyc=bus.cyc,
+                            o_wishbone_ack=bus.ack,
+                            #o_wishbone_err=bus.err,
+
+                            # Configuration region Wishbone bus signals
+                            i_cfg_wishbone_adr=cfg_bus.adr,
+                            i_cfg_wishbone_dat_w=cdat_w,
+                            i_cfg_wishbone_sel=cfg_bus.sel,
+                            o_cfg_wishbone_dat_r=cdat_r,
+                            i_cfg_wishbone_we=cfg_bus.we,
+                            i_cfg_wishbone_stb=cfg_bus.stb,
+                            i_cfg_wishbone_cyc=cfg_bus.cyc,
+                            o_cfg_wishbone_ack=cfg_bus.ack,
+                            #o_cfg_wishbone_err=cfg_bus.err,
+
+                            # QSPI signals
+                            o_spi_d_out=self.dq_out,
+                            o_spi_d_direction=self.dq_direction,
+                            i_spi_d_in=self.dq_in,
+                            o_spi_ss_n=self.cs_n_out,
+                            o_spi_clock=self.spi_clk,
+
+                            # debug port
+                            o_debug_port=self.dbg_port
+                            );
+
+        m.submodules['tercel_%d' % self.idx] = tercel
+
+        if pins is not None:
+            for i in range(4):
+                pad = getattr(pins, "dq%d" % i)
+                comb += pad.o.eq(self.dq_out[i])
+                comb += pad.oe.eq(self.dq_direction[i])
+                comb += self.dq_in[i].eq(pad.i)
+                # ECP5 needs special handling for the SPI clock, sigh.
+                if self.lattice_ecp5_usrmclk:
+                    comb += pad.o_clk.eq(ClockSignal())
+                    comb += pad.i_clk.eq(ClockSignal())
+            # XXX invert handled by SPIFlashResource
+            comb += pins.cs_n.eq(self.cs_n_out)
+            # ECP5 needs special handling for the SPI clock, sigh.
+            if self.lattice_ecp5_usrmclk:
+                m.submodules += Instance("USRMCLK",
+                    i_USRMCLKI  = self.spi_clk,
+                    i_USRMCLKTS = 0
+                )
+            else:
+                comb += pins.clk.eq(self.spi_clk)
+
+        return m
+
+    def ports(self):
+        return [self.bus.cyc, self.bus.stb, self.bus.ack,
+                        self.bus.dat_r, self.bus.dat_w, self.bus.adr,
+                        self.bus.we, self.bus.sel,
+                        self.cfg_bus.cyc, self.cfg_bus.stb,
+                        self.cfg_bus.ack,
+                        self.cfg_bus.dat_r, self.cfg_bus.dat_w,
+                        self.cfg_bus.adr,
+                        self.cfg_bus.we, self.cfg_bus.sel,
+                        self.dq_out, self.dq_direction, self.dq_in,
+                        self.cs_n_out, self.spi_clk
+                       ]
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    tercel = Tercel(name="spi_0", data_width=32, clk_freq=100e6)
+    create_ilang(tercel, tercel.ports(), "spi_0")
+
index 8ee79b0c03fc2e0ff216c3636835d77c428488d6..8a43e88be1b1a29e9472faceabb4630fbd1bfac1 100644 (file)
@@ -2,13 +2,14 @@
 """
 
 
-def wb_write(bus, addr, data, sel=True):
+def wb_write(bus, addr, data, sel=0b1111):
 
     # write wb
     yield bus.we.eq(1)
     yield bus.cyc.eq(1)
     yield bus.stb.eq(1)
-    yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+    #yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+    yield bus.sel.eq(sel)
     yield bus.adr.eq(addr)
     yield bus.dat_w.eq(data)
 
@@ -33,13 +34,14 @@ def wb_write(bus, addr, data, sel=True):
     yield bus.dat_w.eq(0)
 
 
-def wb_read(bus, addr, sel=True):
+def wb_read(bus, addr, sel=0b1111):
 
     # read wb
     yield bus.cyc.eq(1)
     yield bus.stb.eq(1)
     yield bus.we.eq(0)
-    yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+    #yield bus.sel.eq(0b1111 if sel else 0b1) # 32-bit / 8-bit
+    yield bus.sel.eq(sel)
     yield bus.adr.eq(addr)
 
     # wait for ack to go high
diff --git a/src/soc/bus/uart_16550.py b/src/soc/bus/uart_16550.py
new file mode 100644 (file)
index 0000000..1a900ee
--- /dev/null
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+#
+# this is a wrapper around the opencores verilog uart16550 module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen.cli import rtlil, verilog
+import os
+import tempfile
+
+__all__ = ["UART16550"]
+
+
+class UART16550(Elaboratable):
+    """16550 UART from opencores, nmigen wrapper.  remember to call
+       UART16550.add_verilog_source
+    """
+
+    def __init__(self, bus=None, features=None, name=None, data_width=32,
+                       pins=None, irq=None):
+        if name is not None:
+            # convention: give the name in the format "name_number"
+            self.idx = int(name.split("_")[-1])
+        else:
+            self.idx = 0
+            name = "uart_0"
+        self.data_width = data_width
+
+        # set up the wishbone bus
+        if features is None:
+            features = frozenset()
+        if bus is None:
+            bus = Interface(addr_width=5,
+                            data_width=data_width,
+                            features=features,
+                            granularity=8,
+                            name=name+"_wb_%d" % self.idx)
+        self.bus = bus
+        assert len(self.bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+
+        # IRQ for data buffer receive/xmit
+        if irq is None:
+            irq = Signal()
+        self.irq = irq
+
+        # 9-pin UART signals (if anyone still remembers those...)
+        self.tx_o = Signal() # transmit
+        self.rx_i = Signal() # receive
+        self.rts_o = Signal() # ready to send
+        self.cts_i = Signal() # clear to send
+        self.dtr_o = Signal() # data terminal ready
+        self.dsr_i = Signal() # data send ready
+        self.ri_i = Signal() # can't even remember what this is!
+        self.dcd_i = Signal() # or this!
+
+        # pins resource
+        self.pins = pins
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # create a temp file containing "`define DATA_BUS_WIDTH_8"
+        t = tempfile.NamedTemporaryFile(delete=False, suffix=".v")
+        t.write("`define DATA_BUS_WIDTH_8\n".encode())
+        t.flush()
+        t.seek(0)
+        platform.add_file(t.name, t)
+
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in ['raminfr.v', 'uart_defines.v', 'uart_rfifo.v',
+                      'uart_top.v', 'timescale.v', 'uart_receiver.v',
+                      'uart_sync_flops.v', 'uart_transmitter.v',
+                      'uart_debug_if.v', 'uart_regs.v',
+                      'uart_tfifo.v', 'uart_wb.v'
+                     ]:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        # create definition of external verilog 16550 uart here, so that                # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        idx, bus = self.idx, self.bus
+        uart = Instance("uart_top",
+                            # clock/reset (use DomainRenamer if needed)
+                            i_wb_clk_i=ClockSignal(),
+                            i_wb_rst_i=ResetSignal(),
+                            # wishbone bus signals
+                            i_wb_adr_i=bus.adr,
+                            i_wb_dat_i=bus.dat_w,
+                            i_wb_sel_i=bus.sel,
+                            o_wb_dat_o=bus.dat_r,
+                            i_wb_we_i=bus.we,
+                            i_wb_stb_i=bus.stb,
+                            i_wb_cyc_i=bus.cyc,
+                            o_wb_ack_o=bus.ack,
+                            # interrupt line
+                            o_int_o=self.irq,
+                            # 9-pin RS232/UART signals
+                            o_stx_pad_o=self.tx_o,
+                            i_srx_pad_i=self.rx_i,
+                            o_rts_pad_o=self.rts_o,
+                            i_cts_pad_i=self.cts_i,
+                            o_dtr_pad_o=self.dtr_o,
+                            i_dsr_pad_i=self.dsr_i,
+                            i_ri_pad_i=self.ri_i,
+                            i_dcd_pad_i=self.dcd_i
+                            );
+
+        m.submodules['uart16550_%d' % self.idx] = uart
+
+        if self.pins is not None:
+            comb += self.pins.tx.eq(self.tx_o)
+            comb += self.rx_i.eq(self.pins.rx)
+
+        return m
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    uart = UART16550(name="uart_0", data_width=8)
+    create_ilang(uart, [uart.bus.cyc, uart.bus.stb, uart.bus.ack,
+                        uart.bus.dat_r, uart.bus.dat_w, uart.bus.adr,
+                        uart.bus.we, uart.bus.sel,
+                        uart.irq,
+                        uart.tx_o, uart.rx_i, uart.rts_o, uart.cts_i,
+                        uart.dtr_o, uart.dsr_i, uart.ri_i, uart.dcd_i
+                       ], "uart_0")
+
diff --git a/src/soc/bus/wb_async.py b/src/soc/bus/wb_async.py
new file mode 100644 (file)
index 0000000..5e024c3
--- /dev/null
@@ -0,0 +1,179 @@
+# Copyright (C) 2022 Raptor Engineering, LLC <support@raptorengineering.com>
+#
+# Based partly on code from LibreSoC
+#
+# Modifications for the Libre-SOC Project funded by NLnet and NGI POINTER
+# under EU Grants 871528 and 957073, under the LGPLv3+ License
+#
+# this is a wrapper around the Verilog Wishbone Components wb_async_reg module
+
+from nmigen import (Elaboratable, Cat, Module, Signal, ClockSignal, Instance,
+                    ResetSignal, Const)
+
+from nmigen_soc.wishbone.bus import Interface
+from nmigen_soc.memory import MemoryMap
+from nmigen.utils import log2_int
+from nmigen.cli import rtlil, verilog
+from nmutil.byterev import byte_reverse
+import os
+
+__all__ = ["WBAsyncBridge"]
+
+
+class WBAsyncBridge(Elaboratable):
+    """Verilog Wishbone Components wb_async_reg module, nmigen wrapper.
+    remember to call WBAsyncBridge.add_verilog_source
+    """
+
+    def __init__(self, master_bus=None, slave_bus=None, master_features=None,
+                       slave_features=None, name=None,
+                       address_width=30, data_width=32, granularity=8,
+                       master_clock_domain=None, slave_clock_domain=None):
+        if name is not None:
+            # convention: give the name in the format "name_number"
+            self.idx = int(name.split("_")[-1])
+        else:
+            self.idx = 0
+            name = "wbasyncbridge_0"
+        self.address_width = address_width
+        self.data_width = data_width
+        self.granularity = granularity
+        self.dsize = log2_int(self.data_width//self.granularity)
+
+        # set up the clock domains
+        if master_clock_domain is None:
+            self.wb_mclk = ClockSignal()
+            self.wb_mrst = ResetSignal()
+        else:
+            self.wb_mclk = ClockSignal(master_clock_domain)
+            self.wb_mrst = ResetSignal(master_clock_domain)
+        if slave_clock_domain is None:
+            self.wb_sclk = ClockSignal()
+            self.wb_srst = ResetSignal()
+        else:
+            self.wb_sclk = ClockSignal(slave_clock_domain)
+            self.wb_srst = ResetSignal(slave_clock_domain)
+
+        # set up the wishbone busses
+        if master_features is None:
+            master_features = frozenset()
+        if slave_features is None:
+            slave_features = frozenset()
+        if master_bus is None:
+            master_bus = Interface(addr_width=self.address_width,
+                            data_width=self.data_width,
+                            features=master_features,
+                            granularity=self.granularity,
+                            name=name+"_wb_%d_master" % self.idx)
+        if slave_bus is None:
+            slave_bus = Interface(addr_width=self.address_width,
+                            data_width=self.data_width,
+                            features=slave_features,
+                            granularity=self.granularity,
+                            name=name+"_wb_%d_slave" % self.idx)
+        self.master_bus = master_bus
+        assert len(self.master_bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+        self.slave_bus = slave_bus
+        assert len(self.slave_bus.dat_r) == data_width, \
+                        "bus width must be %d" % data_width
+
+    @classmethod
+    def add_verilog_source(cls, verilog_src_dir, platform):
+        # add each of the verilog sources, needed for when doing platform.build
+        for fname in ['wb_async_reg.v']:
+            # prepend the src directory to each filename, add its contents
+            fullname = os.path.join(verilog_src_dir, fname)
+            with open(fullname) as f:
+                platform.add_file(fullname, f)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        master_bus, slave_bus = self.master_bus, self.slave_bus
+        slave_err = Signal()
+        slave_rty = Signal()
+
+        # create definition of external verilog bridge code here, so that
+        # nmigen understands I/O directions (defined by i_ and o_ prefixes)
+        idx = self.idx
+        wb_async_bridge = Instance("wb_async_reg",
+                            # Parameters
+                            p_ADDR_WIDTH=self.address_width,
+                            p_DATA_WIDTH=self.data_width,
+                            # width of select is the data width
+                            # *divided* by the data granularity.
+                            # data_width=32-bit, data granularity=8-bit,
+                            # select_width ==> 32/8 ==> 4
+                            p_SELECT_WIDTH=self.data_width//self.granularity,
+
+                            # Clocks/resets
+                            i_wbm_clk=self.wb_mclk,
+                            i_wbm_rst=self.wb_mrst,
+                            i_wbs_clk=self.wb_sclk,
+                            i_wbs_rst=self.wb_srst,
+
+                            # Master Wishbone bus signals
+                            i_wbm_adr_i=self.master_bus.adr,
+                            i_wbm_dat_i=self.master_bus.dat_w,
+                            o_wbm_dat_o=self.master_bus.dat_r,
+                            i_wbm_we_i=self.master_bus.we,
+                            i_wbm_sel_i=self.master_bus.sel,
+                            i_wbm_stb_i=self.master_bus.stb,
+                            i_wbm_cyc_i=self.master_bus.cyc,
+                            o_wbm_ack_o=self.master_bus.ack,
+                            #o_wbm_err=self.master_bus.err,
+                            #o_wbm_rty_i=self.master_bus.rty,
+
+                            # Slave Wishbone bus signals
+                            o_wbs_adr_o=self.slave_bus.adr,
+                            i_wbs_dat_i=self.slave_bus.dat_r,
+                            o_wbs_dat_o=self.slave_bus.dat_w,
+                            o_wbs_we_o=self.slave_bus.we,
+                            o_wbs_sel_o=self.slave_bus.sel,
+                            o_wbs_stb_o=self.slave_bus.stb,
+                            o_wbs_cyc_o=self.slave_bus.cyc,
+                            i_wbs_ack_i=self.slave_bus.ack,
+                            i_wbs_err_i=slave_err,
+                            i_wbs_rty_i=slave_rty
+                            );
+
+        # Wire unused signals to 0
+        comb += slave_err.eq(0)
+        comb += slave_rty.eq(0)
+
+        m.submodules['wb_async_bridge_%d' % self.idx] = wb_async_bridge
+
+        return m
+
+    def ports(self):
+        return [self.master_bus.adr, self.master_bus.dat_w,
+                        self.master_bus.dat_r,
+                        self.master_bus.we, self.master_bus.sel,
+                        self.master_bus.stb,
+                        self.master_bus.cyc, self.master_bus.ack,
+                        self.master_bus.err,
+                        self.master_bus.rty,
+                        self.slave_bus.adr, self.slave_bus.dat_w,
+                        self.slave_bus.dat_r,
+                        self.slave_bus.we, self.slave_bus.sel,
+                        self.slave_bus.stb,
+                        self.slave_bus.cyc, self.slave_bus.ack,
+                        self.slave_bus.err,
+                        self.slave_bus.rty
+                       ]
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+def create_verilog(dut, ports, test_name):
+    vl = verilog.convert(dut, name=test_name, ports=ports)
+    with open("%s.v" % test_name, "w") as f:
+        f.write(vl)
+
+
+if __name__ == "__main__":
+    wbasyncbridge = WBAsyncBridge(name="wbasyncbridge_0", address_width=30, data_width=32, granularity=8)
+    create_ilang(wbasyncbridge, wbasyncbridge.ports(), "wbasyncbridge_0")
index 2fe2a921c4631a54ee67b06e3fab276ecbe36e92..fbf8239fe1c1ce157a8abb3e77ad0530c4058a3c 100644 (file)
@@ -47,20 +47,17 @@ class WishboneDownConvert(Elaboratable):
         shift_reg = Signal(dw_from)
 
         counter = Signal(log2_int(ratio, False))
-        counter_reset = Signal()
-        counter_ce = Signal()
-        with m.If(counter_reset):
-            sync += counter.eq(0)
-        with m.Elif(counter_ce):
-            sync += counter.eq(counter + 1)
+        cur_counter = Signal(log2_int(ratio, False))
 
         counter_done = Signal()
         comb += counter_done.eq(counter == ratio-1)
+        comb += cur_counter.eq(counter)
+        skip = Signal()
 
         # Main FSM
         with m.FSM() as fsm:
             with m.State("IDLE"):
-                comb += counter_reset.eq(1)
+                sync += counter.eq(0)
                 sync += cached_data.eq(0)
                 with m.If(master.stb & master.cyc):
                     with m.If(master.we):
@@ -70,12 +67,13 @@ class WishboneDownConvert(Elaboratable):
 
             with m.State("WRITE"):
                 comb += write.eq(1)
-                comb += slave.we.eq(1)
-                comb += slave.cyc.eq(1)
                 with m.If(master.stb & master.cyc):
+                    comb += skip.eq(slave.sel == 0)
+                    comb += slave.we.eq(1)
+                    comb += slave.cyc.eq(1)
                     comb += slave.stb.eq(1)
-                    with m.If(slave.ack):
-                        comb += counter_ce.eq(1)
+                    with m.If(slave.ack | skip):
+                        sync += counter.eq(counter + 1)
                         with m.If(counter_done):
                             comb += master.ack.eq(1)
                             m.next = "IDLE"
@@ -84,11 +82,13 @@ class WishboneDownConvert(Elaboratable):
 
             with m.State("READ"):
                 comb += read.eq(1)
-                comb += slave.cyc.eq(1)
                 with m.If(master.stb & master.cyc):
+                    comb += skip.eq(slave.sel == 0)
+                    comb += slave.cyc.eq(1)
                     comb += slave.stb.eq(1)
-                    with m.If(slave.ack):
-                        comb += counter_ce.eq(1)
+                    with m.If(slave.ack | skip):
+                        comb += cur_counter.eq(counter + 1) # TODO use Picker
+                        sync += counter.eq(cur_counter)
                         with m.If(counter_done):
                             comb += master.ack.eq(1)
                             comb += master.dat_r.eq(shift_reg)
@@ -102,7 +102,7 @@ class WishboneDownConvert(Elaboratable):
                 comb += slave.cti.eq(7) # indicate end of burst
             with m.Else():
                 comb += slave.cti.eq(2)
-        comb += slave.adr.eq(Cat(counter, master.adr))
+        comb += slave.adr.eq(Cat(cur_counter, master.adr))
 
         # write Datapath - select fragments of data, depending on "counter"
         with m.Switch(counter):
@@ -117,7 +117,7 @@ class WishboneDownConvert(Elaboratable):
         # read Datapath - uses cached_data and master.dat_r as a shift-register.
         # by the time "counter" is done (counter_done) this is complete
         comb += shift_reg.eq(Cat(cached_data[dw_to:], slave.dat_r))
-        with m.If(read & counter_ce):
+        with m.If(read & (slave.ack | skip)):
             sync += cached_data.eq(shift_reg)
 
 
index a73a89bc6c5b2fe8962d0072a0b2939010ffd3fe..35a9ddec0d230aa8f6354871faad6aa202dd7a33 100644 (file)
@@ -18,6 +18,18 @@ class ConfigFetchUnit:
                    'bare_wb': BareFetchUnit,
                    #'test_cache_wb': TestCacheFetchUnit
                   }
+        self.pspec = pspec
+        if self.pspec.imem_ifacetype in ['mmu_cache_wb', 'test_mmu_cache_wb']:
+            # XXX BLECH! use pspec to transfer the I-Cache which is
+            # created down inside LoadStore1!
+            self.fu = icache = pspec.icache # ICache already FetchUnitInterface
+            # tell I-Cache to connect up to its FetchUnitInterface
+            icache.use_fetch_interface()
+            return
+
         fukls = fudict[pspec.imem_ifacetype]
         self.fu = fukls(pspec)
 
+    def wb_bus(self):
+        return self.fu.ibus
+
index a1828c6a4aa434650270ec5421737ea11ee65aa8..9ebe4f7cd2ee18f0af9545fef246298a58401095 100644 (file)
@@ -3,6 +3,8 @@ import sys
 import json
 from pprint import pprint
 from collections import OrderedDict
+from openpower.util import log
+from nmigen.build.dsl import Resource, Subsignal, Pins
 
 
 def _byteify(data, ignore_dicts = False):
@@ -25,7 +27,40 @@ def _byteify(data, ignore_dicts = False):
     return data
 
 
+def get_pinspec_resources(chipname=None, subset=None, conn=None):
+    """get_pinspec_resources - returns an auto-generated list of resources
+    """
+    chip = load_pinouts(chipname)
+    pinmap = chip['pins.map']
+    specs = []
+    for k, bus in chip['pins.specs'].items():
+        k, num = k.lower().split(":")
+        name = '%s%s' % (k, num)
+        if subset is None or name in subset:
+            io = []
+            for pin in bus:
+                pin = pin.lower()
+                pin, pin_dir = pin[:-1], pin[-1] # split pin+ into pin, +
+                pname = '%s_%s' % (name, pin)
+                if pname in pinmap:
+                    newpin = pinmap[pname][2:]
+                    newpin = '_'.join(newpin.split("_")[1:])
+                    # turn direction into nmigen Pins direction format
+                    dirn = {'-': 'i', '+': 'o', '*': 'io'}[pin_dir]
+                # TODO: make assert_width not have to be 1
+                p = Pins(newpin, dir=dirn, conn=conn, assert_width=1)
+                io.append(Subsignal(pin, p))
+            spec = Resource.family(name, num, default_name=name, ios=io)
+            log("pinspec", name, repr(spec))
+            specs.append(spec)
+    return specs
+
+
 def get_pinspecs(chipname=None, subset=None):
+    """get_pinspecs - returns a dictionary of lists of pins for an IO function
+    example: {'uart': ['tx+', 'rx-'],
+             'i2c': ['sda*', 'scl+']}
+    """
     chip = load_pinouts(chipname)
     pinmap = chip['pins.map']
     specs = OrderedDict() # preserve order
@@ -62,7 +97,8 @@ def load_pinouts(chipname=None):
     pth = os.path.split(pth)[0]
 
     # path is relative to this filename, in the pinmux submodule
-    fname = "%s/../../../pinmux/%s/litex_pinpads.json" % (pth, chipname)
+    pinmux = os.getenv("PINMUX", "%s/../../../pinmux" % pth)
+    fname = "%s/%s/fabric_pinpads.json" % (pinmux, chipname)
     with open(fname) as f:
         txt = f.read()
 
@@ -73,7 +109,12 @@ def load_pinouts(chipname=None):
     return chip
 
 if __name__ == '__main__':
-    if sys.argv == 2:
+    # run this with:
+    # git submodule update --init --remote --recursive
+    # make mkpinmux
+    # python3 soc/config/pinouts.py ngi_pointer (or ls180, or other)
+    # it will print out a stack of debug stuff
+    if len(sys.argv) == 2:
         chipname = sys.argv[1]
     else:
         chipname = None
@@ -81,3 +122,5 @@ if __name__ == '__main__':
     for k, v in chip.items():
         print ("\n****", k, "****")
         pprint(v)
+    print ("chipname pinspec resources", sys.argv, chipname)
+    specs = get_pinspec_resources(chipname, subset=None)
index df9caf68871a7100950ec748a5c27b30d4ce3399..39437b3c94d63ee86785c8350438f72238e6b595 100644 (file)
@@ -13,21 +13,22 @@ import sys
 sys.setrecursionlimit(10**6)
 
 
-def read_from_addr(dut, addr):
+def read_from_addr(dut, addr, stall=True):
     yield dut.a_pc_i.eq(addr)
-    yield dut.a_valid_i.eq(1)
-    yield dut.f_valid_i.eq(1)
-    yield dut.a_stall_i.eq(1)
-    yield
-    yield dut.a_stall_i.eq(0)
+    yield dut.a_i_valid.eq(1)
+    yield dut.f_i_valid.eq(1)
+    if stall:
+        yield dut.a_stall_i.eq(1)
+        yield
+        yield dut.a_stall_i.eq(0)
     yield
     yield Settle()
     while (yield dut.f_busy_o):
         yield
     res = (yield dut.f_instr_o)
 
-    yield dut.a_valid_i.eq(0)
-    yield dut.f_valid_i.eq(0)
+    yield dut.a_i_valid.eq(0)
+    yield dut.f_i_valid.eq(0)
     yield
     return res
 
index 02c491f98c8aec9b4be3f4ead326468cdac25300..91435bd70edae0137b138e315f38453f5f0918f5 100644 (file)
@@ -16,9 +16,9 @@ def write_to_addr(dut, addr, value):
     yield dut.x_st_data_i.eq(value)
     yield dut.x_st_i.eq(1)
     yield dut.x_mask_i.eq(-1)
-    yield dut.x_valid_i.eq(1)
+    yield dut.x_i_valid.eq(1)
     yield dut.x_stall_i.eq(1)
-    yield dut.m_valid_i.eq(1)
+    yield dut.m_i_valid.eq(1)
     yield
     yield
 
@@ -33,7 +33,7 @@ def write_to_addr(dut, addr, value):
 def read_from_addr(dut, addr):
     yield dut.x_addr_i.eq(addr)
     yield dut.x_ld_i.eq(1)
-    yield dut.x_valid_i.eq(1)
+    yield dut.x_i_valid.eq(1)
     yield dut.x_stall_i.eq(1)
     yield
     yield dut.x_stall_i.eq(0)
@@ -42,7 +42,7 @@ def read_from_addr(dut, addr):
     yield Settle()
     while (yield dut.x_busy_o):
         yield
-    assert (yield dut.x_valid_i)
+    assert (yield dut.x_i_valid)
     return (yield dut.m_ld_data_o)
 
 
@@ -53,8 +53,8 @@ def write_byte(dut, addr, val):
     yield dut.x_st_i.eq(1)
     yield dut.x_mask_i.eq(1 << offset)
     print("write_byte", addr, bin(1 << offset), hex(val << (offset*8)))
-    yield dut.x_valid_i.eq(1)
-    yield dut.m_valid_i.eq(1)
+    yield dut.x_i_valid.eq(1)
+    yield dut.m_i_valid.eq(1)
 
     yield
     yield dut.x_st_i.eq(0)
@@ -66,13 +66,13 @@ def read_byte(dut, addr):
     offset = addr & 0x3
     yield dut.x_addr_i.eq(addr)
     yield dut.x_ld_i.eq(1)
-    yield dut.x_valid_i.eq(1)
+    yield dut.x_i_valid.eq(1)
     yield
     yield dut.x_ld_i.eq(0)
     yield Settle()
     while (yield dut.x_busy_o):
         yield
-    assert (yield dut.x_valid_i)
+    assert (yield dut.x_i_valid)
     val = (yield dut.m_ld_data_o)
     print("read_byte", addr, offset, hex(val))
     return (val >> (offset * 8)) & 0xff
index eb20177c2117e43ff1355be96877950d0b712bdb..96cadef23e4c6fd3e4a7c177cc703a664c5d6c3c 100644 (file)
@@ -6,24 +6,30 @@ from nmigen.cli import rtlil
 import unittest
 from soc.config.test.test_loadstore import TestMemPspec
 from soc.config.loadstore import ConfigMemoryPortInterface
+from openpower.exceptions import LDSTExceptionTuple
 
 
-def wait_busy(port, no=False):
+def wait_busy(port, no=False, debug=None):
+    cnt = 0
     while True:
         busy = yield port.busy_o
-        print("busy", no, busy)
+        print("busy", no, busy, cnt, debug)
         if bool(busy) == no:
             break
         yield
+        cnt += 1
 
 
-def wait_addr(port):
+def wait_addr(port,debug=None):
+    cnt = 0
     while True:
         addr_ok = yield port.addr_ok_o
-        print("addrok", addr_ok)
-        if addr_ok:
+        exc_happened = yield port.exc_o.happened
+        print("addrok", addr_ok,cnt,debug,exc_happened)
+        if addr_ok or exc_happened:
             break
         yield
+        cnt += 1
 
 
 def wait_ldok(port):
@@ -38,20 +44,48 @@ def wait_ldok(port):
         yield
 
 
-def pi_st(port1, addr, data, datalen, msr_pr=0):
+def pi_st(port1, addr, data, datalen, msr, is_dcbz=0):
 
     # have to wait until not busy
-    yield from wait_busy(port1, no=False)    # wait until not busy
+    yield from wait_busy(port1,debug="pi_st_A") # wait while busy
 
     # set up a ST on the port.  address first:
+    yield port1.is_dcbz_i.eq(is_dcbz)  # reset dcbz too
     yield port1.is_st_i.eq(1)  # indicate ST
     yield port1.data_len.eq(datalen)  # ST length (1/2/4/8)
-    yield port1.msr_pr.eq(msr_pr)  # MSR PR bit (1==>virt, 0==>real)
+    yield port1.priv_mode.eq(~msr.pr)
+    yield port1.virt_mode.eq(msr.dr)
+    yield port1.mode_32bit.eq(~msr.sf)
 
     yield port1.addr.data.eq(addr)  # set address
     yield port1.addr.ok.eq(1)  # set ok
     yield Settle()
+
+    # must check exception even before waiting for address.
+    # XXX TODO: wait_addr should check for exception
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        print("print fast ST exception happened")
+        yield # MUST wait for one clock cycle before de-asserting these
+        yield port1.is_st_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+        return "fast", exc_info
+
     yield from wait_addr(port1)             # wait until addr ok
+
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        print("print fast ST exception happened")
+        yield # MUST wait for one clock cycle before de-asserting these
+        yield port1.is_st_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+        return "fast", exc_info
+
+
     # yield # not needed, just for checking
     # yield # not needed, just for checking
     # assert "ST" for one cycle (required by the API)
@@ -59,44 +93,95 @@ def pi_st(port1, addr, data, datalen, msr_pr=0):
     yield port1.st.ok.eq(1)
     yield
     yield port1.st.ok.eq(0)
-    yield from wait_busy(port1, True)    # wait while busy
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        print("print fast ST exception happened")
+        yield # MUST wait for one clock cycle before de-asserting these
+        yield port1.is_st_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+        return "fast", exc_info
+
+    yield from wait_busy(port1,debug="pi_st_E") # wait while busy
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        yield  # needed if mmu/dache is used
+        yield port1.is_st_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+        yield  # needed if mmu/dache is used
+        return "slow", exc_info
 
     # can go straight to reset.
     yield port1.is_st_i.eq(0)  # end
     yield port1.addr.ok.eq(0)  # set !ok
+    yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+    yield  # needed if mmu/dache is used
+
+    return None, None
+
+def get_exception_info(exc_o):
+    attrs = []
+    for fname in LDSTExceptionTuple._fields:
+        attr = getattr(exc_o, fname)
+        val = yield attr
+        attrs.append(val)
+    return LDSTExceptionTuple(*attrs)
+
 
+# copy of pi_st removed
 
-def pi_ld(port1, addr, datalen, msr_pr=0):
+def pi_ld(port1, addr, datalen, msr):
 
     # have to wait until not busy
-    yield from wait_busy(port1, no=False)    # wait until not busy
+    yield from wait_busy(port1,debug="pi_ld_A") # wait while busy
 
     # set up a LD on the port.  address first:
     yield port1.is_ld_i.eq(1)  # indicate LD
     yield port1.data_len.eq(datalen)  # LD length (1/2/4/8)
-    yield port1.msr_pr.eq(msr_pr)  # MSR PR bit (1==>virt, 0==>real)
+    yield port1.priv_mode.eq(~msr.pr)
+    yield port1.virt_mode.eq(msr.dr)
+    yield port1.mode_32bit.eq(~msr.sf)
 
     yield port1.addr.data.eq(addr)  # set address
     yield port1.addr.ok.eq(1)  # set ok
     yield Settle()
     yield from wait_addr(port1)             # wait until addr ok
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        print("print fast LD exception happened")
+        yield # MUST wait for one clock cycle before de-asserting these
+        yield port1.is_ld_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        return None, "fast", exc_info
+
     yield
     yield from wait_ldok(port1)             # wait until ld ok
     data = yield port1.ld.data
+    exc_info = yield from get_exception_info(port1.exc_o)
     exc_happened = yield port1.exc_o.happened
+    exc_happened = exc_info.happened
 
     # cleanup
     yield port1.is_ld_i.eq(0)  # end
     yield port1.addr.ok.eq(0)  # set !ok
     if exc_happened:
-        return 0
+        return None, "slow", exc_info
 
-    yield from wait_busy(port1, no=False)    # wait while not busy
+    yield from wait_busy(port1, debug="pi_ld_E") # wait while busy
 
-    return data
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        return None, "slow", exc_info
 
+    return data, None, None
 
-def pi_ldst(arg, dut, msr_pr=0):
+
+def pi_ldst(arg, dut, msr):
 
     # do two half-word stores at consecutive addresses, then two loads
     addr1 = 0x04
@@ -104,16 +189,19 @@ def pi_ldst(arg, dut, msr_pr=0):
     data = 0xbeef
     data2 = 0xf00f
     #data = 0x4
-    yield from pi_st(dut, addr1, data, 2, msr_pr)
-    yield from pi_st(dut, addr2, data2, 2, msr_pr)
-    result = yield from pi_ld(dut, addr1, 2, msr_pr)
-    result2 = yield from pi_ld(dut, addr2, 2, msr_pr)
+    assert(yield from pi_st(dut, addr1, data, 2, msr) is None)
+    assert(yield from pi_st(dut, addr2, data2, 2, msr) is None)
+    result, exc = yield from pi_ld(dut, addr1, 2, msr)
+    result2, exc2 = yield from pi_ld(dut, addr2, 2, msr)
+    assert(exc is None)
+    assert(exc2 is None)
     arg.assertEqual(data, result, "data %x != %x" % (result, data))
     arg.assertEqual(data2, result2, "data2 %x != %x" % (result2, data2))
 
     # now load both in a 32-bit load to make sure they're really consecutive
     data3 = data | (data2 << 16)
-    result3 = yield from pi_ld(dut, addr1, 4, msr_pr)
+    result3, exc3 = yield from pi_ld(dut, addr1, 4, msr)
+    assert(exc3 is None)
     arg.assertEqual(data3, result3, "data3 %x != %x" % (result3, data3))
 
 
@@ -123,7 +211,7 @@ def tst_config_pi(testcls, ifacetype):
     dut = Module()
     pspec = TestMemPspec(ldst_ifacetype=ifacetype,
                          imem_ifacetype='',
-                         addr_wid=48,
+                         addr_wid=64,
                          mask_wid=8,
                          reg_wid=64)
     cmpi = ConfigMemoryPortInterface(pspec)
@@ -138,8 +226,9 @@ def tst_config_pi(testcls, ifacetype):
                    vcd_name='test_pi_%s.vcd' % ifacetype)
 
 
+# FIXME: TypeError: pi_ldst() missing 1 required positional argument: 'msr'
+@unittest.skip('broken')
 class TestPIMem(unittest.TestCase):
-
     def test_pi_mem(self):
         tst_config_pi(self, 'testpi')
 
diff --git a/src/soc/debug/.gitignore b/src/soc/debug/.gitignore
new file mode 100644 (file)
index 0000000..8edaee0
--- /dev/null
@@ -0,0 +1 @@
+ls180_pins.py
index 4d897699737672ee9b58ebd8e2f585eed4f52803..03bd8dc8eabcde75a191147a666ad9086b120ac5 100644 (file)
@@ -11,12 +11,13 @@ from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
 from nmigen.cli import rtlil
 from soc.config.state import CoreState
+from openpower.consts import FastRegsEnum
 
 
 # DMI register addresses
 class DBGCore:
-    CTRL         = 0b0000
-    STAT         = 0b0001
+    CTRL         = 0b0000 # Control: start/stop/reset
+    STAT         = 0b0001 # Status (read started/stopped/stopping)
     NIA          = 0b0010 # NIA register (read only for now)
     MSR          = 0b0011 # MSR (read only)
     GSPR_IDX     = 0b0100 # GSPR register index
@@ -26,6 +27,7 @@ class DBGCore:
     CR           = 0b1000 # CR (read only)
     XER          = 0b1001 # XER (read only) - note this is a TEMPORARY hack
     SVSTATE      = 0b1010 # SVSTATE register (read only for now)
+    STOPADDR     = 0b1011 # Address at which the core automatically stops
 
 
 # CTRL register (direct actions, write 1 to act, read back 0)
@@ -98,20 +100,17 @@ class CoreDebug(Elaboratable):
         self.core_stop_o       = Signal()
         self.core_rst_o        = Signal()
         self.icache_rst_o      = Signal()
+        self.stopping_o = Signal(name="stopping")
 
         # Core status inputs
         self.terminate_i    = Signal()
         self.core_stopped_i = Signal()
         self.state = CoreState("core_dbg")
 
-        # GSPR register read port
-        self.d_gpr = DbgReg("d_gpr")
-
-        # CR register read port
-        self.d_cr = DbgReg("d_cr")
-
-        # XER register read port
-        self.d_xer = DbgReg("d_xer")
+        self.d_gpr = DbgReg("d_gpr") # GSPR register read port
+        self.d_fast = DbgReg("d_fast") # GSPR register read port
+        self.d_cr = DbgReg("d_cr")   # CR register read port
+        self.d_xer = DbgReg("d_xer") # XER register read port
 
         # Core logging data
         self.log_data_i        = Signal(256)
@@ -119,6 +118,10 @@ class CoreDebug(Elaboratable):
         self.log_read_data_o   = Signal(64)
         self.log_write_addr_o  = Signal(32)
 
+        # address at which the processor stops automatically
+        # set to 0xffffffffffffffff by default (impossible to reach)
+        self.stop_addr_o = Signal(64, reset=-1)
+
         # Misc
         self.terminated_o  = Signal()
 
@@ -127,6 +130,7 @@ class CoreDebug(Elaboratable):
         m = Module()
         comb, sync = m.d.comb, m.d.sync
         dmi, d_gpr, d_cr, d_xer, = self.dmi, self.d_gpr, self.d_cr, self.d_xer
+        d_fast = self.d_fast
 
         # DMI needs fixing... make a one clock pulse
         dmi_req_i_1 = Signal()
@@ -135,13 +139,17 @@ class CoreDebug(Elaboratable):
         stat_reg = Signal(64)
 
         # Some internal latches
-        stopping     = Signal()
+        stopping     = self.stopping_o
         do_step      = Signal()
         do_reset     = Signal()
         do_icreset   = Signal()
         terminated   = Signal()
         do_gspr_rd   = Signal()
+        # select either GPRs or FAST regs to read, based on GSPR_IDX
         gspr_index   = Signal.like(d_gpr.addr)
+        fast_index   = Signal.like(d_gpr.addr)
+        gspr_en      = Signal()
+        fast_en      = Signal()
 
         log_dmi_addr = Signal(32)
         log_dmi_data = Signal(64)
@@ -151,11 +159,15 @@ class CoreDebug(Elaboratable):
 
         LOG_INDEX_BITS = log2_int(self.LOG_LENGTH)
 
-        # Single cycle register accesses on DMI except for GSPR data
+        # Single cycle register accesses on DMI except for registers
         with m.Switch(dmi.addr_i):
             with m.Case(DBGCore.GSPR_DATA):
-                comb += dmi.ack_o.eq(d_gpr.ack)
-                comb += d_gpr.req.eq(dmi.req_i)
+                with m.If(gspr_en): # GPR requested, acknowledge GPR
+                    comb += dmi.ack_o.eq(d_gpr.ack)
+                    comb += d_gpr.req.eq(dmi.req_i)
+                with m.If(fast_en): # FAST requested
+                    comb += dmi.ack_o.eq(d_fast.ack)
+                    comb += d_fast.req.eq(dmi.req_i)
             with m.Case(DBGCore.CR):
                 comb += dmi.ack_o.eq(d_cr.ack)
                 comb += d_cr.req.eq(dmi.req_i)
@@ -163,6 +175,7 @@ class CoreDebug(Elaboratable):
                 comb += dmi.ack_o.eq(d_xer.ack)
                 comb += d_xer.req.eq(dmi.req_i)
             with m.Default():
+                # everything else is immediate-acknowledgement (combinatorial)
                 comb += dmi.ack_o.eq(dmi.req_i)
 
         # Status register read composition (DBUG_CORE_STAT_xxx)
@@ -172,24 +185,29 @@ class CoreDebug(Elaboratable):
 
         # DMI read data mux
         with m.Switch(dmi.addr_i):
-            with m.Case( DBGCore.STAT):
+            with m.Case( DBGCore.STAT):               # Status register
                 comb += dmi.dout.eq(stat_reg)
-            with m.Case( DBGCore.NIA):
+            with m.Case( DBGCore.NIA):                # NIA (PC)
                 comb += dmi.dout.eq(self.state.pc)
-            with m.Case( DBGCore.MSR):
+            with m.Case( DBGCore.MSR):                # MSR
                 comb += dmi.dout.eq(self.state.msr)
-            with m.Case( DBGCore.SVSTATE):
+            with m.Case( DBGCore.SVSTATE):            # SVSTATE
                 comb += dmi.dout.eq(self.state.svstate)
-            with m.Case( DBGCore.GSPR_DATA):
-                comb += dmi.dout.eq(d_gpr.data)
-            with m.Case( DBGCore.LOG_ADDR):
+            with m.Case( DBGCore.GSPR_DATA):          # GPR/FAST regs
+                with m.If(gspr_en):
+                    comb += dmi.dout.eq(d_gpr.data)   # GPR data selected
+                with m.If(fast_en):
+                    comb += dmi.dout.eq(d_fast.data)  # FAST reg read selected
+            with m.Case( DBGCore.LOG_ADDR):           # Logging
                 comb += dmi.dout.eq(Cat(log_dmi_addr, self.log_write_addr_o))
             with m.Case( DBGCore.LOG_DATA):
                 comb += dmi.dout.eq(log_dmi_data)
-            with m.Case(DBGCore.CR):
+            with m.Case(DBGCore.CR):                  # CR
                 comb += dmi.dout.eq(d_cr.data)
-            with m.Case(DBGCore.XER):
+            with m.Case(DBGCore.XER):                 # XER
                 comb += dmi.dout.eq(d_xer.data)
+            with m.Case(DBGCore.STOPADDR):            # Halt PC
+                comb += dmi.dout.eq(self.stop_addr_o)
 
         # DMI writes
         # Reset the 1-cycle "do" signals
@@ -224,12 +242,31 @@ class CoreDebug(Elaboratable):
 
                 # GSPR address
                 with m.Elif(dmi.addr_i == DBGCore.GSPR_IDX):
-                    sync += gspr_index.eq(dmi.din)
+                    sync += gspr_index.eq(0)
+                    sync += fast_index.eq(0)
+                    sync += gspr_en.eq(0)
+                    sync += fast_en.eq(0)
+                    with m.If(dmi.din <= 31):
+                        sync += gspr_index.eq(dmi.din)
+                        sync += gspr_en.eq(1)
+                    # cover the FastRegs LR, CTR, SRR0, SRR1 etc.
+                    # numbering is from microwatt
+                    for x, i in FastRegsEnum.__dict__.items():
+                        if not isinstance(i, int) or x == 'N_REGS':
+                            continue
+                        with m.If(dmi.din == 32+i):
+                            sync += fast_index.eq(i)
+                            sync += fast_en.eq(1)
 
                 # Log address
                 with m.Elif(dmi.addr_i == DBGCore.LOG_ADDR):
                     sync += log_dmi_addr.eq(dmi.din)
                     sync += do_dmi_log_rd.eq(1)
+
+                # set PC Halt address
+                with m.Elif(dmi.addr_i == DBGCore.STOPADDR):
+                    sync += self.stop_addr_o.eq(dmi.din)
+
             with m.Else():
                 # sync += Display("DMI read from " & to_string(dmi_addr))
                 pass
@@ -252,12 +289,16 @@ class CoreDebug(Elaboratable):
             sync += terminated.eq(1)
 
         comb += d_gpr.addr.eq(gspr_index)
+        comb += d_fast.addr.eq(fast_index)
 
         # Core control signals generated by the debug module
-        comb += self.core_stop_o.eq(stopping & ~do_step)
+        # Note: make stop and terminated synchronous, to help with timing
+        # however this *may* interfere with some of the DMI-based unit tests
+        # so has to be kept an eye on
+        sync += self.core_stop_o.eq((stopping & ~do_step) | self.terminate_i)
+        sync += self.terminated_o.eq(terminated | self.terminate_i)
         comb += self.core_rst_o.eq(do_reset)
         comb += self.icache_rst_o.eq(do_icreset)
-        comb += self.terminated_o.eq(terminated)
 
         # Logging RAM (none)
 
@@ -356,6 +397,7 @@ class CoreDebug(Elaboratable):
         yield from self.d_gpr
         yield from self.d_cr
         yield from self.d_xer
+        yield from self.d_fast
         yield self.log_data_i
         yield self.log_read_addr_i
         yield self.log_read_data_o
index a642bd1f358cbab7f639505a3f5475ec0625ae7c..79418e28ed27c88037fac58268e758f2aedfacbc 100644 (file)
@@ -157,7 +157,7 @@ class JTAGServer:
 
     def jtagremote_server_recv(self, tdo):
         data = self.get_data(1, 0) # read 1 byte, non-blocking
-        if data is None:
+        if data is None or len(data) == 0:
             return None # no data read
         data = bytes.decode(data)
         if self.debug:
index 528aa34ab2e19fb8bc29a948b8db3e967f25cb17..6c984ed34a6c776f5f2f5a4d89889c2533c36e18 100644 (file)
@@ -25,11 +25,16 @@ def tms_state_set(dut, bits):
         yield
     yield dut.bus.tms.eq(0)
 
+def tms_data_getset(dut, tms, d_len, d_in=0, reverse=False):
+    if reverse:
+        # Reverse the for loop to transmit MSB-first
+        bit_range = range(d_len-1, -1, -1)
+    else:
+        bit_range = range(d_len)
 
-def tms_data_getset(dut, tms, d_len, d_in=0):
     res = 0
     yield dut.bus.tms.eq(tms)
-    for i in range(d_len):
+    for i in bit_range:
         tdi = 1 if (d_in & (1<<i)) else 0
         yield dut.bus.tck.eq(1)
         res |= (1<<i) if (yield dut.bus.tdo) else 0
@@ -58,14 +63,14 @@ def jtag_set_idle(dut):
     yield from tms_state_set(dut, [1, 1, 0])
 
 
-def jtag_read_write_reg(dut, addr, d_len, d_in=0):
+def jtag_read_write_reg(dut, addr, d_len, d_in=0, reverse=False):
     yield from jtag_set_run(dut)
     yield from jtag_set_shift_ir(dut)
     yield from tms_data_getset(dut, 0, dut._ir_width, addr)
     yield from jtag_set_idle(dut)
 
     yield from jtag_set_shift_dr(dut)
-    result = yield from tms_data_getset(dut, 0, d_len, d_in)
+    result = yield from tms_data_getset(dut, 0, d_len, d_in, reverse)
     yield from jtag_set_idle(dut)
     return result
 
@@ -115,7 +120,7 @@ def jtag_sim(dut):
     # read DMI CTRL register
     status = yield from jtag_read_write_reg(dut, DMI_READ, 64)
     print ("dmi ctrl status", hex(status))
-    assert status == 0
+    assert status == 6
 
     # write DMI MSR address
     yield from jtag_read_write_reg(dut, DMI_ADDR, 8, DBGCore.MSR)
index a72145754974946a705cb126d68064e5cf6ab5e1..b92f41ee355005e0782abf0ec50169adb30fa46f 100644 (file)
@@ -188,7 +188,7 @@ def jtag_sim(dut, srv_dut):
     # read DMI CTRL register
     status = yield from jtag_read_write_reg(dut, DMI_READ, 64)
     print ("dmi ctrl status", hex(status))
-    assert status == 0
+    assert status == 6
 
     # write DMI MSR address
     yield from jtag_read_write_reg(dut, DMI_ADDR, 8, DBGCore.MSR)
@@ -221,7 +221,7 @@ def jtag_sim(dut, srv_dut):
 
 
 if __name__ == '__main__':
-    dut = JTAG(test_pinset(), wb_data_wid=64)
+    dut = JTAG(test_pinset(), wb_data_wid=64, domain="sync")
     dut.stop = False
 
     # rather than the client access the JTAG bus directly
@@ -236,6 +236,8 @@ if __name__ == '__main__':
         cdut.c = JTAGClient()
         dut.s.get_connection()
     else:
+        print ("running server only as requested, use openocd remote to test")
+        sys.stdout.flush()
         dut.s.get_connection(None) # block waiting for connection
 
     # take copy of ir_width and scan_len
@@ -255,8 +257,6 @@ if __name__ == '__main__':
     sim.add_sync_process(wrap(jtag_srv(dut))) # jtag server
     if len(sys.argv) != 2 or sys.argv[1] != 'server':
         sim.add_sync_process(wrap(jtag_sim(cdut, dut))) # actual jtag tester
-    else:
-        print ("running server only as requested, use openocd remote to test")
     sim.add_sync_process(wrap(dmi_sim(dut)))  # handles (pretends to be) DMI
 
     with sim.write_vcd("dmi2jtag_test_srv.vcd"):
index 3fb1c6cfe595d627506a941f87f5d39ab2bb5079..e2e737733c13af80b3e8f92bb57a2c0ccefc9cd1 100644 (file)
@@ -7,11 +7,11 @@ intended to comply with both the CompALU API and the nmutil Pipeline API
 
 The basic rules are:
 
-1) p.ready_o is asserted on the initial ("Idle") state, otherwise it keeps low.
-2) n.valid_o is asserted on the final ("Done") state, otherwise it keeps low.
-3) The FSM stays in the Idle state while p.valid_i is low, otherwise
+1) p.o_ready is asserted on the initial ("Idle") state, otherwise it keeps low.
+2) n.o_valid is asserted on the final ("Done") state, otherwise it keeps low.
+3) The FSM stays in the Idle state while p.i_valid is low, otherwise
    it accepts the input data and moves on.
-4) The FSM stays in the Done state while n.ready_i is low, otherwise
+4) The FSM stays in the Done state while n.i_ready is low, otherwise
    it releases the output data and goes back to the Idle state.
 
 """
@@ -44,8 +44,8 @@ class Shifter(Elaboratable):
     """Simple sequential shifter
 
     Prev port data:
-    * p.data_i.data:  value to be shifted
-    * p.data_i.shift: shift amount
+    * p.i_data.data:  value to be shifted
+    * p.i_data.shift: shift amount
     *                 When zero, no shift occurs.
     *                 On POWER, range is 0 to 63 for 32-bit,
     *                 and 0 to 127 for 64-bit.
@@ -55,11 +55,11 @@ class Shifter(Elaboratable):
     * op.sdir:       shift direction (0 = left, 1 = right)
 
     Next port data:
-    * n.data_o.data: shifted value
+    * n.o_data.data: shifted value
     """
     class PrevData:
         def __init__(self, width):
-            self.data = Signal(width, name="p_data_i")
+            self.data = Signal(width, name="p_i_data")
             self.shift = Signal(width, name="p_shift_i")
             self.ctx = Dummy()  # comply with CompALU API
 
@@ -68,7 +68,7 @@ class Shifter(Elaboratable):
 
     class NextData:
         def __init__(self, width):
-            self.data = Signal(width, name="n_data_o")
+            self.data = Signal(width, name="n_o_data")
 
         def _get_data(self):
             return [self.data]
@@ -77,14 +77,14 @@ class Shifter(Elaboratable):
         self.width = width
         self.p = PrevControl()
         self.n = NextControl()
-        self.p.data_i = Shifter.PrevData(width)
-        self.n.data_o = Shifter.NextData(width)
+        self.p.i_data = Shifter.PrevData(width)
+        self.n.o_data = Shifter.NextData(width)
 
         # more pieces to make this example class comply with the CompALU API
         self.op = CompFSMOpSubset(name="op")
-        self.p.data_i.ctx.op = self.op
-        self.i = self.p.data_i._get_data()
-        self.out = self.n.data_o._get_data()
+        self.p.i_data.ctx.op = self.op
+        self.i = self.p.i_data._get_data()
+        self.out = self.n.o_data._get_data()
 
     def elaborate(self, platform):
         m = Module()
@@ -115,8 +115,8 @@ class Shifter(Elaboratable):
         # build the data flow
         m.d.comb += [
             # connect input and output
-            shift_in.eq(self.p.data_i.data),
-            self.n.data_o.data.eq(shift_reg),
+            shift_in.eq(self.p.i_data.data),
+            self.n.o_data.data.eq(shift_reg),
             # generate shifted views of the register
             shift_left_by_1.eq(Cat(0, shift_reg[:-1])),
             shift_right_by_1.eq(Cat(shift_reg[1:], 0)),
@@ -152,15 +152,15 @@ class Shifter(Elaboratable):
         with m.FSM():
             with m.State("IDLE"):
                 m.d.comb += [
-                    # keep p.ready_o active on IDLE
-                    self.p.ready_o.eq(1),
+                    # keep p.o_ready active on IDLE
+                    self.p.o_ready.eq(1),
                     # keep loading the shift register and shift count
                     load.eq(1),
-                    next_count.eq(self.p.data_i.shift),
+                    next_count.eq(self.p.i_data.shift),
                 ]
                 # capture the direction bit as well
                 m.d.sync += direction.eq(self.op.sdir)
-                with m.If(self.p.valid_i):
+                with m.If(self.p.i_valid):
                     # Leave IDLE when data arrives
                     with m.If(next_count == 0):
                         # short-circuit for zero shift
@@ -178,9 +178,9 @@ class Shifter(Elaboratable):
                     # exit when shift counter goes to zero
                     m.next = "DONE"
             with m.State("DONE"):
-                # keep n.valid_o active while the data is not accepted
-                m.d.comb += self.n.valid_o.eq(1)
-                with m.If(self.n.ready_i):
+                # keep n.o_valid active while the data is not accepted
+                m.d.comb += self.n.o_valid.eq(1)
+                with m.If(self.n.i_ready):
                     # go back to IDLE when the data is accepted
                     m.next = "IDLE"
 
@@ -188,13 +188,13 @@ class Shifter(Elaboratable):
 
     def __iter__(self):
         yield self.op.sdir
-        yield self.p.data_i.data
-        yield self.p.data_i.shift
-        yield self.p.valid_i
-        yield self.p.ready_o
-        yield self.n.ready_i
-        yield self.n.valid_o
-        yield self.n.data_o.data
+        yield self.p.i_data.data
+        yield self.p.i_data.shift
+        yield self.p.i_valid
+        yield self.p.o_ready
+        yield self.n.i_ready
+        yield self.n.o_valid
+        yield self.n.o_data.data
 
     def ports(self):
         return list(self)
@@ -222,20 +222,20 @@ def test_shifter():
         {'comment': 'Shifter Demonstration'},
         ('prev port', [
             ('op__sdir', 'in'),
-            ('p_data_i[7:0]', 'in'),
+            ('p_i_data[7:0]', 'in'),
             ('p_shift_i[7:0]', 'in'),
             ({'submodule': 'p'}, [
-                ('p_valid_i', 'in'),
-                ('p_ready_o', 'out')])]),
+                ('p_i_valid', 'in'),
+                ('p_o_ready', 'out')])]),
         ('internal', [
             'fsm_state' if is_engine_pysim() else 'fsm_state[1:0]',
             'count[3:0]',
             'shift_reg[7:0]']),
         ('next port', [
-            ('n_data_o[7:0]', 'out'),
+            ('n_o_data[7:0]', 'out'),
             ({'submodule': 'n'}, [
-                ('n_valid_o', 'out'),
-                ('n_ready_i', 'in')])])]
+                ('n_o_valid', 'out'),
+                ('n_i_ready', 'in')])])]
 
     write_gtkw("test_shifter.gtkw", "test_shifter.vcd",
                gtkwave_desc,  gtkwave_style,
@@ -245,32 +245,32 @@ def test_shifter():
     sim.add_clock(1e-6)
 
     def send(data, shift, direction):
-        # present input data and assert valid_i
-        yield dut.p.data_i.data.eq(data)
-        yield dut.p.data_i.shift.eq(shift)
+        # present input data and assert i_valid
+        yield dut.p.i_data.data.eq(data)
+        yield dut.p.i_data.shift.eq(shift)
         yield dut.op.sdir.eq(direction)
-        yield dut.p.valid_i.eq(1)
+        yield dut.p.i_valid.eq(1)
         yield
-        # wait for p.ready_o to be asserted
-        while not (yield dut.p.ready_o):
+        # wait for p.o_ready to be asserted
+        while not (yield dut.p.o_ready):
             yield
-        # clear input data and negate p.valid_i
-        yield dut.p.valid_i.eq(0)
-        yield dut.p.data_i.data.eq(0)
-        yield dut.p.data_i.shift.eq(0)
+        # clear input data and negate p.i_valid
+        yield dut.p.i_valid.eq(0)
+        yield dut.p.i_data.data.eq(0)
+        yield dut.p.i_data.shift.eq(0)
         yield dut.op.sdir.eq(0)
 
     def receive(expected):
         # signal readiness to receive data
-        yield dut.n.ready_i.eq(1)
+        yield dut.n.i_ready.eq(1)
         yield
-        # wait for n.valid_o to be asserted
-        while not (yield dut.n.valid_o):
+        # wait for n.o_valid to be asserted
+        while not (yield dut.n.o_valid):
             yield
         # read result
-        result = yield dut.n.data_o.data
-        # negate n.ready_i
-        yield dut.n.ready_i.eq(0)
+        result = yield dut.n.o_data.data
+        # negate n.i_ready
+        yield dut.n.i_ready.eq(0)
         # check result
         assert result == expected
 
index dbe8465fbdff095736ad2a47332f1cab8172535b..459bbd951cb41a35e5f06089162e365fd8b03d9b 100644 (file)
@@ -9,7 +9,7 @@ A "real" integer ALU would place the answers onto the output bus after
 only one cycle (sync)
 """
 
-from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
+from nmigen import Elaboratable, Signal, Module, Const, Mux
 from nmigen.hdl.rec import Record, Layout
 from nmigen.cli import main
 from nmigen.cli import verilog, rtlil
@@ -28,6 +28,10 @@ from openpower.decoder.power_enums import MicrOp, Function, CryIn
 from soc.fu.alu.alu_input_record import CompALUOpSubset
 from soc.fu.cr.cr_input_record import CompCROpSubset
 
+from soc.fu.pipe_data import FUBaseData
+from soc.fu.alu.pipe_data import CommonPipeSpec
+from soc.fu.compunits.compunits import FunctionUnitBaseSingle
+
 import operator
 
 
@@ -105,42 +109,42 @@ class Dummy:
 class DummyALU(Elaboratable):
     def __init__(self, width):
         self.p = Dummy()  # make look like nmutil pipeline API
-        self.p.data_i = Dummy()
-        self.p.data_i.ctx = Dummy()
+        self.p.i_data = Dummy()
+        self.p.i_data.ctx = Dummy()
         self.n = Dummy()  # make look like nmutil pipeline API
-        self.n.data_o = Dummy()
-        self.p.valid_i = Signal()
-        self.p.ready_o = Signal()
-        self.n.ready_i = Signal()
-        self.n.valid_o = Signal()
+        self.n.o_data = Dummy()
+        self.p.i_valid = Signal()
+        self.p.o_ready = Signal()
+        self.n.i_ready = Signal()
+        self.n.o_valid = Signal()
         self.counter = Signal(4)
         self.op = CompCROpSubset()
         i = []
         i.append(Signal(width, name="i1"))
         i.append(Signal(width, name="i2"))
         i.append(Signal(width, name="i3"))
-        self.i = Array(i)
+        self.i = i
         self.a, self.b, self.c = i[0], i[1], i[2]
-        self.out = Array([Signal(width, name="alu_o")])
+        self.out = tuple([Signal(width, name="alu_o")])
         self.o = self.out[0]
         self.width = width
         # more "look like nmutil pipeline API"
-        self.p.data_i.ctx.op = self.op
-        self.p.data_i.a = self.a
-        self.p.data_i.b = self.b
-        self.p.data_i.c = self.c
-        self.n.data_o.o = self.o
+        self.p.i_data.ctx.op = self.op
+        self.p.i_data.a = self.a
+        self.p.i_data.b = self.b
+        self.p.i_data.c = self.c
+        self.n.o_data.o = self.o
 
     def elaborate(self, platform):
         m = Module()
 
         go_now = Signal(reset_less=True)  # testing no-delay ALU
 
-        with m.If(self.p.valid_i):
+        with m.If(self.p.i_valid):
             # input is valid. next check, if we already said "ready" or not
-            with m.If(~self.p.ready_o):
+            with m.If(~self.p.o_ready):
                 # we didn't say "ready" yet, so say so and initialise
-                m.d.sync += self.p.ready_o.eq(1)
+                m.d.sync += self.p.o_ready.eq(1)
 
                 m.d.sync += self.o.eq(self.a)
                 m.d.comb += go_now.eq(1)
@@ -149,14 +153,14 @@ class DummyALU(Elaboratable):
         with m.Else():
             # input says no longer valid, so drop ready as well.
             # a "proper" ALU would have had to sync in the opcode and a/b ops
-            m.d.sync += self.p.ready_o.eq(0)
+            m.d.sync += self.p.o_ready.eq(0)
 
         # ok so the counter's running: when it gets to 1, fire the output
         with m.If((self.counter == 1) | go_now):
             # set the output as valid if the recipient is ready for it
-            m.d.sync += self.n.valid_o.eq(1)
-        with m.If(self.n.ready_i & self.n.valid_o):
-            m.d.sync += self.n.valid_o.eq(0)
+            m.d.sync += self.n.o_valid.eq(1)
+        with m.If(self.n.i_ready & self.n.o_valid):
+            m.d.sync += self.n.o_valid.eq(0)
             # recipient said it was ready: reset back to known-good.
             m.d.sync += self.counter.eq(0)  # reset the counter
             m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
@@ -177,38 +181,86 @@ class DummyALU(Elaboratable):
     def ports(self):
         return list(self)
 
+#####################
+# converting even this dummy ALU over to the FunctionUnit RegSpecs API
+# which, errr, note that the regspecs are totally ignored below, but
+# at least the widths are all 64-bit so it's okay.
+#####################
+
+# input (and output) for logical initial stage (common input)
+
+
+class ALUInputData(FUBaseData):
+    regspec = [('INT', 'a', '0:63'),  # RA
+               ('INT', 'b', '0:63'),  # RB/immediate
+               ]
+
+    def __init__(self, pspec):
+        super().__init__(pspec, False)
+
+
+# output from ALU final stage
+class ALUOutputData(FUBaseData):
+    regspec = [('INT', 'o', '0:63'),        # RT
+               ]
+
+    def __init__(self, pspec):
+        super().__init__(pspec, True)
+
+
+# ALU pipe specification class
+class ALUPipeSpec(CommonPipeSpec):
+    regspec = (ALUInputData.regspec, ALUOutputData.regspec)
+    opsubsetkls = CompALUOpSubset
+
+
+class ALUFunctionUnit(FunctionUnitBaseSingle):
+    # class ALUFunctionUnit(FunctionUnitBaseMulti):
+    fnunit = Function.ALU
+
+    def __init__(self, idx, parent_pspec):
+        super().__init__(ALUPipeSpec, ALU, 1, parent_pspec)
+
 
 class ALU(Elaboratable):
     def __init__(self, width):
+        # XXX major temporary hack: attempting to convert
+        # ALU over to RegSpecs API, FunctionUnitBaseSingle passes in
+        # a regspec here which we can't cope with.  therefore, errr...
+        # just throw it away and set the width to 64
+        if not isinstance(width, int):
+            width = 64
+        # TODO, really this should just inherit from ControlBase it would
+        # be a lot less messy.
         self.p = Dummy()  # make look like nmutil pipeline API
-        self.p.data_i = Dummy()
-        self.p.data_i.ctx = Dummy()
+        self.p.i_data = Dummy()
+        self.p.i_data.ctx = Dummy()
         self.n = Dummy()  # make look like nmutil pipeline API
-        self.n.data_o = Dummy()
-        self.p.valid_i = Signal()
-        self.p.ready_o = Signal()
-        self.n.ready_i = Signal()
-        self.n.valid_o = Signal()
+        self.n.o_data = Dummy()
+        self.p.i_valid = Signal()
+        self.p.o_ready = Signal()
+        self.n.i_ready = Signal()
+        self.n.o_valid = Signal()
         self.counter = Signal(4)
         self.op = CompALUOpSubset(name="op")
         i = []
         i.append(Signal(width, name="i1"))
         i.append(Signal(width, name="i2"))
-        self.i = Array(i)
+        self.i = i
         self.a, self.b = i[0], i[1]
         out = []
         out.append(Data(width, name="alu_o"))
         out.append(Data(width, name="alu_cr"))
-        self.out = Array(out)
+        self.out = tuple(out)
         self.o = self.out[0]
         self.cr = self.out[1]
         self.width = width
-        # more "look like nmutil pipeline API"
-        self.p.data_i.ctx.op = self.op
-        self.p.data_i.a = self.a
-        self.p.data_i.b = self.b
-        self.n.data_o.o = self.o
-        self.n.data_o.cr = self.cr
+        # more "look like nmutil ControlBase pipeline API" stuff
+        self.p.i_data.ctx.op = self.op
+        self.p.i_data.a = self.a
+        self.p.i_data.b = self.b
+        self.n.o_data.o = self.o
+        self.n.o_data.cr = self.cr
 
     def elaborate(self, platform):
         m = Module()
@@ -254,16 +306,16 @@ class ALU(Elaboratable):
         with m.If(go_now):
             # with a combinatorial, no-delay ALU, just pass through
             # the handshake signals to the other side
-            m.d.comb += self.p.ready_o.eq(self.n.ready_i)
-            m.d.comb += self.n.valid_o.eq(self.p.valid_i)
+            m.d.comb += self.p.o_ready.eq(self.n.i_ready)
+            m.d.comb += self.n.o_valid.eq(self.p.i_valid)
         with m.Else():
             # sequential ALU handshake:
-            # ready_o responds to valid_i, but only if the ALU is idle
-            m.d.comb += self.p.ready_o.eq(alu_idle)
-            # select the internally generated valid_o, above
-            m.d.comb += self.n.valid_o.eq(alu_done)
+            # o_ready responds to i_valid, but only if the ALU is idle
+            m.d.comb += self.p.o_ready.eq(alu_idle)
+            # select the internally generated o_valid, above
+            m.d.comb += self.n.o_valid.eq(alu_done)
 
-        # hold the ALU result until ready_o is asserted
+        # hold the ALU result until o_ready is asserted
         alu_r = Signal(self.width)
 
         # output masks
@@ -275,7 +327,7 @@ class ALU(Elaboratable):
         m.d.comb += self.cr.ok.eq(self.op.rc.rc)
 
         with m.If(alu_idle):
-            with m.If(self.p.valid_i):
+            with m.If(self.p.i_valid):
 
                 # as this is a "fake" pipeline, just grab the output right now
                 with m.If(self.op.insn_type == MicrOp.OP_ADD):
@@ -311,7 +363,7 @@ class ALU(Elaboratable):
                 with m.Else():
                     m.d.comb += go_now.eq(1)
 
-        with m.Elif(~alu_done | self.n.ready_i):
+        with m.Elif(~alu_done | self.n.i_ready):
             # decrement the counter while the ALU is neither idle nor finished
             m.d.sync += self.counter.eq(self.counter - 1)
 
@@ -337,10 +389,10 @@ class ALU(Elaboratable):
         yield self.a
         yield self.b
         yield from self.o.ports()
-        yield self.p.valid_i
-        yield self.p.ready_o
-        yield self.n.valid_o
-        yield self.n.ready_i
+        yield self.p.i_valid
+        yield self.p.o_ready
+        yield self.n.o_valid
+        yield self.n.i_ready
 
     def ports(self):
         return list(self)
@@ -362,22 +414,22 @@ class BranchOp(Elaboratable):
 class BranchALU(Elaboratable):
     def __init__(self, width):
         self.p = Dummy()  # make look like nmutil pipeline API
-        self.p.data_i = Dummy()
-        self.p.data_i.ctx = Dummy()
+        self.p.i_data = Dummy()
+        self.p.i_data.ctx = Dummy()
         self.n = Dummy()  # make look like nmutil pipeline API
-        self.n.data_o = Dummy()
-        self.p.valid_i = Signal()
-        self.p.ready_o = Signal()
-        self.n.ready_i = Signal()
-        self.n.valid_o = Signal()
+        self.n.o_data = Dummy()
+        self.p.i_valid = Signal()
+        self.p.o_ready = Signal()
+        self.n.i_ready = Signal()
+        self.n.o_valid = Signal()
         self.counter = Signal(4)
         self.op = Signal(2)
         i = []
         i.append(Signal(width, name="i1"))
         i.append(Signal(width, name="i2"))
-        self.i = Array(i)
+        self.i = i
         self.a, self.b = i[0], i[1]
-        self.out = Array([Signal(width)])
+        self.out = tuple([Signal(width)])
         self.o = self.out[0]
         self.width = width
 
@@ -399,11 +451,11 @@ class BranchALU(Elaboratable):
             ]
 
         go_now = Signal(reset_less=True)  # testing no-delay ALU
-        with m.If(self.p.valid_i):
+        with m.If(self.p.i_valid):
             # input is valid. next check, if we already said "ready" or not
-            with m.If(~self.p.ready_o):
+            with m.If(~self.p.o_ready):
                 # we didn't say "ready" yet, so say so and initialise
-                m.d.sync += self.p.ready_o.eq(1)
+                m.d.sync += self.p.o_ready.eq(1)
 
                 # as this is a "fake" pipeline, just grab the output right now
                 with m.Switch(self.op):
@@ -416,14 +468,14 @@ class BranchALU(Elaboratable):
         with m.Else():
             # input says no longer valid, so drop ready as well.
             # a "proper" ALU would have had to sync in the opcode and a/b ops
-            m.d.sync += self.p.ready_o.eq(0)
+            m.d.sync += self.p.o_ready.eq(0)
 
         # ok so the counter's running: when it gets to 1, fire the output
         with m.If((self.counter == 1) | go_now):
             # set the output as valid if the recipient is ready for it
-            m.d.sync += self.n.valid_o.eq(1)
-        with m.If(self.n.ready_i & self.n.valid_o):
-            m.d.sync += self.n.valid_o.eq(0)
+            m.d.sync += self.n.o_valid.eq(1)
+        with m.If(self.n.i_ready & self.n.o_valid):
+            m.d.sync += self.n.o_valid.eq(0)
             # recipient said it was ready: reset back to known-good.
             m.d.sync += self.counter.eq(0)  # reset the counter
             m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
@@ -449,28 +501,28 @@ def run_op(dut, a, b, op, inv_a=0):
     yield dut.b.eq(b)
     yield dut.op.insn_type.eq(op)
     yield dut.op.invert_in.eq(inv_a)
-    yield dut.n.ready_i.eq(0)
-    yield dut.p.valid_i.eq(1)
-    yield dut.n.ready_i.eq(1)
+    yield dut.n.i_ready.eq(0)
+    yield dut.p.i_valid.eq(1)
+    yield dut.n.i_ready.eq(1)
     yield
 
     # wait for the ALU to accept our input data
-    while not (yield dut.p.ready_o):
+    while not (yield dut.p.o_ready):
         yield
 
-    yield dut.p.valid_i.eq(0)
+    yield dut.p.i_valid.eq(0)
     yield dut.a.eq(0)
     yield dut.b.eq(0)
     yield dut.op.insn_type.eq(0)
     yield dut.op.invert_in.eq(0)
 
     # wait for the ALU to present the output data
-    while not (yield dut.n.valid_o):
+    while not (yield dut.n.o_valid):
         yield
 
     # latch the result and lower read_i
     result = yield dut.o.data
-    yield dut.n.ready_i.eq(0)
+    yield dut.n.i_ready.eq(0)
 
     return result
 
@@ -520,21 +572,21 @@ def test_alu_parallel():
     sim.add_clock(1e-6)
 
     def send(a, b, op, inv_a=0, rc=0):
-        # present input data and assert valid_i
+        # present input data and assert i_valid
         yield dut.a.eq(a)
         yield dut.b.eq(b)
         yield dut.op.insn_type.eq(op)
         yield dut.op.invert_in.eq(inv_a)
         yield dut.op.rc.rc.eq(rc)
-        yield dut.p.valid_i.eq(1)
+        yield dut.p.i_valid.eq(1)
         yield
-        # wait for ready_o to be asserted
-        while not (yield dut.p.ready_o):
+        # wait for o_ready to be asserted
+        while not (yield dut.p.o_ready):
             yield
-        # clear input data and negate valid_i
+        # clear input data and negate i_valid
         # if send is called again immediately afterwards, there will be no
         # visible transition (they will not be negated, after all)
-        yield dut.p.valid_i.eq(0)
+        yield dut.p.i_valid.eq(0)
         yield dut.a.eq(0)
         yield dut.b.eq(0)
         yield dut.op.insn_type.eq(0)
@@ -543,18 +595,18 @@ def test_alu_parallel():
 
     def receive():
         # signal readiness to receive data
-        yield dut.n.ready_i.eq(1)
+        yield dut.n.i_ready.eq(1)
         yield
-        # wait for valid_o to be asserted
-        while not (yield dut.n.valid_o):
+        # wait for o_valid to be asserted
+        while not (yield dut.n.o_valid):
             yield
         # read results
         result = yield dut.o.data
         cr = yield dut.cr.data
-        # negate ready_i
+        # negate i_ready
         # if receive is called again immediately afterwards, there will be no
         # visible transition (it will not be negated, after all)
-        yield dut.n.ready_i.eq(0)
+        yield dut.n.i_ready.eq(0)
         return result, cr
 
     def producer():
@@ -650,10 +702,10 @@ def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
         'i2[15:0]',
         'op__insn_type' if pysim else 'op__insn_type[6:0]',
         'op__invert_in',
-        'valid_i',
-        'ready_o',
-        'valid_o',
-        'ready_i',
+        'i_valid',
+        'o_ready',
+        'o_valid',
+        'i_ready',
         'alu_o[15:0]',
         'alu_o_ok',
         'alu_cr[15:0]',
index 50ee1367cc84301bcf9cecf0f6cae51d13273227..784b9a8151fac8578dca8a1b95404480d9e509f3 100644 (file)
@@ -1,7 +1,8 @@
 # TODO: replace with Memory at some point
-from nmigen import Elaboratable, Signal, Array, Module
+from nmigen import Elaboratable, Signal, Array, Module, Memory
 from nmutil.util import Display
 
+
 class CacheRam(Elaboratable):
 
     def __init__(self, ROW_BITS=16, WIDTH = 64, TRACE=True, ADD_BUF=False,
@@ -28,30 +29,52 @@ class CacheRam(Elaboratable):
         ADD_BUF = self.ADD_BUF
         SIZE = 2**ROW_BITS
      
-        ram = Array(Signal(WIDTH) for i in range(SIZE))
+        # set up the Cache RAM Memory and create one read and one write port
+        # the read port is *not* transparent (does not pass write-thru-read)
         #attribute ram_style of ram : signal is "block";
-     
-        rd_data0 = Signal(WIDTH)
-     
+        ram = Memory(depth=SIZE, width=WIDTH,
+                     attrs={'syn_ramstyle': "block_ram"})
+        m.submodules.rdport = rdport = ram.read_port(transparent=False)
+        m.submodules.wrport = wrport = ram.write_port(granularity=8)
+
         with m.If(TRACE):
             with m.If(self.wr_sel.bool()):
                 sync += Display( "write ramno %d a: %%x "
                                  "sel: %%x dat: %%x" % self.ram_num,
                                 self.wr_addr,
                                 self.wr_sel, self.wr_data)
-        for i in range(WIDTH//8):
-            lbit = i * 8;
-            mbit = lbit + 8;
-            with m.If(self.wr_sel[i]):
-                sync += ram[self.wr_addr][lbit:mbit].eq(self.wr_data[lbit:mbit])
-        with m.If(self.rd_en):
-            sync += rd_data0.eq(ram[self.rd_addr])
-            if TRACE:
+
+        # read data output and a latched copy. behaves like microwatt cacheram
+        rd_data0 = Signal(WIDTH)
+        rd_data0l = Signal(WIDTH)
+
+        # delay on read address/en
+        rd_delay = Signal()
+        rd_delay_addr = Signal.like(self.rd_addr)
+        sync += rd_delay_addr.eq(self.rd_addr)
+        sync += rd_delay.eq(self.rd_en)
+
+        # write port
+        comb += wrport.addr.eq(self.wr_addr)
+        comb += wrport.en.eq(self.wr_sel)
+        comb += wrport.data.eq(self.wr_data)
+
+        # read port (include a latch on the output, for microwatt compatibility)
+        comb += rdport.addr.eq(self.rd_addr)
+        comb += rdport.en.eq(self.rd_en)
+        with m.If(rd_delay):
+            comb += rd_data0.eq(rdport.data)
+            sync += rd_data0l.eq(rd_data0)   # preserve latched data
+        with m.Else():
+            comb += rd_data0.eq(rd_data0l)   # output latched (last-read)
+
+        if TRACE:
+            with m.If(rd_delay):
                 sync += Display("read ramno %d a: %%x dat: %%x" % self.ram_num,
-                                self.rd_addr, ram[self.rd_addr])
+                                rd_delay_addr, rd_data0)
                 pass
 
-
+        # extra delay requested?
         if ADD_BUF:
             sync += self.rd_data_o.eq(rd_data0)
         else:
index 05539cd485ac833266595e23a92d7034c90d5d67..ff05b48f5717b642ee51d96d830391b9e21967ab 100644 (file)
@@ -61,9 +61,9 @@ class ComputationUnitNoDelay(Elaboratable):
         self.src2_i = Signal(rwid, reset_less=True)  # oper2 in
 
         self.busy_o = Signal(reset_less=True)  # fn busy out
-        self.data_o = Signal(rwid, reset_less=True)  # Dest out
+        self.o_data = Signal(rwid, reset_less=True)  # Dest out
         self.rd_rel_o = Signal(reset_less=True)  # release src1/src2 request
-        # release request out (valid_o)
+        # release request out (o_valid)
         self.req_rel_o = Signal(reset_less=True)
         self.done_o = self.req_rel_o  # 'normalise' API
 
@@ -133,21 +133,21 @@ class ComputationUnitNoDelay(Elaboratable):
         # NOTE: this spells TROUBLE if the ALU isn't ready!
         # go_read is only valid for one clock!
         with m.If(self.go_rd_i):                     # src operands ready, GO!
-            with m.If(~self.alu.p_ready_o):          # no ACK yet
-                m.d.comb += self.alu.p_valid_i.eq(1)  # so indicate valid
+            with m.If(~self.alu.p_o_ready):          # no ACK yet
+                m.d.comb += self.alu.p_i_valid.eq(1)  # so indicate valid
 
         # only proceed if ALU says its output is valid
-        with m.If(self.alu.n_valid_o):
+        with m.If(self.alu.n_o_valid):
             # when ALU ready, write req release out. waits for shadow
             m.d.comb += self.req_rel_o.eq(req_l.q & busy_o & self.shadown_i)
             # when output latch is ready, and ALU says ready, accept ALU output
             with m.If(self.req_rel_o & self.go_wr_i):
                 # tells ALU "thanks got it"
-                m.d.comb += self.alu.n_ready_i.eq(1)
+                m.d.comb += self.alu.n_i_ready.eq(1)
 
         # output the data from the latch on go_write
         with m.If(self.go_wr_i):
-            m.d.comb += self.data_o.eq(data_r)
+            m.d.comb += self.o_data.eq(data_r)
 
         return m
 
@@ -163,7 +163,7 @@ class ComputationUnitNoDelay(Elaboratable):
         yield self.busy_o
         yield self.rd_rel_o
         yield self.req_rel_o
-        yield self.data_o
+        yield self.o_data
 
     def ports(self):
         return list(self)
@@ -192,18 +192,18 @@ def op_sim(dut, a, b, op, inv_a=0, imm=0, imm_ok=0):
     yield
     yield dut.go_rd_i.eq(0)
     req_rel_o = yield dut.req_rel_o
-    result = yield dut.data_o
+    result = yield dut.o_data
     print("req_rel", req_rel_o, result)
     while True:
         req_rel_o = yield dut.req_rel_o
-        result = yield dut.data_o
+        result = yield dut.o_data
         print("req_rel", req_rel_o, result)
         if req_rel_o:
             break
         yield
     yield dut.go_wr_i.eq(1)
     yield
-    result = yield dut.data_o
+    result = yield dut.o_data
     print("result", result)
     yield dut.go_wr_i.eq(0)
     yield
index d7e32f28c556e76aff9be146ce280eba9745bb09..23ef36ea03077bef1b73e75c1d4b169b454ee99b 100644 (file)
@@ -106,10 +106,12 @@ class CompUnitRecord(RegSpec, RecordObject):
         # output (busy/done)
         self.busy_o = Signal(name="cu_busy_o", reset_less=True)  # fn busy out
         self.done_o = Signal(name="cu_done_o", reset_less=True)
+        self.alu_done_o = Signal(name="cu_alu_done_o", reset_less=True)
 
 
 class MultiCompUnit(RegSpecALUAPI, Elaboratable):
-    def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1, name=None):
+    def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1, name=None,
+                       sync_rw=True):
         """MultiCompUnit
 
         * :rwid:        width of register latches (TODO: allocate per regspec)
@@ -119,6 +121,7 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         * :n_dst:       number of destination operands
         """
         RegSpecALUAPI.__init__(self, rwid, alu)
+        self.sync_rw = sync_rw
         self.alu_name = name or "alu"
         self.opsubsetkls = opsubsetkls
         self.cu = cu = CompUnitRecord(opsubsetkls, rwid, n_src, n_dst,
@@ -143,6 +146,7 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         self.wr = cu.wr
         self.rdmaskn = cu.rdmaskn
         self.wrmask = cu.wrmask
+        self.alu_done_o = cu.alu_done_o
         self.go_rd_i = self.rd.go_i  # temporary naming
         self.go_wr_i = self.wr.go_i  # temporary naming
         self.rd_rel_o = self.rd.rel_o  # temporary naming
@@ -157,7 +161,7 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
 
         self.busy_o = cu.busy_o
         self.dest = cu._dest
-        self.data_o = self.dest[0]  # Dest out
+        self.o_data = self.dest[0]  # Dest out
         self.done_o = cu.done_o
 
     def _mux_op(self, m, sl, op_is_imm, imm, i):
@@ -174,7 +178,20 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
 
     def elaborate(self, platform):
         m = Module()
-        setattr(m.submodules, self.alu_name, self.alu)
+        if self.sync_rw:
+            rw_domain = m.d.sync
+        else:
+            rw_domain = m.d.comb
+        # generate a pulse on system reset, to reset any latches, if needed
+        system_reset = Signal(reset=1)
+        m.d.sync += system_reset.eq(0)
+
+        # add the ALU to the MultiCompUnit only if it is a "real" ALU
+        # see AllFunctionUnits as to why: a FunctionUnitBaseMulti
+        # only has one "real" ALU but multiple pseudo front-ends,
+        # aka "ReservationStations" (ALUProxy "fronts")
+        if isinstance(self.alu, Elaboratable):
+            setattr(m.submodules, self.alu_name, self.alu)
         m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
         m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
         m.submodules.req_l = req_l = SRLatch(False, self.n_dst, name="req")
@@ -185,18 +202,18 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         # ALU only proceeds when all src are ready.  rd_rel_o is delayed
         # so combine it with go_rd_i.  if all bits are set we're good
         all_rd = Signal(reset_less=True)
-        m.d.comb += all_rd.eq(self.busy_o & rok_l.q &
+        m.d.comb += all_rd.eq(self.busy_o & # rok_l.q & # XXX LOOP
                               (((~self.rd.rel_o) | self.rd.go_i).all()))
 
         # generate read-done pulse
         all_rd_pulse = Signal(reset_less=True)
-        m.d.comb += all_rd_pulse.eq(rising_edge(m, all_rd))
+        m.d.comb += all_rd_pulse.eq(rising_edge(m, all_rd)) # XXX LOOP
 
         # create rising pulse from alu valid condition.
-        alu_done = Signal(reset_less=True)
+        alu_done = self.cu.alu_done_o
         alu_pulse = Signal(reset_less=True)
         alu_pulsem = Signal(self.n_dst, reset_less=True)
-        m.d.comb += alu_done.eq(self.alu.n.valid_o)
+        m.d.comb += alu_done.eq(self.alu.n.o_valid)
         m.d.comb += alu_pulse.eq(rising_edge(m, alu_done))
         m.d.comb += alu_pulsem.eq(Repl(alu_pulse, self.n_dst))
 
@@ -210,16 +227,14 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         # is enough, when combined with when read-phase is done (rst_l.q)
         wr_any = Signal(reset_less=True)
         req_done = Signal(reset_less=True)
-        m.d.comb += self.done_o.eq(self.busy_o &
-                                   ~((self.wr.rel_o & ~self.wrmask).bool()))
+        m.d.comb += self.done_o.eq(self.busy_o & ~(self.wr.rel_o).bool())
         m.d.comb += wr_any.eq(self.wr.go_i.bool() | prev_wr_go.bool())
-        m.d.comb += req_done.eq(wr_any & ~self.alu.n.ready_i &
-                                ((req_l.q & self.wrmask) == 0))
+        m.d.comb += req_done.eq(wr_any & ~self.alu.n.i_ready & (req_l.q == 0))
         # argh, complicated hack: if there are no regs to write,
         # instead of waiting for regs that are never going to happen,
         # we indicate "done" when the ALU is "done"
         with m.If((self.wrmask == 0) &
-                  self.alu.n.ready_i & self.alu.n.valid_o & self.busy_o):
+                  self.alu.n.i_ready & self.alu.n.o_valid & self.busy_o):
             m.d.comb += req_done.eq(1)
 
         # shadow/go_die
@@ -230,27 +245,30 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         m.d.comb += reset.eq(req_done | self.go_die_i)
         m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
         m.d.comb += reset_w.eq(self.wr.go_i | Repl(self.go_die_i, self.n_dst))
-        m.d.comb += reset_r.eq(self.rd.go_i | Repl(self.go_die_i, self.n_src))
+        m.d.comb += reset_r.eq(self.rd.go_i | Repl(rst_r, self.n_src))
 
         # read-done,wr-proceed latch
-        m.d.sync += rok_l.s.eq(self.issue_i)  # set up when issue starts
-        m.d.sync += rok_l.r.eq(self.alu.n.valid_o & self.busy_o)  # ALU done
+        rw_domain += rok_l.s.eq(self.issue_i)  # set up when issue starts
+        rw_domain += rok_l.r.eq(self.alu.n.o_valid & self.busy_o) # ALUdone LOOP
 
         # wr-done, back-to-start latch
-        m.d.sync += rst_l.s.eq(all_rd)     # set when read-phase is fully done
-        m.d.sync += rst_l.r.eq(rst_r)        # *off* on issue
+        rw_domain += rst_l.s.eq(all_rd)     # set when read-phase is fully done
+        rw_domain += rst_l.r.eq(rst_r)        # *off* on issue
 
         # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
         m.d.sync += opc_l.s.eq(self.issue_i)       # set on issue
         m.d.sync += opc_l.r.eq(req_done)  # reset on ALU
 
-        # src operand latch (not using go_wr_i)
-        m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src))
+        # src operand latch (not using go_wr_i) ANDed with rdmask
+        rdmaskn = Signal(self.n_src)
+        latchregister(m, self.rdmaskn, rdmaskn, self.issue_i, name="rdmask_l")
+        m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src) & ~rdmaskn)
         m.d.sync += src_l.r.eq(reset_r)
 
         # dest operand latch (not using issue_i)
-        m.d.sync += req_l.s.eq(alu_pulsem & self.wrmask)
-        m.d.sync += req_l.r.eq(reset_w | prev_wr_go)
+        rw_domain += req_l.s.eq(alu_pulsem & self.wrmask)
+        m.d.comb += req_l.r.eq(reset_w | prev_wr_go |
+                               Repl(system_reset, self.n_dst))
 
         # pass operation to the ALU (sync: plenty time to wait for src reads)
         op = self.get_op()
@@ -264,20 +282,27 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
             name = "data_r%d" % i
             lro = self.get_out(i)
             ok = Const(1, 1)
+            data_r_ok = Const(1, 1)
             if isinstance(lro, Record):
+                print("wr fields", i, lro, lro.fields)
                 data_r = Record.like(lro, name=name)
-                print("wr fields", i, lro, data_r.fields)
                 # bye-bye abstract interface design..
-                fname = find_ok(data_r.fields)
+                fname = find_ok(lro.fields)
                 if fname:
                     ok = getattr(lro, fname)
+                    data_r_ok = getattr(data_r, fname)
+                # write-ok based on incoming output *and* whether the latched
+                # data was ok.
+                # XXX fails - wrok.append((ok|data_r_ok) & self.busy_o)
+                wrok.append(ok & self.busy_o)
             else:
-                data_r = Signal.like(lro, name=name, reset_less=True)
-            wrok.append(ok & self.busy_o)
-            with m.If(alu_pulse):
-                m.d.sync += data_r.eq(lro)
+                data_r = Signal.like(lro, name=name)
+                # really should retire this but it's part of unit tests
+                wrok.append(ok & self.busy_o)
+            #latchregister(m, lro, data_r, ok & self.busy_o, name=name)
+            latchregister(m, lro, data_r, alu_pulse, name=name)
             with m.If(self.issue_i):
-                m.d.sync += data_r.eq(0)
+                m.d.comb += data_r.eq(0)
             drl.append(data_r)
 
         # ok, above we collated anything with an "ok" on the output side
@@ -315,7 +340,10 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         # create a latch/register for src1/src2 (even if it is a copy of imm)
         for i in range(self.n_src):
             src, alusrc, latch, _ = sl[i]
-            latchregister(m, src, alusrc, latch, name="src_r%d" % i)
+            reg = latchregister(m, src, alusrc, latch, name="src_r%d" % i)
+            # rdmask stops src latches from being set.  clear all if not busy
+            with m.If(~self.busy_o):
+                m.d.sync += reg.eq(0)
 
         # -----
         # ALU connection / interaction
@@ -323,16 +351,16 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
 
         # on a go_read, tell the ALU we're accepting data.
         m.submodules.alui_l = alui_l = SRLatch(False, name="alui")
-        m.d.comb += self.alu.p.valid_i.eq(alui_l.q)
-        m.d.sync += alui_l.r.eq(self.alu.p.ready_o & alui_l.q)
+        m.d.comb += self.alu.p.i_valid.eq(alui_l.q)
+        m.d.sync += alui_l.r.eq(self.alu.p.o_ready & alui_l.q)
         m.d.comb += alui_l.s.eq(all_rd_pulse)
 
         # ALU output "ready" side.  alu "ready" indication stays hi until
         # ALU says "valid".
         m.submodules.alu_l = alu_l = SRLatch(False, name="alu")
-        m.d.comb += self.alu.n.ready_i.eq(alu_l.q)
-        m.d.sync += alu_l.r.eq(self.alu.n.valid_o & alu_l.q)
-        m.d.comb += alu_l.s.eq(all_rd_pulse)
+        m.d.comb += self.alu.n.i_ready.eq(alu_l.q)
+        m.d.sync += alu_l.r.eq(self.alu.n.o_valid & alu_l.q)
+        m.d.comb += alu_l.s.eq(all_rd_pulse) # XXX LOOP
 
         # -----
         # outputs
@@ -343,12 +371,15 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         m.d.comb += self.busy_o.eq(opc_l.q)  # busy out
 
         # read-release gated by busy (and read-mask)
-        bro = Repl(self.busy_o, self.n_src)
-        m.d.comb += self.rd.rel_o.eq(src_l.q & bro & slg & ~self.rdmaskn)
+        if True: #self.sync_rw: - experiment (doesn't work)
+            bro = Repl(self.busy_o, self.n_src)
+        else:
+            bro = Repl(self.busy_o|self.issue_i, self.n_src)
+        m.d.comb += self.rd.rel_o.eq(src_l.q & bro & slg)
 
         # write-release gated by busy and by shadow (and write-mask)
         brd = Repl(self.busy_o & self.shadown_i, self.n_dst)
-        m.d.comb += self.wr.rel_o.eq(req_l.q & brd & self.wrmask)
+        m.d.comb += self.wr.rel_o.eq(req_l.q_int & brd)
 
         # output the data from the latch on go_write
         for i in range(self.n_dst):
@@ -372,7 +403,7 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         yield self.busy_o
         yield self.rd.rel_o
         yield self.wr.rel_o
-        yield self.data_o
+        yield self.o_data
 
     def ports(self):
         return list(self)
index 7dfdb15ecb45a0f41449418db612c3f774cddccc..2a54e51bf0caacddd0af4df62b7facb45c1a6395 100644 (file)
@@ -20,6 +20,11 @@ Loads are activated when Go_Write[0] is enabled.  The EA is computed,
 and (as long as there was no exception) the data comes out (at any
 time from the PortInterface), and is captured by the LDCompSTUnit.
 
+TODO: dcbz, yes, that's going to be complicated, has to be done
+ with great care, to detect the case when dcbz is set
+ and *not* expect to read any data, just the address.
+ so, wait for RA but not RB.
+
 Both LD and ST may request that the address be computed from summing
 operand1 (src[0]) with operand2 (src[1]) *or* by summing operand1 with
 the immediate (from the opcode).
@@ -53,6 +58,8 @@ the nested FSMs below are *combinatorial*).
 
     * A third FSM activates to cover ST.  it activates if op_is_st is true
 
+    * TODO document DCBZ (not complete yet)
+
     * The "overall" (fourth) FSM coordinates the progression and completion
       of the three other FSMs, firing "WR_RESET" which switches off "busy"
 
@@ -80,7 +87,7 @@ Terminology:
 
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl
+from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl, C
 from nmigen.hdl.rec import Record, Layout
 
 from nmutil.latch import SRLatch, latchregister
@@ -96,6 +103,10 @@ from openpower.decoder.power_enums import MicrOp, Function, LDSTMode
 from soc.fu.ldst.ldst_input_record import CompLDSTOpSubset
 from openpower.decoder.power_decoder2 import Data
 from openpower.consts import MSR
+from soc.config.test.test_loadstore import TestMemPspec
+
+# for debugging dcbz
+from nmutil.util import Display
 
 
 # TODO: LDSTInputData and LDSTOutputData really should be used
@@ -137,7 +148,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
     Data (outputs)
     --------------
-    * :data_o:  Dest out (LD)          - managed by wr[0] go/req
+    * :o_data:  Dest out (LD)          - managed by wr[0] go/req
     * :addr_o:  Address out (LD or ST) - managed by wr[1] go/req
     * :exc_o:   Address/Data Exception occurred.  LD/ST must terminate
 
@@ -178,17 +189,17 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
     TODO: use one module for the byte-reverse as it's quite expensive in gates
     """
 
-    def __init__(self, pi=None, rwid=64, awid=48, opsubset=CompLDSTOpSubset,
+    def __init__(self, pi=None, rwid=64, awid=64, opsubset=CompLDSTOpSubset,
                  debugtest=False, name=None):
         super().__init__(rwid)
         self.awid = awid
         self.pi = pi
         self.cu = cu = LDSTCompUnitRecord(rwid, opsubset, name=name)
-        self.debugtest = debugtest
+        self.debugtest = debugtest # enable debug output for unit testing
 
         # POWER-compliant LD/ST has index and update: *fixed* number of ports
         self.n_src = n_src = 3   # RA, RB, RT/RS
-        self.n_dst = n_dst = 2  # RA, RT/RS
+        self.n_dst = n_dst = 3  # RA, RT/RS, CR0
 
         # set up array of src and dest signals
         for i in range(n_src):
@@ -232,8 +243,9 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         self.oper_i = cu.oper_i
         self.src_i = cu._src_i
 
-        self.data_o = Data(self.data_wid, name="o")  # Dest1 out: RT
+        self.o_data = Data(self.data_wid, name="o")  # Dest1 out: RT
         self.addr_o = Data(self.data_wid, name="ea")  # Addr out: Update => RA
+        self.cr_o = Data(4, name="cr0")  # CR0 (for stdcx etc)
         self.exc_o = cu.exc_o
         self.done_o = cu.done_o
         self.busy_o = cu.busy_o
@@ -254,7 +266,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         #####################
         # latches for the FSM.
-        m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
+        m.submodules.opc_l = opc_l = SRLatch(sync=True, name="opc")
         m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
         m.submodules.alu_l = alu_l = SRLatch(sync=False, name="alu")
         m.submodules.adr_l = adr_l = SRLatch(sync=False, name="adr")
@@ -262,6 +274,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         m.submodules.sto_l = sto_l = SRLatch(sync=False, name="sto")
         m.submodules.wri_l = wri_l = SRLatch(sync=False, name="wri")
         m.submodules.upd_l = upd_l = SRLatch(sync=False, name="upd")
+        m.submodules.cr0_l = cr0_l = SRLatch(sync=False, name="cr0")
         m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
         m.submodules.lsd_l = lsd_l = SRLatch(sync=False, name="lsd") # done
 
@@ -271,6 +284,9 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         # opcode decode
         op_is_ld = Signal(reset_less=True)
         op_is_st = Signal(reset_less=True)
+        op_is_dcbz = Signal(reset_less=True)
+        op_is_st_or_dcbz = Signal(reset_less=True)
+        op_is_atomic = Signal(reset_less=True)
 
         # ALU/LD data output control
         alu_valid = Signal(reset_less=True)  # ALU operands are valid
@@ -281,6 +297,8 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         rda_any = Signal(reset_less=True)   # any read for address ops
         rd_done = Signal(reset_less=True)   # all *necessary* operands read
         wr_reset = Signal(reset_less=True)  # final reset condition
+        canceln = Signal(reset_less=True)   # cancel (active low)
+        store_done = Signal(reset_less=True) # store has been actioned
 
         # LD and ALU out
         alu_o = Signal(self.data_wid, reset_less=True)
@@ -293,26 +311,41 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         reset_o = Signal(reset_less=True)             # reset opcode
         reset_w = Signal(reset_less=True)             # reset write
         reset_u = Signal(reset_less=True)             # reset update
+        reset_c = Signal(reset_less=True)             # reset cr0
         reset_a = Signal(reset_less=True)             # reset adr latch
         reset_i = Signal(reset_less=True)             # issue|die (use a lot)
         reset_r = Signal(self.n_src, reset_less=True)  # reset src
         reset_s = Signal(reset_less=True)             # reset store
 
-        comb += reset_i.eq(issue_i | self.go_die_i)       # various
-        comb += reset_o.eq(self.done_o | self.go_die_i)      # opcode reset
-        comb += reset_w.eq(self.wr.go_i[0] | self.go_die_i)  # write reg 1
-        comb += reset_u.eq(self.wr.go_i[1] | self.go_die_i)  # update (reg 2)
-        comb += reset_s.eq(self.go_st_i | self.go_die_i)  # store reset
-        comb += reset_r.eq(self.rd.go_i | Repl(self.go_die_i, self.n_src))
-        comb += reset_a.eq(self.go_ad_i | self.go_die_i)
+        # end execution when a terminating condition is detected:
+        # - go_die_i: a speculative operation was cancelled
+        # - exc_o.happened: an exception has occurred
+        terminate = Signal()
+        comb += terminate.eq(self.go_die_i | self.exc_o.happened)
+
+        comb += reset_i.eq(issue_i | terminate)       # various
+        comb += reset_o.eq(self.done_o | terminate)      # opcode reset
+        comb += reset_w.eq(self.wr.go_i[0] | terminate)  # write reg 1
+        comb += reset_u.eq(self.wr.go_i[1] | terminate)  # update (reg 2)
+        comb += reset_c.eq(self.wr.go_i[2] | terminate)  # cr0 (reg 3)
+        comb += reset_s.eq(self.go_st_i | terminate)  # store reset
+        comb += reset_r.eq(self.rd.go_i | Repl(terminate, self.n_src))
+        comb += reset_a.eq(self.go_ad_i | terminate)
 
         p_st_go = Signal(reset_less=True)
         sync += p_st_go.eq(self.st.go_i)
 
         # decode bits of operand (latched)
         oper_r = CompLDSTOpSubset(name="oper_r")  # Dest register
-        comb += op_is_st.eq(oper_r.insn_type == MicrOp.OP_STORE)  # ST
-        comb += op_is_ld.eq(oper_r.insn_type == MicrOp.OP_LOAD)  # LD
+        comb += op_is_st.eq(oper_r.insn_type == MicrOp.OP_STORE)   # ST
+        comb += op_is_ld.eq(oper_r.insn_type == MicrOp.OP_LOAD)    # LD
+        comb += op_is_dcbz.eq(oper_r.insn_type == MicrOp.OP_DCBZ)  # DCBZ
+        comb += op_is_atomic.eq(oper_r.reserve) # atomic LR/SC
+        comb += op_is_st_or_dcbz.eq(op_is_st | op_is_dcbz)
+        # dcbz is special case of store
+        #uncomment if needed
+        #comb += Display("compldst_multi: op_is_dcbz = %i",
+        #                (oper_r.insn_type == MicrOp.OP_DCBZ))
         op_is_update = oper_r.ldst_mode == LDSTMode.update           # UPDATE
         op_is_cix = oper_r.ldst_mode == LDSTMode.cix           # cache-inhibit
         comb += self.load_mem_o.eq(op_is_ld & self.go_ad_i)
@@ -328,6 +361,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         #       - alu_l : looks after add of src1/2/imm (EA)
         #       - adr_l : waits for add (EA)
         #       - upd_l : waits for adr and Regfile (port 2)
+        #       - cr0_l : waits for Rc=1 and CR0 Regfile (port 3)
         #    - src_l[2] : ST
         # - lod_l       : waits for adr (EA) and for LD Data
         # - wri_l       : waits for LD Data and Regfile (port 1)
@@ -338,12 +372,13 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         # opcode latch - inverted so that busy resets to 0
         # note this MUST be sync so as to avoid a combinatorial loop
         # between busy_o and issue_i on the reset latch (rst_l)
-        sync += opc_l.s.eq(issue_i)  # XXX NOTE: INVERTED FROM book!
-        sync += opc_l.r.eq(reset_o)  # XXX NOTE: INVERTED FROM book!
+        comb += opc_l.s.eq(issue_i)  # XXX NOTE: INVERTED FROM book!
+        comb += opc_l.r.eq(reset_o)  # XXX NOTE: INVERTED FROM book!
 
         # src operand latch
-        sync += src_l.s.eq(Repl(issue_i, self.n_src))
+        sync += src_l.s.eq(Repl(issue_i, self.n_src) & ~self.rdmaskn)
         sync += src_l.r.eq(reset_r)
+        #### sync += Display("reset_r = %i",reset_r)
 
         # alu latch.  use sync-delay between alu_ok and valid to generate pulse
         comb += alu_l.s.eq(reset_i)
@@ -365,12 +400,17 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
                             #self.done_o | (self.pi.busy_o & op_is_update),
                                           self.n_dst))
 
+        # CR0 operand latch (CR0 written to reg 3 if Rc=1)
+        op_is_rc1 = self.oper_i.rc.rc & self.oper_i.rc.ok
+        comb += cr0_l.s.eq(issue_i & op_is_rc1)
+        sync += cr0_l.r.eq(reset_c)
+
         # update-mode operand latch (EA written to reg 2)
         sync += upd_l.s.eq(reset_i)
         sync += upd_l.r.eq(reset_u)
 
         # store latch
-        comb += sto_l.s.eq(addr_ok & op_is_st)
+        comb += sto_l.s.eq(addr_ok & op_is_st_or_dcbz)
         sync += sto_l.r.eq(reset_s | p_st_go)
 
         # ld/st done.  needed to stop LD/ST from activating repeatedly
@@ -384,13 +424,18 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         # create a latch/register for the operand
         with m.If(self.issue_i):
             sync += oper_r.eq(self.oper_i)
-        with m.If(self.done_o):
+        with m.If(self.done_o | terminate):
             sync += oper_r.eq(0)
 
-        # and for LD
+        # and for LD and store-done
         ldd_r = Signal(self.data_wid, reset_less=True)  # Dest register
         latchregister(m, ldd_o, ldd_r, ld_ok, name="ldo_r")
 
+        # store actioned, communicate through CR0 (for atomic LR/SC)
+        latchregister(m, self.pi.store_done.data, store_done,
+                         self.pi.store_done.ok,
+                         name="std_r")
+
         # and for each input from the incoming src operands
         srl = []
         for i in range(self.n_src):
@@ -418,7 +463,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         # now do the ALU addr add: one cycle, and say "ready" (next cycle, too)
         comb += alu_o.eq(src1_or_z + src2_or_imm)  # actual EA
-        m.d.sync += alu_ok.eq(alu_valid)             # keep ack in sync with EA
+        m.d.sync += alu_ok.eq(alu_valid & canceln) # keep ack in sync with EA
 
         ############################
         # Control Signal calculation
@@ -429,15 +474,16 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         # 1st operand read-request only when zero not active
         # 2nd operand only needed when immediate is not active
-        slg = Cat(op_is_z, op_is_imm)
+        slg = Cat(op_is_z, op_is_imm) #is this correct ?
         bro = Repl(self.busy_o, self.n_src)
-        comb += self.rd.rel_o.eq(src_l.q & bro & ~slg & ~self.rdmaskn)
+        comb += self.rd.rel_o.eq(src_l.q & bro & ~slg)
 
         # note when the address-related read "go" signals are active
         comb += rda_any.eq(self.rd.go_i[0] | self.rd.go_i[1])
 
         # alu input valid when 1st and 2nd ops done (or imm not active)
-        comb += alu_valid.eq(busy_o & ~(self.rd.rel_o[0] | self.rd.rel_o[1]))
+        comb += alu_valid.eq(busy_o & ~(self.rd.rel_o[0] | self.rd.rel_o[1]) &
+                             canceln)
 
         # 3rd operand only needed when operation is a store
         comb += self.rd.rel_o[2].eq(src_l.q[2] & busy_o & op_is_st)
@@ -449,28 +495,33 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         comb += self.adr_rel_o.eq(alu_valid & adr_l.q & busy_o)
 
         # the write/store (etc) all must be cancelled if an exception occurs
-        cancel = Signal(reset_less=True)
-        comb += cancel.eq(self.exc_o.happened | self.shadown_i)
+        # note: cancel is active low, like shadown_i,
+        #       while exc_o.happpened is active high
+        comb += canceln.eq(~self.exc_o.happened & self.shadown_i)
 
         # store release when st ready *and* all operands read (and no shadow)
-        comb += self.st.rel_o.eq(sto_l.q & busy_o & rd_done & op_is_st &
-                               cancel)
+        # dcbz is special case of store -- TODO verify shadows
+        comb += self.st.rel_o.eq(sto_l.q & busy_o & rd_done & op_is_st_or_dcbz &
+                               canceln)
 
         # request write of LD result.  waits until shadow is dropped.
         comb += self.wr.rel_o[0].eq(rd_done & wri_l.q & busy_o & lod_l.qn &
-                                  op_is_ld & cancel)
+                                  op_is_ld & canceln)
 
         # request write of EA result only in update mode
         comb += self.wr.rel_o[1].eq(upd_l.q & busy_o & op_is_update &
-                                  alu_valid & cancel)
+                                  alu_valid & canceln)
+
+        # request write of CR0 result only in reserve and Rc=1
+        comb += self.wr.rel_o[2].eq(cr0_l.q & busy_o & op_is_atomic &
+                                  alu_valid & canceln)
 
         # provide "done" signal: select req_rel for non-LD/ST, adr_rel for LD/ST
         comb += wr_any.eq(self.st.go_i | p_st_go |
-                          self.wr.go_i[0] | self.wr.go_i[1])
-        comb += wr_reset.eq(rst_l.q & busy_o & cancel &
-                            ~(self.st.rel_o | self.wr.rel_o[0] |
-                              self.wr.rel_o[1]) &
-                            (lod_l.qn | op_is_st)
+                          self.wr.go_i.bool())
+        comb += wr_reset.eq(rst_l.q & busy_o & canceln &
+                            ~(self.st.rel_o | self.wr.rel_o.bool()) &
+                            (lod_l.qn | op_is_st_or_dcbz)
                             )
         comb += self.done_o.eq(wr_reset & (~self.pi.busy_o | op_is_ld))
 
@@ -478,18 +529,27 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         # Data/Address outputs
 
         # put the LD-output register directly onto the output bus on a go_write
-        comb += self.data_o.data.eq(self.dest[0])
+        comb += self.o_data.data.eq(self.dest[0])
+        comb += self.o_data.ok.eq(self.wr.rel_o[0])
         with m.If(self.wr.go_i[0]):
             comb += self.dest[0].eq(ldd_r)
 
         # "update" mode, put address out on 2nd go-write
         comb += self.addr_o.data.eq(self.dest[1])
+        comb += self.addr_o.ok.eq(self.wr.rel_o[1])
         with m.If(op_is_update & self.wr.go_i[1]):
             comb += self.dest[1].eq(addr_r)
 
+        # fun-fun-fun, calculate CR0 when Rc=1 requested.
+        cr0 = self.dest[2]
+        comb += self.cr_o.data.eq(cr0)
+        comb += self.cr_o.ok.eq(self.wr.rel_o[2])
+        with m.If(cr0_l.q):
+            comb += cr0.eq(Cat(C(0, 1), store_done, C(0, 2)))
+
         # need to look like MultiCompUnit: put wrmask out.
         # XXX may need to make this enable only when write active
-        comb += self.wrmask.eq(bro & Cat(op_is_ld, op_is_update))
+        comb += self.wrmask.eq(bro & Cat(op_is_ld, op_is_update, cr0_l.q))
 
         ###########################
         # PortInterface connections
@@ -497,15 +557,29 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         # connect to LD/ST PortInterface.
         comb += pi.is_ld_i.eq(op_is_ld & busy_o)  # decoded-LD
-        comb += pi.is_st_i.eq(op_is_st & busy_o)  # decoded-ST
+        comb += pi.is_nc.eq(op_is_cix & busy_o)  # cache-inhibited
+        comb += pi.is_st_i.eq(op_is_st_or_dcbz & busy_o)  # decoded-ST
+        comb += pi.is_dcbz_i.eq(op_is_dcbz & busy_o)  # decoded-DCBZ
+        comb += pi.reserve.eq(oper_r.reserve & busy_o)  # atomic LR/SC
         comb += pi.data_len.eq(oper_r.data_len)  # data_len
         # address: use sync to avoid long latency
         sync += pi.addr.data.eq(addr_r)           # EA from adder
+        with m.If(op_is_dcbz):
+            sync += Display("LDSTCompUnit.DCBZ: EA from adder %x", addr_r)
+
         sync += pi.addr.ok.eq(alu_ok & lsd_l.q)  # "do address stuff" (once)
         comb += self.exc_o.eq(pi.exc_o)  # exception occurred
         comb += addr_ok.eq(self.pi.addr_ok_o)  # no exc, address fine
-        # connect MSR.PR for priv/virt operation
-        comb += pi.msr_pr.eq(oper_r.msr[MSR.PR])
+        # connect MSR.PR etc. for priv/virt operation
+        comb += pi.priv_mode.eq(~oper_r.msr[MSR.PR])
+        comb += pi.virt_mode.eq(oper_r.msr[MSR.DR])
+        comb += pi.mode_32bit.eq(~oper_r.msr[MSR.SF])
+        with m.If(self.issue_i): # display this only once
+            sync += Display("LDSTCompUnit: oper_r.msr %x pr=%x dr=%x sf=%x",
+                                      oper_r.msr,
+                                      oper_r.msr[MSR.PR],
+                                      oper_r.msr[MSR.DR],
+                                      oper_r.msr[MSR.SF])
 
         # byte-reverse on LD
         revnorev = Signal(64, reset_less=True)
@@ -539,6 +613,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
             comb += pi.st.data.eq(stdata_r)
         with m.Else():
             comb += pi.st.data.eq(op3)
+
         # store - data goes in based on go_st
         comb += pi.st.ok.eq(self.st.go_i)  # go store signals st data valid
 
@@ -549,9 +624,11 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         to LDSTOutputData o and o1 respectively.
         """
         if i == 0:
-            return self.data_o # LDSTOutputData.regspec o
+            return self.o_data # LDSTOutputData.regspec o
         if i == 1:
             return self.addr_o # LDSTOutputData.regspec o1
+        if i == 2:
+            return self.cr_o # LDSTOutputData.regspec cr_a
         # return self.dest[i]
 
     def get_fu_out(self, i):
@@ -572,8 +649,9 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         yield self.adr_rel_o
         yield self.sto_rel_o
         yield self.wr.rel_o
-        yield from self.data_o.ports()
+        yield from self.o_data.ports()
         yield from self.addr_o.ports()
+        yield from self.cr_o.ports()
         yield self.load_mem_o
         yield self.stwd_mem_o
 
@@ -603,9 +681,9 @@ def store(dut, src1, src2, src3, imm, imm_ok=True, update=False,
     yield dut.src1_i.eq(src1)
     yield dut.src2_i.eq(src2)
     yield dut.src3_i.eq(src3)
-    yield dut.oper_i.imm_data.imm.eq(imm)
+    yield dut.oper_i.imm_data.data.eq(imm)
     yield dut.oper_i.imm_data.ok.eq(imm_ok)
-    yield dut.oper_i.update.eq(update)
+    #guess: this one was removed -- yield dut.oper_i.update.eq(update)
     yield dut.issue_i.eq(1)
     yield
     yield dut.issue_i.eq(0)
@@ -620,9 +698,9 @@ def store(dut, src1, src2, src3, imm, imm_ok=True, update=False,
         if rel == active_rel:
             break
         yield
-    yield dut.rd.go.eq(active_rel)
+    yield dut.rd.go_i.eq(active_rel)
     yield
-    yield dut.rd.go.eq(0)
+    yield dut.rd.go_i.eq(0)
 
     yield from wait_for(dut.adr_rel_o, False, test1st=True)
     # yield from wait_for(dut.adr_rel_o)
@@ -659,7 +737,7 @@ def load(dut, src1, src2, imm, imm_ok=True, update=False, zero_a=False,
     yield dut.src1_i.eq(src1)
     yield dut.src2_i.eq(src2)
     yield dut.oper_i.zero_a.eq(zero_a)
-    yield dut.oper_i.imm_data.imm.eq(imm)
+    yield dut.oper_i.imm_data.data.eq(imm)
     yield dut.oper_i.imm_data.ok.eq(imm_ok)
     yield dut.issue_i.eq(1)
     yield
@@ -675,9 +753,9 @@ def load(dut, src1, src2, imm, imm_ok=True, update=False, zero_a=False,
 
     # wait for the operands (RA, RB, or both)
     if rd:
-        yield dut.rd.go.eq(rd)
+        yield dut.rd.go_i.eq(rd)
         yield from wait_for(dut.rd.rel_o)
-        yield dut.rd.go.eq(0)
+        yield dut.rd.go_i.eq(0)
 
     yield from wait_for(dut.adr_rel_o, False, test1st=True)
     # yield dut.ad.go.eq(1)
@@ -686,24 +764,24 @@ def load(dut, src1, src2, imm, imm_ok=True, update=False, zero_a=False,
 
     if update:
         yield from wait_for(dut.wr.rel_o[1])
-        yield dut.wr.go.eq(0b10)
+        yield dut.wr.go_i.eq(0b10)
         yield
         addr = yield dut.addr_o
         print("addr", addr)
-        yield dut.wr.go.eq(0)
+        yield dut.wr.go_i.eq(0)
     else:
         addr = None
 
     yield from wait_for(dut.wr.rel_o[0], test1st=True)
-    yield dut.wr.go.eq(1)
+    yield dut.wr.go_i.eq(1)
     yield
-    data = yield dut.data_o
-    print(data)
-    yield dut.wr.go.eq(0)
+    data = yield dut.o_data.o
+    data_ok = yield dut.o_data.o_ok
+    yield dut.wr.go_i.eq(0)
     yield from wait_for(dut.busy_o)
     yield
     # wait_for(dut.stwd_mem_o)
-    return data, addr
+    return data, data_ok, addr
 
 
 def ldst_sim(dut):
@@ -743,22 +821,31 @@ def ldst_sim(dut):
 
 class TestLDSTCompUnit(LDSTCompUnit):
 
-    def __init__(self, rwid):
+    def __init__(self, rwid, pspec):
         from soc.experiment.l0_cache import TstL0CacheBuffer
-        self.l0 = l0 = TstL0CacheBuffer()
-        pi = l0.l0.dports[0].pi
+        self.l0 = l0 = TstL0CacheBuffer(pspec)
+        pi = l0.l0.dports[0]
         LDSTCompUnit.__init__(self, pi, rwid, 4)
 
     def elaborate(self, platform):
         m = LDSTCompUnit.elaborate(self, platform)
         m.submodules.l0 = self.l0
-        m.d.comb += self.ad.go.eq(self.ad.rel)  # link addr-go direct to rel
+        # link addr-go direct to rel
+        m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
         return m
 
 
 def test_scoreboard():
 
-    dut = TestLDSTCompUnit(16)
+    units = {}
+    pspec = TestMemPspec(ldst_ifacetype='bare_wb',
+                         imem_ifacetype='bare_wb',
+                         addr_wid=64,
+                         mask_wid=8,
+                         reg_wid=64,
+                         units=units)
+
+    dut = TestLDSTCompUnit(16,pspec)
     vl = rtlil.convert(dut, ports=dut.ports())
     with open("test_ldst_comp.il", "w") as f:
         f.write(vl)
@@ -768,24 +855,33 @@ def test_scoreboard():
 
 class TestLDSTCompUnitRegSpec(LDSTCompUnit):
 
-    def __init__(self):
+    def __init__(self, pspec):
         from soc.experiment.l0_cache import TstL0CacheBuffer
         from soc.fu.ldst.pipe_data import LDSTPipeSpec
         regspec = LDSTPipeSpec.regspec
-        self.l0 = l0 = TstL0CacheBuffer()
-        pi = l0.l0.dports[0].pi
+        self.l0 = l0 = TstL0CacheBuffer(pspec)
+        pi = l0.l0.dports[0]
         LDSTCompUnit.__init__(self, pi, regspec, 4)
 
     def elaborate(self, platform):
         m = LDSTCompUnit.elaborate(self, platform)
         m.submodules.l0 = self.l0
-        m.d.comb += self.ad.go.eq(self.ad.rel)  # link addr-go direct to rel
+        # link addr-go direct to rel
+        m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
         return m
 
 
 def test_scoreboard_regspec():
 
-    dut = TestLDSTCompUnitRegSpec()
+    units = {}
+    pspec = TestMemPspec(ldst_ifacetype='bare_wb',
+                         imem_ifacetype='bare_wb',
+                         addr_wid=64,
+                         mask_wid=8,
+                         reg_wid=64,
+                         units=units)
+
+    dut = TestLDSTCompUnitRegSpec(pspec)
     vl = rtlil.convert(dut, ports=dut.ports())
     with open("test_ldst_comp.il", "w") as f:
         f.write(vl)
index 187918c0dbaa6f22ac02828a6d1dd1bdb4a953bc..bb9ff6e02e9dc8936960346c1cb37398d9c97878 100644 (file)
@@ -1,6 +1,6 @@
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
+from nmigen import Module, Const, Signal, Cat, Elaboratable
 
 from regfile.regfile import RegFileArray, treereduce
 from scoreboard.fn_unit import IntFnUnit, FPFnUnit, LDFnUnit, STFnUnit
@@ -81,7 +81,7 @@ class Scoreboard(Elaboratable):
             int_src2_pend_v.append(fu.src2_pend_o)
             int_rd_pend_v.append(fu.int_rd_pend_o)
             int_wr_pend_v.append(fu.int_wr_pend_o)
-        int_fus = Array(if_l)
+        int_fus = if_l
 
         # Count of number of FUs
         n_int_fus = len(if_l)
@@ -217,7 +217,7 @@ class Scoreboard(Elaboratable):
         # merge (OR) all integer FU / ALU outputs to a single value
         # bit of a hack: treereduce needs a list with an item named "dest_o"
         dest_o = treereduce(int_alus)
-        m.d.sync += int_dest.data_i.eq(dest_o)
+        m.d.sync += int_dest.i_data.eq(dest_o)
 
         # connect ALUs
         for i, alu in enumerate(int_alus):
@@ -225,8 +225,8 @@ class Scoreboard(Elaboratable):
             m.d.comb += alu.go_wr_i.eq(intpick1.go_wr_o[i])
             m.d.comb += alu.issue_i.eq(fn_issue_l[i])
             # m.d.comb += fn_busy_l[i].eq(alu.busy_o)  # XXX ignore, use fnissue
-            m.d.comb += alu.src1_i.eq(int_src1.data_o)
-            m.d.comb += alu.src2_i.eq(int_src2.data_o)
+            m.d.comb += alu.src1_i.eq(int_src1.o_data)
+            m.d.comb += alu.src2_i.eq(int_src2.o_data)
             m.d.comb += if_l[i].req_rel_i.eq(alu.req_rel_o)  # pipe out ready
 
         return m
@@ -265,8 +265,12 @@ class RegSim:
         src2 = self.regs[src2]
         if op == IADD:
             val = (src1 + src2) & ((1 << (self.rwidth))-1)
+            print ("RegSim op: ADD", hex(src1), hex(src2), hex(val))
         elif op == ISUB:
             val = (src1 - src2) & ((1 << (self.rwidth))-1)
+            print ("RegSim op: SUB", hex(src1), hex(src2), hex(val))
+        else:
+            print ("RegSim op: UNSUPPORTED", op)
         self.regs[dest] = val
 
     def setval(self, dest, val):
index 8c002d28f542ca0c98c6702c2f23f2e2be28a64e..eae0bc7582866b702318609491f974f9cc9e8e38 100644 (file)
@@ -1,3 +1,17 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2020,2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Copyright (C) 2020 Cole Poirier
+# Copyright (C) 2020,2021 Cesar Strauss
+# Copyright (C) 2021 Tobias Platen
+#
+# Original dcache.vhdl Copyright of its authors and licensed
+# by IBM under CC-BY 4.0
+# https://github.com/antonblanchard/microwatt
+#
+# Conversion to nmigen funded by NLnet and NGI POINTER under EU Grants
+# 871528 and 957073, under the LGPL-v3+ License
+
 """DCache
 
 based on Anton Blanchard microwatt dcache.vhdl
@@ -13,6 +27,8 @@ Links:
 
 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
 * https://bugs.libre-soc.org/show_bug.cgi?id=469
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+  (discussion about brams for ECP5)
 
 """
 
@@ -24,12 +40,16 @@ sys.setrecursionlimit(1000000)
 
 from enum import Enum, unique
 
-from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
+from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
+                    Record, Memory)
 from nmutil.util import Display
+from nmigen.lib.coding import Decoder
 
 from copy import deepcopy
 from random import randint, seed
 
+from nmigen_soc.wishbone.bus import Interface
+
 from nmigen.cli import main
 from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
@@ -45,8 +65,8 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBIOMasterOut, WBIOSlaveOut)
 
 from soc.experiment.cache_ram import CacheRam
-#from soc.experiment.plru import PLRU
-from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
+#from nmutil.plru import PLRU, PLRUs
 
 # for test
 from soc.bus.sram import SRAM
@@ -59,224 +79,248 @@ from nmutil.sim_tmp_alternative import Simulator
 
 from nmutil.util import wrap
 
-
-# TODO: make these parameters of DCache at some point
-LINE_SIZE = 64    # Line size in bytes
-NUM_LINES = 16    # Number of lines in a set
-NUM_WAYS = 4      # Number of ways
-TLB_SET_SIZE = 64 # L1 DTLB entries per set
-TLB_NUM_WAYS = 2  # L1 DTLB number of sets
-TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
 LOG_LENGTH = 0    # Non-zero to enable log data collection
 
-# BRAM organisation: We never access more than
-#     -- WB_DATA_BITS at a time so to save
-#     -- resources we make the array only that wide, and
-#     -- use consecutive indices for to make a cache "line"
-#     --
-#     -- ROW_SIZE is the width in bytes of the BRAM
-#     -- (based on WB, so 64-bits)
-ROW_SIZE = WB_DATA_BITS // 8;
-
-# ROW_PER_LINE is the number of row (wishbone
-# transactions) in a line
-ROW_PER_LINE = LINE_SIZE // ROW_SIZE
-
-# BRAM_ROWS is the number of rows in BRAM needed
-# to represent the full dcache
-BRAM_ROWS = NUM_LINES * ROW_PER_LINE
-
-print ("ROW_SIZE", ROW_SIZE)
-print ("ROW_PER_LINE", ROW_PER_LINE)
-print ("BRAM_ROWS", BRAM_ROWS)
-print ("NUM_WAYS", NUM_WAYS)
-
-# Bit fields counts in the address
-
-# REAL_ADDR_BITS is the number of real address
-# bits that we store
-REAL_ADDR_BITS = 56
-
-# ROW_BITS is the number of bits to select a row
-ROW_BITS = log2_int(BRAM_ROWS)
-
-# ROW_LINE_BITS is the number of bits to select
-# a row within a line
-ROW_LINE_BITS = log2_int(ROW_PER_LINE)
-
-# LINE_OFF_BITS is the number of bits for
-# the offset in a cache line
-LINE_OFF_BITS = log2_int(LINE_SIZE)
-
-# ROW_OFF_BITS is the number of bits for
-# the offset in a row
-ROW_OFF_BITS = log2_int(ROW_SIZE)
-
-# INDEX_BITS is the number if bits to
-# select a cache line
-INDEX_BITS = log2_int(NUM_LINES)
-
-# SET_SIZE_BITS is the log base 2 of the set size
-SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
-
-# TAG_BITS is the number of bits of
-# the tag part of the address
-TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
-
-# TAG_WIDTH is the width in bits of each way of the tag RAM
-TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
-
-# WAY_BITS is the number of bits to select a way
-WAY_BITS = log2_int(NUM_WAYS)
-
-# Example of layout for 32 lines of 64 bytes:
-layout = """\
-  ..  tag    |index|  line  |
-  ..         |   row   |    |
-  ..         |     |---|    | ROW_LINE_BITS  (3)
-  ..         |     |--- - --| LINE_OFF_BITS (6)
-  ..         |         |- --| ROW_OFF_BITS  (3)
-  ..         |----- ---|    | ROW_BITS      (8)
-  ..         |-----|        | INDEX_BITS    (5)
-  .. --------|              | TAG_BITS      (45)
-"""
-print (layout)
-print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
-            (TAG_BITS, INDEX_BITS, ROW_BITS,
-             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
-print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
-print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
-print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
-
-TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
-
-print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
-
-def CacheTagArray():
-    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
-                        for x in range(NUM_LINES))
-
-def CacheValidBitsArray():
-    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
-                        for x in range(NUM_LINES))
-
-def RowPerLineValidArray():
-    return Array(Signal(name="rows_valid%d" % x) \
-                        for x in range(ROW_PER_LINE))
-
-# L1 TLB
-TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
-TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
-TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
-TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
-TLB_PTE_BITS     = 64
-TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
-
 def ispow2(x):
     return (1<<log2_int(x, False)) == x
 
-assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
-assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
-assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
-assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
-assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
-assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
-        "geometry bits don't add up"
-assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
-        "geometry bits don't add up"
-assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
-         "geometry bits don't add up"
-assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
-assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
-
-
-def TLBValidBitsArray():
-    return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
-                for x in range(TLB_SET_SIZE))
-
-def TLBTagEAArray():
-    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
-                for x in range (TLB_NUM_WAYS))
-
-def TLBTagsArray():
-    return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
-                for x in range (TLB_SET_SIZE))
-
-def TLBPtesArray():
-    return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
-                for x in range(TLB_SET_SIZE))
-
-def HitWaySet():
-    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
-                        for x in range(TLB_NUM_WAYS))
-
-# Cache RAM interface
-def CacheRamOut():
-    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
-                 for x in range(NUM_WAYS))
-
-# PLRU output interface
-def PLRUOut():
-    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
-                for x in range(NUM_LINES))
-
-# TLB PLRU output interface
-def TLBPLRUOut():
-    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
-                for x in range(TLB_SET_SIZE))
-
-# Helper functions to decode incoming requests
-#
-# Return the cache line index (tag index) for an address
-def get_index(addr):
-    return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 
-# Return the cache row index (data memory) for an address
-def get_row(addr):
-    return addr[ROW_OFF_BITS:SET_SIZE_BITS]
+class DCacheConfig:
+    def __init__(self, LINE_SIZE = 64,    # Line size in bytes
+                       NUM_LINES = 64,    # Number of lines in a set
+                       NUM_WAYS = 2,      # Number of ways
+                       TLB_SET_SIZE = 64, # L1 DTLB entries per set
+                       TLB_NUM_WAYS = 2,  # L1 DTLB number of sets
+                       TLB_LG_PGSZ = 12): # L1 DTLB log_2(page_size)
+        self.LINE_SIZE = LINE_SIZE
+        self.NUM_LINES = NUM_LINES
+        self.NUM_WAYS = NUM_WAYS
+        self.TLB_SET_SIZE = TLB_SET_SIZE
+        self.TLB_NUM_WAYS = TLB_NUM_WAYS
+        self.TLB_LG_PGSZ = TLB_LG_PGSZ
+
+        # BRAM organisation: We never access more than
+        #     -- WB_DATA_BITS at a time so to save
+        #     -- resources we make the array only that wide, and
+        #     -- use consecutive indices to make a cache "line"
+        #     --
+        #     -- ROW_SIZE is the width in bytes of the BRAM
+        #     -- (based on WB, so 64-bits)
+        self.ROW_SIZE = WB_DATA_BITS // 8;
+
+        # ROW_PER_LINE is the number of row (wishbone
+        # transactions) in a line
+        self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
+
+        # BRAM_ROWS is the number of rows in BRAM needed
+        # to represent the full dcache
+        self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
+
+        print ("ROW_SIZE", self.ROW_SIZE)
+        print ("ROW_PER_LINE", self.ROW_PER_LINE)
+        print ("BRAM_ROWS", self.BRAM_ROWS)
+        print ("NUM_WAYS", self.NUM_WAYS)
+
+        # Bit fields counts in the address
+
+        # REAL_ADDR_BITS is the number of real address
+        # bits that we store
+        self.REAL_ADDR_BITS = 56
+
+        # ROW_BITS is the number of bits to select a row
+        self.ROW_BITS = log2_int(self.BRAM_ROWS)
+
+        # ROW_LINE_BITS is the number of bits to select
+        # a row within a line
+        self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
+
+        # LINE_OFF_BITS is the number of bits for
+        # the offset in a cache line
+        self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
+
+        # ROW_OFF_BITS is the number of bits for
+        # the offset in a row
+        self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
+
+        # INDEX_BITS is the number if bits to
+        # select a cache line
+        self.INDEX_BITS = log2_int(self.NUM_LINES)
+
+        # SET_SIZE_BITS is the log base 2 of the set size
+        self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
+
+        # TAG_BITS is the number of bits of
+        # the tag part of the address
+        self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
+
+        # TAG_WIDTH is the width in bits of each way of the tag RAM
+        self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
+
+        # WAY_BITS is the number of bits to select a way
+        self.WAY_BITS = log2_int(self.NUM_WAYS)
+
+        # Example of layout for 32 lines of 64 bytes:
+        layout = f"""\
+          DCache Layout:
+         |.. -----------------------| REAL_ADDR_BITS ({self.REAL_ADDR_BITS})
+          ..         |--------------| SET_SIZE_BITS ({self.SET_SIZE_BITS})
+          ..  tag    |index|  line  |
+          ..         |   row   |    |
+          ..         |     |---|    | ROW_LINE_BITS ({self.ROW_LINE_BITS})
+          ..         |     |--- - --| LINE_OFF_BITS ({self.LINE_OFF_BITS})
+          ..         |         |- --| ROW_OFF_BITS  ({self.ROW_OFF_BITS})
+          ..         |----- ---|    | ROW_BITS      ({self.ROW_BITS})
+          ..         |-----|        | INDEX_BITS    ({self.INDEX_BITS})
+          .. --------|              | TAG_BITS      ({self.TAG_BITS})
+        """
+        print (layout)
+        print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
+                    (self.TAG_BITS, self.INDEX_BITS, self.ROW_BITS,
+                     self.ROW_OFF_BITS, self.LINE_OFF_BITS, self.ROW_LINE_BITS))
+        print ("index @: %d-%d" % (self.LINE_OFF_BITS, self.SET_SIZE_BITS))
+        print ("row @: %d-%d" % (self.LINE_OFF_BITS, self.ROW_OFF_BITS))
+        print ("tag @: %d-%d width %d" % (self.SET_SIZE_BITS,
+                                          self.REAL_ADDR_BITS, self.TAG_WIDTH))
+
+        self.TAG_RAM_WIDTH = self.TAG_WIDTH * self.NUM_WAYS
+
+        print ("TAG_RAM_WIDTH", self.TAG_RAM_WIDTH)
+        print ("    TAG_WIDTH", self.TAG_WIDTH)
+        print ("     NUM_WAYS", self.NUM_WAYS)
+        print ("    NUM_LINES", self.NUM_LINES)
+
+        # L1 TLB
+        self.TLB_SET_BITS     = log2_int(self.TLB_SET_SIZE)
+        self.TLB_WAY_BITS     = log2_int(self.TLB_NUM_WAYS)
+        self.TLB_EA_TAG_BITS  = 64 - (self.TLB_LG_PGSZ + self.TLB_SET_BITS)
+        self.TLB_TAG_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_EA_TAG_BITS
+        self.TLB_PTE_BITS     = 64
+        self.TLB_PTE_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_PTE_BITS;
+
+        assert (self.LINE_SIZE % self.ROW_SIZE) == 0, \
+                "LINE_SIZE not multiple of ROW_SIZE"
+        assert ispow2(self.LINE_SIZE), "LINE_SIZE not power of 2"
+        assert ispow2(self.NUM_LINES), "NUM_LINES not power of 2"
+        assert ispow2(self.ROW_PER_LINE), "ROW_PER_LINE not power of 2"
+        assert self.ROW_BITS == \
+                (self.INDEX_BITS + self.ROW_LINE_BITS), \
+                "geometry bits don't add up"
+        assert (self.LINE_OFF_BITS == \
+                self.ROW_OFF_BITS + self.ROW_LINE_BITS), \
+                "geometry bits don't add up"
+        assert self.REAL_ADDR_BITS == \
+                (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS), \
+                "geometry bits don't add up"
+        assert self.REAL_ADDR_BITS == \
+                (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS), \
+                 "geometry bits don't add up"
+        assert 64 == WB_DATA_BITS, \
+                "Can't yet handle wb width that isn't 64-bits"
+        assert self.SET_SIZE_BITS <= self.TLB_LG_PGSZ, \
+                "Set indexed by virtual address"
+
+    def CacheTagArray(self):
+        return Array(Signal(self.TAG_RAM_WIDTH, name="tag%d" % x) \
+                       for x in range(self.NUM_LINES))
+
+    def CacheValidsArray(self):
+        return Array(Signal(self.NUM_WAYS, name="tag_valids%d" % x)
+                     for x in range(self.NUM_LINES))
+
+    def RowPerLineValidArray(self):
+        return Array(Signal(name="rows_valid%d" % x) \
+                            for x in range(self.ROW_PER_LINE))
+
+    def TLBHit(self, name):
+        return Record([('valid', 1),
+                       ('way', self.TLB_WAY_BITS)], name=name)
+
+    def TLBTagEAArray(self):
+        return Array(Signal(self.TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
+                    for x in range (self.TLB_NUM_WAYS))
+
+    def TLBRecord(self, name):
+        tlb_layout = [('valid', self.TLB_NUM_WAYS),
+                      ('tag', self.TLB_TAG_WAY_BITS),
+                      ('pte', self.TLB_PTE_WAY_BITS)
+                     ]
+        return Record(tlb_layout, name=name)
+
+    def TLBValidArray(self):
+        return Array(Signal(self.TLB_NUM_WAYS, name="tlb_valid%d" % x)
+                            for x in range(self.TLB_SET_SIZE))
+
+    def HitWaySet(self):
+        return Array(Signal(self.WAY_BITS, name="hitway_%d" % x) \
+                            for x in range(self.TLB_NUM_WAYS))
+
+    # Cache RAM interface
+    def CacheRamOut(self):
+        return Array(Signal(self.WB_DATA_BITS, name="cache_out%d" % x) \
+                     for x in range(self.NUM_WAYS))
+
+    # PLRU output interface
+    def PLRUOut(self):
+        return Array(Signal(self.WAY_BITS, name="plru_out%d" % x) \
+                    for x in range(self.NUM_LINES))
+
+    # TLB PLRU output interface
+    def TLBPLRUOut(self):
+        return Array(Signal(self.TLB_WAY_BITS, name="tlbplru_out%d" % x) \
+                    for x in range(self.TLB_SET_SIZE))
+
+    # Helper functions to decode incoming requests
+    #
+    # Return the cache line index (tag index) for an address
+    def get_index(self, addr):
+        return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
+
+    # Return the cache row index (data memory) for an address
+    def get_row(self, addr):
+        return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
 
-# Return the index of a row within a line
-def get_row_of_line(row):
-    return row[:ROW_BITS][:ROW_LINE_BITS]
+    # Return the index of a row within a line
+    def get_row_of_line(self, row):
+        return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
 
-# Returns whether this is the last row of a line
-def is_last_row_addr(addr, last):
-    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
+    # Returns whether this is the last row of a line
+    def is_last_row_addr(self, addr, last):
+        return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
 
-# Returns whether this is the last row of a line
-def is_last_row(row, last):
-    return get_row_of_line(row) == last
+    # Returns whether this is the last row of a line
+    def is_last_row(self, row, last):
+        return self.get_row_of_line(row) == last
 
-# Return the next row in the current cache line. We use a
-# dedicated function in order to limit the size of the
-# generated adder to be only the bits within a cache line
-# (3 bits with default settings)
-def next_row(row):
-    row_v = row[0:ROW_LINE_BITS] + 1
-    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
+    # Return the next row in the current cache line. We use a
+    # dedicated function in order to limit the size of the
+    # generated adder to be only the bits within a cache line
+    # (3 bits with default settings)
+    def next_row(self, row):
+        row_v = row[0:self.ROW_LINE_BITS] + 1
+        return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
 
-# Get the tag value from the address
-def get_tag(addr):
-    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
+    # Get the tag value from the address
+    def get_tag(self, addr):
+        return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
 
-# Read a tag from a tag memory row
-def read_tag(way, tagset):
-    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
+    # Read a tag from a tag memory row
+    def read_tag(self, way, tagset):
+        return tagset.word_select(way, self.TAG_WIDTH)[:self.TAG_BITS]
 
-# Read a TLB tag from a TLB tag memory row
-def read_tlb_tag(way, tags):
-    return tags.word_select(way, TLB_EA_TAG_BITS)
+    # Read a TLB tag from a TLB tag memory row
+    def read_tlb_tag(self, way, tags):
+        return tags.word_select(way, self.TLB_EA_TAG_BITS)
 
-# Write a TLB tag to a TLB tag memory row
-def write_tlb_tag(way, tags, tag):
-    return read_tlb_tag(way, tags).eq(tag)
+    # Write a TLB tag to a TLB tag memory row
+    def write_tlb_tag(self, way, tags, tag):
+        return self.read_tlb_tag(way, tags).eq(tag)
 
-# Read a PTE from a TLB PTE memory row
-def read_tlb_pte(way, ptes):
-    return ptes.word_select(way, TLB_PTE_BITS)
+    # Read a PTE from a TLB PTE memory row
+    def read_tlb_pte(self, way, ptes):
+        return ptes.word_select(way, self.TLB_PTE_BITS)
 
-def write_tlb_pte(way, ptes, newpte):
-    return read_tlb_pte(way, ptes).eq(newpte)
+    def write_tlb_pte(self, way, ptes, newpte):
+        return self.read_tlb_pte(way, ptes).eq(newpte)
 
 
 # Record for storing permission, attribute, etc. bits from a PTE
@@ -347,15 +391,15 @@ class RegStage0(RecordObject):
 
 
 class MemAccessRequest(RecordObject):
-    def __init__(self, name=None):
+    def __init__(self, cfg, name=None):
         super().__init__(name=name)
         self.op        = Signal(Op)
         self.valid     = Signal()
         self.dcbz      = Signal()
-        self.real_addr = Signal(REAL_ADDR_BITS)
+        self.real_addr = Signal(cfg.REAL_ADDR_BITS)
         self.data      = Signal(64)
         self.byte_sel  = Signal(8)
-        self.hit_way   = Signal(WAY_BITS)
+        self.hit_way   = Signal(cfg.WAY_BITS)
         self.same_tag  = Signal()
         self.mmu_req   = Signal()
 
@@ -363,31 +407,30 @@ class MemAccessRequest(RecordObject):
 # First stage register, contains state for stage 1 of load hits
 # and for the state machine used by all other operations
 class RegStage1(RecordObject):
-    def __init__(self, name=None):
+    def __init__(self, cfg, name=None):
         super().__init__(name=name)
         # Info about the request
         self.full             = Signal() # have uncompleted request
         self.mmu_req          = Signal() # request is from MMU
-        self.req              = MemAccessRequest(name="reqmem")
+        self.req              = MemAccessRequest(cfg, name="reqmem")
 
         # Cache hit state
-        self.hit_way          = Signal(WAY_BITS)
+        self.hit_way          = Signal(cfg.WAY_BITS)
         self.hit_load_valid   = Signal()
-        self.hit_index        = Signal(INDEX_BITS)
+        self.hit_index        = Signal(cfg.INDEX_BITS)
         self.cache_hit        = Signal()
 
         # TLB hit state
-        self.tlb_hit          = Signal()
-        self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
-        self.tlb_hit_index    = Signal(TLB_WAY_BITS)
+        self.tlb_hit          = cfg.TLBHit("tlb_hit")
+        self.tlb_hit_index    = Signal(cfg.TLB_SET_BITS)
 
         # 2-stage data buffer for data forwarded from writes to reads
         self.forward_data1    = Signal(64)
         self.forward_data2    = Signal(64)
         self.forward_sel1     = Signal(8)
         self.forward_valid1   = Signal()
-        self.forward_way1     = Signal(WAY_BITS)
-        self.forward_row1     = Signal(ROW_BITS)
+        self.forward_way1     = Signal(cfg.WAY_BITS)
+        self.forward_row1     = Signal(cfg.ROW_BITS)
         self.use_forward1     = Signal()
         self.forward_sel      = Signal(8)
 
@@ -398,12 +441,12 @@ class RegStage1(RecordObject):
         self.write_tag        = Signal()
         self.slow_valid       = Signal()
         self.wb               = WBMasterOut("wb")
-        self.reload_tag       = Signal(TAG_BITS)
-        self.store_way        = Signal(WAY_BITS)
-        self.store_row        = Signal(ROW_BITS)
-        self.store_index      = Signal(INDEX_BITS)
-        self.end_row_ix       = Signal(ROW_LINE_BITS)
-        self.rows_valid       = RowPerLineValidArray()
+        self.reload_tag       = Signal(cfg.TAG_BITS)
+        self.store_way        = Signal(cfg.WAY_BITS)
+        self.store_row        = Signal(cfg.ROW_BITS)
+        self.store_index      = Signal(cfg.INDEX_BITS)
+        self.end_row_ix       = Signal(cfg.ROW_LINE_BITS)
+        self.rows_valid       = cfg.RowPerLineValidArray()
         self.acks_pending     = Signal(3)
         self.inc_acks         = Signal()
         self.dec_acks         = Signal()
@@ -421,94 +464,178 @@ class RegStage1(RecordObject):
 
 # Reservation information
 class Reservation(RecordObject):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, cfg, name=None):
+        super().__init__(name=name)
         self.valid = Signal()
-        self.addr  = Signal(64-LINE_OFF_BITS)
+        self.addr  = Signal(64-cfg.LINE_OFF_BITS)
 
 
 class DTLBUpdate(Elaboratable):
-    def __init__(self):
+    def __init__(self, cfg):
+        self.cfg = cfg
         self.tlbie    = Signal()
         self.tlbwe    = Signal()
         self.doall    = Signal()
-        self.updated  = Signal()
-        self.v_updated  = Signal()
-        self.tlb_hit    = Signal()
-        self.tlb_req_index = Signal(TLB_SET_BITS)
+        self.tlb_hit     = cfg.TLBHit("tlb_hit")
+        self.tlb_req_index = Signal(cfg.TLB_SET_BITS)
 
-        self.tlb_hit_way     = Signal(TLB_WAY_BITS)
-        self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
-        self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
-        self.repl_way        = Signal(TLB_WAY_BITS)
-        self.eatag           = Signal(TLB_EA_TAG_BITS)
-        self.pte_data        = Signal(TLB_PTE_BITS)
+        self.repl_way        = Signal(cfg.TLB_WAY_BITS)
+        self.eatag           = Signal(cfg.TLB_EA_TAG_BITS)
+        self.pte_data        = Signal(cfg.TLB_PTE_BITS)
 
-        self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
-
-        self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
-        self.db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
-        self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        # read from dtlb array
+        self.tlb_read       = Signal()
+        self.tlb_read_index = Signal(cfg.TLB_SET_BITS)
+        self.tlb_way        = cfg.TLBRecord("o_tlb_way")
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
         sync = m.d.sync
-
-        tagset   = Signal(TLB_TAG_WAY_BITS)
-        pteset   = Signal(TLB_PTE_WAY_BITS)
-
-        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
-        comb += db_out.eq(self.dv)
+        cfg = self.cfg
+
+        # there are 3 parts to this:
+        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
+        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
+        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
+        # be a Memory because they can all be cleared (tlbie, doall), i mean,
+        # we _could_, in theory, by overriding the Reset Signal of the Memory,
+        # hmmm....
+
+        dtlb_valid = cfg.TLBValidArray()
+        tlb_req_index = self.tlb_req_index
+
+        print ("TLB_TAG_WAY_BITS", cfg.TLB_TAG_WAY_BITS)
+        print ("     TLB_EA_TAG_BITS", cfg.TLB_EA_TAG_BITS)
+        print ("        TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
+        print ("TLB_PTE_WAY_BITS", cfg.TLB_PTE_WAY_BITS)
+        print ("    TLB_PTE_BITS", cfg.TLB_PTE_BITS)
+        print ("    TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
+
+        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
+        tagway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_TAG_WAY_BITS,
+                             attrs={'syn_ramstyle': "block_ram"})
+        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
+        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
+                                    granularity=cfg.TLB_EA_TAG_BITS)
+
+        pteway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_PTE_WAY_BITS,
+                             attrs={'syn_ramstyle': "block_ram"})
+        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
+        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
+                                    granularity=cfg.TLB_PTE_BITS)
+
+        # commented out for now, can be put in if Memory.reset can be
+        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
+        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
+        #m.submodules.rd_valid = rd_valid = validm.read_port()
+        #m.submodules.wr_valid = wr_valid = validm.write_port(
+                                    #granularity=1)
+
+        # connect up read and write addresses to Valid/PTE/TAG SRAMs
+        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
+        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
+        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
+        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
+        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
+        #m.d.comb += wr_valid.addr.eq(tlb_req_index)
+
+        updated  = Signal()
+        v_updated  = Signal()
+        tb_out = Signal(cfg.TLB_TAG_WAY_BITS) # tlb_way_tags_t
+        db_out = Signal(cfg.TLB_NUM_WAYS)     # tlb_way_valids_t
+        pb_out = Signal(cfg.TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        dv = Signal(cfg.TLB_NUM_WAYS) # tlb_way_valids_t
+
+        comb += dv.eq(dtlb_valid[tlb_req_index])
+        comb += db_out.eq(dv)
 
         with m.If(self.tlbie & self.doall):
-            pass # clear all back in parent
+            # clear all valid bits at once
+            # XXX hmmm, validm _could_ use Memory reset here...
+            for i in range(cfg.TLB_SET_SIZE):
+                sync += dtlb_valid[i].eq(0)
         with m.Elif(self.tlbie):
-            with m.If(self.tlb_hit):
-                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
-                comb += self.v_updated.eq(1)
-
+            # invalidate just the hit_way
+            with m.If(self.tlb_hit.valid):
+                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
+                comb += v_updated.eq(1)
         with m.Elif(self.tlbwe):
-
-            comb += tagset.eq(self.tlb_tag_way)
-            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
-            comb += tb_out.eq(tagset)
-
-            comb += pteset.eq(self.tlb_pte_way)
-            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
-            comb += pb_out.eq(pteset)
-
+            # write to the requested tag and PTE
+            comb += cfg.write_tlb_tag(self.repl_way, tb_out, self.eatag)
+            comb += cfg.write_tlb_pte(self.repl_way, pb_out, self.pte_data)
+            # set valid bit
             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 
-            comb += self.updated.eq(1)
-            comb += self.v_updated.eq(1)
+            comb += updated.eq(1)
+            comb += v_updated.eq(1)
+
+        # above, sometimes valid is requested to be updated but data not
+        # therefore split them out, here.  note the granularity thing matches
+        # with the shift-up of the eatag/pte_data into the correct TLB way.
+        # thus is it not necessary to write the entire lot, just the portion
+        # being altered: hence writing the *old* copy of the row is not needed
+        with m.If(updated): # PTE and TAG to be written
+            comb += wr_pteway.data.eq(pb_out)
+            comb += wr_pteway.en.eq(1<<self.repl_way)
+            comb += wr_tagway.data.eq(tb_out)
+            comb += wr_tagway.en.eq(1<<self.repl_way)
+        with m.If(v_updated): # Valid to be written
+            sync += dtlb_valid[tlb_req_index].eq(db_out)
+            #comb += wr_valid.data.eq(db_out)
+            #comb += wr_valid.en.eq(1<<self.repl_way)
+
+        # select one TLB way, use a register here
+        r_delay = Signal()
+        sync += r_delay.eq(self.tlb_read)
+        # first deal with the valids, which are not in a Memory.
+        # tlb way valid is output on a 1 clock delay with sync,
+        # but have to explicitly deal with "forwarding" here
+        with m.If(self.tlb_read):
+            with m.If(v_updated): # write *and* read in same cycle: forward
+                sync += self.tlb_way.valid.eq(db_out)
+            with m.Else():
+                sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
+        # now deal with the Memory-read case. the output must remain
+        # valid (stable) even when a read-request is not made, but stable
+        # on a one-clock delay, hence the register
+        r_tlb_way        = cfg.TLBRecord("r_tlb_way")
+        with m.If(r_delay):
+            # on one clock delay, capture the contents of the read port(s)
+            comb += self.tlb_way.tag.eq(rd_tagway.data)
+            comb += self.tlb_way.pte.eq(rd_pteway.data)
+            sync += r_tlb_way.tag.eq(rd_tagway.data)
+            sync += r_tlb_way.pte.eq(rd_pteway.data)
+        with m.Else():
+            # ... so that the register can output it when no read is requested
+            # it's rather overkill but better to be safe than sorry
+            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
+            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
+            #comb += self.tlb_way.eq(r_tlb_way)
 
         return m
 
 
 class DCachePendingHit(Elaboratable):
 
-    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
-                      cache_valid_idx, cache_tag_set,
-                    req_addr,
-                    hit_set):
+    def __init__(self, cfg, tlb_way,
+                      cache_i_validdx, cache_tag_set,
+                    req_addr):
 
         self.go          = Signal()
         self.virt_mode   = Signal()
         self.is_hit      = Signal()
-        self.tlb_hit     = Signal()
-        self.hit_way     = Signal(WAY_BITS)
+        self.tlb_hit     = cfg.TLBHit("tlb_hit")
+        self.hit_way     = Signal(cfg.WAY_BITS)
         self.rel_match   = Signal()
-        self.req_index   = Signal(INDEX_BITS)
-        self.reload_tag  = Signal(TAG_BITS)
+        self.req_index   = Signal(cfg.INDEX_BITS)
+        self.reload_tag  = Signal(cfg.TAG_BITS)
 
-        self.tlb_hit_way = tlb_hit_way
-        self.tlb_pte_way = tlb_pte_way
-        self.tlb_valid_way = tlb_valid_way
-        self.cache_valid_idx = cache_valid_idx
+        self.tlb_way = tlb_way
+        self.cache_i_validdx = cache_i_validdx
         self.cache_tag_set = cache_tag_set
         self.req_addr = req_addr
-        self.hit_set = hit_set
+        self.cfg = cfg
 
     def elaborate(self, platform):
         m = Module()
@@ -518,22 +645,22 @@ class DCachePendingHit(Elaboratable):
         go = self.go
         virt_mode = self.virt_mode
         is_hit = self.is_hit
-        tlb_pte_way = self.tlb_pte_way
-        tlb_valid_way = self.tlb_valid_way
-        cache_valid_idx = self.cache_valid_idx
+        tlb_way = self.tlb_way
+        cache_i_validdx = self.cache_i_validdx
         cache_tag_set = self.cache_tag_set
         req_addr = self.req_addr
-        tlb_hit_way = self.tlb_hit_way
         tlb_hit = self.tlb_hit
-        hit_set = self.hit_set
         hit_way = self.hit_way
         rel_match = self.rel_match
         req_index = self.req_index
         reload_tag = self.reload_tag
+        cfg = self.cfg
 
+        hit_set     = Array(Signal(name="hit_set_%d" % i) \
+                                  for i in range(cfg.TLB_NUM_WAYS))
         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
-                                    for i in range(TLB_NUM_WAYS))
-        hit_way_set = HitWaySet()
+                                    for i in range(cfg.TLB_NUM_WAYS))
+        hit_way_set = cfg.HitWaySet()
 
         # Test if pending request is a hit on any way
         # In order to make timing in virtual mode,
@@ -542,38 +669,38 @@ class DCachePendingHit(Elaboratable):
         # the TLB, and then decide later which match to use.
 
         with m.If(virt_mode):
-            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
-                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
-                s_hit       = Signal()
-                s_pte       = Signal(TLB_PTE_BITS)
-                s_ra        = Signal(REAL_ADDR_BITS)
-                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
-                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
-                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
-                comb += s_tag.eq(get_tag(s_ra))
-
-                for i in range(NUM_WAYS): # way_t
+            for j in range(cfg.TLB_NUM_WAYS): # tlb_num_way_t
+                s_tag       = Signal(cfg.TAG_BITS, name="s_tag%d" % j)
+                s_hit       = Signal(name="s_hit%d" % j)
+                s_pte       = Signal(cfg.TLB_PTE_BITS, name="s_pte%d" % j)
+                s_ra        = Signal(cfg.REAL_ADDR_BITS, name="s_ra%d" % j)
+                # read the PTE, calc the Real Address, get tge tag
+                comb += s_pte.eq(cfg.read_tlb_pte(j, tlb_way.pte))
+                comb += s_ra.eq(Cat(req_addr[0:cfg.TLB_LG_PGSZ],
+                                    s_pte[cfg.TLB_LG_PGSZ:cfg.REAL_ADDR_BITS]))
+                comb += s_tag.eq(cfg.get_tag(s_ra))
+                # for each way check tge tag against the cache tag set
+                for i in range(cfg.NUM_WAYS): # way_t
                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
-                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
-                                  (read_tag(i, cache_tag_set) == s_tag)
-                                  & tlb_valid_way[j])
+                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
+                                  (cfg.read_tag(i, cache_tag_set) == s_tag)
+                                  & (tlb_way.valid[j]))
                     with m.If(is_tag_hit):
                         comb += hit_way_set[j].eq(i)
                         comb += s_hit.eq(1)
                 comb += hit_set[j].eq(s_hit)
-                with m.If(s_tag == reload_tag):
-                    comb += rel_matches[j].eq(1)
-            with m.If(tlb_hit):
-                comb += is_hit.eq(hit_set[tlb_hit_way])
-                comb += hit_way.eq(hit_way_set[tlb_hit_way])
-                comb += rel_match.eq(rel_matches[tlb_hit_way])
+                comb += rel_matches[j].eq(s_tag == reload_tag)
+            with m.If(tlb_hit.valid):
+                comb += is_hit.eq(hit_set[tlb_hit.way])
+                comb += hit_way.eq(hit_way_set[tlb_hit.way])
+                comb += rel_match.eq(rel_matches[tlb_hit.way])
         with m.Else():
-            s_tag       = Signal(TAG_BITS)
-            comb += s_tag.eq(get_tag(req_addr))
-            for i in range(NUM_WAYS): # way_t
+            s_tag       = Signal(cfg.TAG_BITS)
+            comb += s_tag.eq(cfg.get_tag(req_addr))
+            for i in range(cfg.NUM_WAYS): # way_t
                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
-                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
-                          (read_tag(i, cache_tag_set) == s_tag))
+                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
+                          (cfg.read_tag(i, cache_tag_set) == s_tag))
                 with m.If(is_tag_hit):
                     comb += hit_way.eq(i)
                     comb += is_hit.eq(1)
@@ -583,7 +710,7 @@ class DCachePendingHit(Elaboratable):
         return m
 
 
-class DCache(Elaboratable):
+class DCache(Elaboratable, DCacheConfig):
     """Set associative dcache write-through
 
     TODO (in no specific order):
@@ -592,7 +719,7 @@ class DCache(Elaboratable):
       at the end of line (this requires dealing with requests coming in
       while not idle...)
     """
-    def __init__(self):
+    def __init__(self, pspec=None):
         self.d_in      = LoadStore1ToDCacheType("d_in")
         self.d_out     = DCacheToLoadStore1Type("d_out")
 
@@ -600,12 +727,54 @@ class DCache(Elaboratable):
         self.m_out     = DCacheToMMUType("m_out")
 
         self.stall_out = Signal()
-
-        self.wb_out    = WBMasterOut("wb_out")
-        self.wb_in     = WBSlaveOut("wb_in")
+        self.any_stall_out = Signal()
+        self.dreq_when_stall = Signal()
+        self.mreq_when_stall = Signal()
+
+        # standard naming (wired to non-standard for compatibility)
+        self.bus = Interface(addr_width=32,
+                            data_width=64,
+                            granularity=8,
+                            features={'stall'},
+                            #alignment=0,
+                            name="dcache")
 
         self.log_out   = Signal(20)
 
+        # test if small cache to be enabled
+        self.small_cache = (hasattr(pspec, "small_cache") and
+                                 (pspec.small_cache == True))
+        # test if microwatt compatibility is to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+        # test if fabric compatibility is to be enabled
+        self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+                                 (pspec.fabric_compat == True))
+
+        XLEN = pspec.XLEN
+        TLB_SET_SIZE = 8
+        TLB_NUM_WAYS = 2
+        NUM_LINES = 8
+        NUM_WAYS = 2
+
+        if self.small_cache:
+            # reduce way sizes and num lines to ridiculously small
+            TLB_SET_SIZE = 2
+            TLB_NUM_WAYS = 1
+            NUM_LINES = 2
+            NUM_WAYS = 1
+        if self.microwatt_compat or self.fabric_compat:
+            # reduce way sizes
+            NUM_WAYS = 1
+            TLB_NUM_WAYS = 1
+
+        super().__init__(TLB_SET_SIZE=TLB_SET_SIZE,
+                         # XLEN=XLEN, # TODO
+                         TLB_NUM_WAYS = TLB_NUM_WAYS,
+                         NUM_LINES = NUM_LINES,
+                         NUM_WAYS = NUM_WAYS
+                        )
+
     def stage_0(self, m, r0, r1, r0_full):
         """Latch the request in r0.req as long as we're not stalling
         """
@@ -634,6 +803,7 @@ class DCache(Elaboratable):
             comb += r.doall.eq(m_in.doall)
             comb += r.tlbld.eq(m_in.tlbld)
             comb += r.mmu_req.eq(1)
+            comb += r.d_valid.eq(1)
             m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                  m_in.addr, m_in.pte, r.req.load)
 
@@ -644,25 +814,25 @@ class DCache(Elaboratable):
             comb += r.doall.eq(0)
             comb += r.tlbld.eq(0)
             comb += r.mmu_req.eq(0)
+            comb += r.d_valid.eq(0)
+
+        sync += r0_full.eq(0)
         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
             sync += r0.eq(r)
             sync += r0_full.eq(r.req.valid)
+        with m.Elif(~r0.d_valid):
             # Sample data the cycle after a request comes in from loadstore1.
             # If another request has come in already then the data will get
             # put directly into req.data below.
-            with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
-                     ~r0.mmu_req):
-                sync += r0.req.data.eq(d_in.data)
-                sync += r0.d_valid.eq(1)
+            sync += r0.req.data.eq(d_in.data)
+            sync += r0.d_valid.eq(1)
         with m.If(d_in.valid):
             m.d.sync += Display("    DCACHE req cache "
                                 "virt %d addr %x data %x ld %d",
                                  r.req.virt_mode, r.req.addr,
                                  r.req.data, r.req.load)
 
-    def tlb_read(self, m, r0_stall, tlb_valid_way,
-                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
-                 dtlb_tags, dtlb_ptes):
+    def tlb_read(self, m, r0_stall, tlb_way):
         """TLB
         Operates in the second cycle on the request latched in r0.req.
         TLB updates write the entry at the end of the second cycle.
@@ -671,78 +841,76 @@ class DCache(Elaboratable):
         sync = m.d.sync
         m_in, d_in = self.m_in, self.d_in
 
-        index    = Signal(TLB_SET_BITS)
-        addrbits = Signal(TLB_SET_BITS)
+        addrbits = Signal(self.TLB_SET_BITS)
 
-        amin = TLB_LG_PGSZ
-        amax = TLB_LG_PGSZ + TLB_SET_BITS
+        amin = self.TLB_LG_PGSZ
+        amax = self.TLB_LG_PGSZ + self.TLB_SET_BITS
 
         with m.If(m_in.valid):
             comb += addrbits.eq(m_in.addr[amin : amax])
         with m.Else():
             comb += addrbits.eq(d_in.addr[amin : amax])
-        comb += index.eq(addrbits)
 
         # If we have any op and the previous op isn't finished,
         # then keep the same output for next cycle.
-        with m.If(~r0_stall):
-            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
-            sync += tlb_tag_way.eq(dtlb_tags[index])
-            sync += tlb_pte_way.eq(dtlb_ptes[index])
+        d = self.dtlb_update
+        comb += d.tlb_read_index.eq(addrbits)
+        comb += d.tlb_read.eq(~r0_stall)
+        comb += tlb_way.eq(d.tlb_way)
 
-    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
+    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
         """Generate TLB PLRUs
         """
         comb = m.d.comb
         sync = m.d.sync
 
-        if TLB_NUM_WAYS == 0:
+        if self.TLB_NUM_WAYS == 0:
             return
-        for i in range(TLB_SET_SIZE):
-            # TLB PLRU interface
-            tlb_plru        = PLRU(TLB_WAY_BITS)
-            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
-            tlb_plru_acc_en = Signal()
 
-            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
-            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
-            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
-            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
+        # suite of PLRUs with a selection and output mechanism
+        tlb_plrus = PLRUs("d_tlb", self.TLB_SET_SIZE, self.TLB_WAY_BITS)
+        m.submodules.tlb_plrus = tlb_plrus
+        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
+        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
+        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
+        comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
+        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 
     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
-                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
-                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
+                   tlb_way,
+                   pte, tlb_hit, valid_ra, perm_attr, ra):
 
         comb = m.d.comb
 
-        hitway = Signal(TLB_WAY_BITS)
+        hitway = Signal(self.TLB_WAY_BITS)
         hit    = Signal()
-        eatag  = Signal(TLB_EA_TAG_BITS)
+        eatag  = Signal(self.TLB_EA_TAG_BITS)
 
-        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
-        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
-        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
+        self.TLB_LG_END = self.TLB_LG_PGSZ + self.TLB_SET_BITS
+        r0_req_addr = r0.req.addr[self.TLB_LG_PGSZ : self.TLB_LG_END]
+        comb += tlb_req_index.eq(r0_req_addr)
+        comb += eatag.eq(r0.req.addr[self.TLB_LG_END : 64 ])
 
-        for i in range(TLB_NUM_WAYS):
+        for i in range(self.TLB_NUM_WAYS):
             is_tag_hit = Signal(name="is_tag_hit%d" % i)
-            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
-            comb += tlb_tag.eq(read_tlb_tag(i, tlb_tag_way))
-            comb += is_tag_hit.eq(tlb_valid_way[i] & (tlb_tag == eatag))
+            tlb_tag = Signal(self.TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
+            comb += tlb_tag.eq(self.read_tlb_tag(i, tlb_way.tag))
+            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
             with m.If(is_tag_hit):
                 comb += hitway.eq(i)
                 comb += hit.eq(1)
 
-        comb += tlb_hit.eq(hit & r0_valid)
-        comb += tlb_hit_way.eq(hitway)
+        comb += tlb_hit.valid.eq(hit & r0_valid)
+        comb += tlb_hit.way.eq(hitway)
 
-        with m.If(tlb_hit):
-            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
-        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
+        with m.If(tlb_hit.valid):
+            comb += pte.eq(self.read_tlb_pte(hitway, tlb_way.pte))
+        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
 
         with m.If(r0.req.virt_mode):
-            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
-                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
-                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
+            comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
+                              r0.req.addr[self.ROW_OFF_BITS:self.TLB_LG_PGSZ],
+                              pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))
             comb += perm_attr.reference.eq(pte[8])
             comb += perm_attr.changed.eq(pte[7])
             comb += perm_attr.nocache.eq(pte[5])
@@ -750,8 +918,8 @@ class DCache(Elaboratable):
             comb += perm_attr.rd_perm.eq(pte[2])
             comb += perm_attr.wr_perm.eq(pte[1])
         with m.Else():
-            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
-                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
+            comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
+                          r0.req.addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS]))
             comb += perm_attr.reference.eq(1)
             comb += perm_attr.changed.eq(1)
             comb += perm_attr.nocache.eq(0)
@@ -761,7 +929,7 @@ class DCache(Elaboratable):
 
         with m.If(valid_ra):
             m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
-                                r0.req.virt_mode, tlb_hit, ra, pte)
+                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
             m.d.sync += Display("       perm ref=%d", perm_attr.reference)
             m.d.sync += Display("       perm chg=%d", perm_attr.changed)
             m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
@@ -769,11 +937,8 @@ class DCache(Elaboratable):
             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 
-    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
-                    tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
-                    dtlb_tags, tlb_pte_way, dtlb_ptes):
-
-        dtlb_valids = TLBValidBitsArray()
+    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
+                    tlb_hit, tlb_plru_victim):
 
         comb = m.d.comb
         sync = m.d.sync
@@ -784,33 +949,19 @@ class DCache(Elaboratable):
         comb += tlbie.eq(r0_valid & r0.tlbie)
         comb += tlbwe.eq(r0_valid & r0.tlbld)
 
-        m.submodules.tlb_update = d = DTLBUpdate()
-        with m.If(tlbie & r0.doall):
-            # clear all valid bits at once
-            for i in range(TLB_SET_SIZE):
-                sync += dtlb_valid_bits[i].eq(0)
-        with m.If(d.updated):
-            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
-            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
-        with m.If(d.v_updated):
-            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
-
-        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
+        d = self.dtlb_update
 
         comb += d.tlbie.eq(tlbie)
         comb += d.tlbwe.eq(tlbwe)
         comb += d.doall.eq(r0.doall)
         comb += d.tlb_hit.eq(tlb_hit)
-        comb += d.tlb_hit_way.eq(tlb_hit_way)
-        comb += d.tlb_tag_way.eq(tlb_tag_way)
-        comb += d.tlb_pte_way.eq(tlb_pte_way)
         comb += d.tlb_req_index.eq(tlb_req_index)
 
-        with m.If(tlb_hit):
-            comb += d.repl_way.eq(tlb_hit_way)
+        with m.If(tlb_hit.valid):
+            comb += d.repl_way.eq(tlb_hit.way)
         with m.Else():
-            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
-        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
+            comb += d.repl_way.eq(tlb_plru_victim)
+        comb += d.eatag.eq(r0.req.addr[self.TLB_LG_PGSZ + self.TLB_SET_BITS:64])
         comb += d.pte_data.eq(r0.req.data)
 
     def maybe_plrus(self, m, r1, plru_victim):
@@ -819,44 +970,47 @@ class DCache(Elaboratable):
         comb = m.d.comb
         sync = m.d.sync
 
-        if TLB_NUM_WAYS == 0:
+        if self.TLB_NUM_WAYS == 0:
             return
 
-        for i in range(NUM_LINES):
-            # PLRU interface
-            plru        = PLRU(WAY_BITS)
-            setattr(m.submodules, "plru%d" % i, plru)
-            plru_acc_en = Signal()
-
-            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
-            comb += plru.acc_en.eq(plru_acc_en)
-            comb += plru.acc_i.eq(r1.hit_way)
-            comb += plru_victim[i].eq(plru.lru_o)
+        # suite of PLRUs with a selection and output mechanism
+        m.submodules.plrus = plrus = PLRUs("dtag", self.NUM_LINES,
+                                                   self.WAY_BITS)
+        comb += plrus.way.eq(r1.hit_way)
+        comb += plrus.valid.eq(r1.cache_hit)
+        comb += plrus.index.eq(r1.hit_index)
+        comb += plrus.isel.eq(r1.store_index) # select victim
+        comb += plru_victim.eq(plrus.o_index) # selected victim
 
-    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
+    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set):
         """Cache tag RAM read port
         """
         comb = m.d.comb
         sync = m.d.sync
+
         m_in, d_in = self.m_in, self.d_in
 
-        index = Signal(INDEX_BITS)
+        # synchronous tag read-port: NOT TRANSPARENT (cannot pass through
+        # write-to-a-read at the same time), seems to pass tests ok
+        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(transparent=False)
+
+        index = Signal(self.INDEX_BITS)
 
         with m.If(r0_stall):
             comb += index.eq(req_index)
         with m.Elif(m_in.valid):
-            comb += index.eq(get_index(m_in.addr))
+            comb += index.eq(self.get_index(m_in.addr))
         with m.Else():
-            comb += index.eq(get_index(d_in.addr))
-        sync += cache_tag_set.eq(cache_tags[index])
+            comb += index.eq(self.get_index(d_in.addr))
+        comb += rd_tag.addr.eq(index)
+        comb += cache_tag_set.eq(rd_tag.data) # read-port is a 1-clock delay
 
     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                        r0_valid, r1, cache_valids, replace_way,
                        use_forward1_next, use_forward2_next,
                        req_hit_way, plru_victim, rc_ok, perm_attr,
                        valid_ra, perm_ok, access_ok, req_op, req_go,
-                       tlb_pte_way,
-                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+                       tlb_hit, tlb_way, cache_tag_set,
                        cancel_store, req_same_tag, r0_stall, early_req_row):
         """Cache request parsing and hit detection
         """
@@ -865,38 +1019,34 @@ class DCache(Elaboratable):
         m_in, d_in = self.m_in, self.d_in
 
         is_hit      = Signal()
-        hit_way     = Signal(WAY_BITS)
+        hit_way     = Signal(self.WAY_BITS)
         op          = Signal(Op)
         opsel       = Signal(3)
         go          = Signal()
         nc          = Signal()
-        hit_set     = Array(Signal(name="hit_set_%d" % i) \
-                                  for i in range(TLB_NUM_WAYS))
-        cache_valid_idx = Signal(NUM_WAYS)
+        cache_i_validdx = Signal(self.NUM_WAYS)
 
         # Extract line, row and tag from request
-        comb += req_index.eq(get_index(r0.req.addr))
-        comb += req_row.eq(get_row(r0.req.addr))
-        comb += req_tag.eq(get_tag(ra))
+        comb += req_index.eq(self.get_index(r0.req.addr))
+        comb += req_row.eq(self.get_row(r0.req.addr))
+        comb += req_tag.eq(self.get_tag(ra))
 
         if False: # display on comb is a bit... busy.
             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                     r0.req.addr, ra, req_index, req_tag, req_row)
 
         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
-        comb += cache_valid_idx.eq(cache_valids[req_index])
-
-        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
-                                tlb_valid_way, tlb_hit_way,
-                                cache_valid_idx, cache_tag_set,
-                                r0.req.addr,
-                                hit_set)
+        comb += cache_i_validdx.eq(cache_valids[req_index])
 
+        m.submodules.dcache_pend = dc = DCachePendingHit(self, tlb_way,
+                                            cache_i_validdx, cache_tag_set,
+                                            r0.req.addr)
         comb += dc.tlb_hit.eq(tlb_hit)
         comb += dc.reload_tag.eq(r1.reload_tag)
         comb += dc.virt_mode.eq(r0.req.virt_mode)
         comb += dc.go.eq(go)
         comb += dc.req_index.eq(req_index)
+
         comb += is_hit.eq(dc.is_hit)
         comb += hit_way.eq(dc.hit_way)
         comb += req_same_tag.eq(dc.rel_match)
@@ -907,14 +1057,14 @@ class DCache(Elaboratable):
             # For a store, consider this a hit even if the row isn't
             # valid since it will be by the time we perform the store.
             # For a load, check the appropriate row valid bit.
-            rrow = Signal(ROW_LINE_BITS)
+            rrow = Signal(self.ROW_LINE_BITS)
             comb += rrow.eq(req_row)
             valid = r1.rows_valid[rrow]
             comb += is_hit.eq((~r0.req.load) | valid)
             comb += hit_way.eq(replace_way)
 
         # Whether to use forwarded data for a load or not
-        with m.If((get_row(r1.req.real_addr) == req_row) &
+        with m.If((self.get_row(r1.req.real_addr) == req_row) &
                   (r1.req.hit_way == hit_way)):
             # Only need to consider r1.write_bram here, since if we
             # are writing refill data here, then we don't have a
@@ -933,7 +1083,7 @@ class DCache(Elaboratable):
 
         # The way to replace on a miss
         with m.If(r1.write_tag):
-            comb += replace_way.eq(plru_victim[r1.store_index])
+            comb += replace_way.eq(plru_victim)
         with m.Else():
             comb += replace_way.eq(r1.store_way)
 
@@ -945,6 +1095,7 @@ class DCache(Elaboratable):
                            (perm_attr.wr_perm |
                               (r0.req.load & perm_attr.rd_perm)))
         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
+
         # Combine the request and cache hit status to decide what
         # operation needs to be done
         comb += nc.eq(r0.req.nc | perm_attr.nocache)
@@ -979,9 +1130,9 @@ class DCache(Elaboratable):
         # row requested.
         with m.If(~r0_stall):
             with m.If(m_in.valid):
-                comb += early_req_row.eq(get_row(m_in.addr))
+                comb += early_req_row.eq(self.get_row(m_in.addr))
             with m.Else():
-                comb += early_req_row.eq(get_row(d_in.addr))
+                comb += early_req_row.eq(self.get_row(d_in.addr))
         with m.Else():
             comb += early_req_row.eq(req_row)
 
@@ -999,12 +1150,12 @@ class DCache(Elaboratable):
             with m.Else():
                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                 with m.If((~reservation.valid) |
-                         (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
+                         (r0.req.addr[self.LINE_OFF_BITS:64] !=
+                          reservation.addr)):
                     comb += cancel_store.eq(1)
 
     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                         reservation, r0):
-
         comb = m.d.comb
         sync = m.d.sync
 
@@ -1013,7 +1164,7 @@ class DCache(Elaboratable):
                 sync += reservation.valid.eq(0)
             with m.Elif(set_rsrv):
                 sync += reservation.valid.eq(1)
-                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
+                sync += reservation.addr.eq(r0.req.addr[self.LINE_OFF_BITS:64])
 
     def writeback_control(self, m, r1, cache_out_row):
         """Return data for loads & completion control logic
@@ -1041,6 +1192,7 @@ class DCache(Elaboratable):
                 dsel = data_fwd.word_select(i, 8)
                 comb += data_out.word_select(i, 8).eq(dsel)
 
+        # DCache output to LoadStore
         comb += d_out.valid.eq(r1.ls_valid)
         comb += d_out.data.eq(data_out)
         comb += d_out.store_done.eq(~r1.stcx_fail)
@@ -1079,7 +1231,10 @@ class DCache(Elaboratable):
 
             # error cases complete without stalling
             with m.If(r1.ls_error):
-                sync += Display("completing ld/st with error")
+                with m.If(r1.dcbz):
+                    sync += Display("completing dcbz with error")
+                with m.Else():
+                    sync += Display("completing ld/st with error")
 
             # Slow ops (load miss, NC, stores)
             with m.If(r1.slow_valid):
@@ -1112,62 +1267,80 @@ class DCache(Elaboratable):
         account by using 1-cycle delayed signals for load hits.
         """
         comb = m.d.comb
-        wb_in = self.wb_in
+        bus = self.bus
+
+        # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
+        # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
+        m.submodules.rams_replace_way_e = rwe = Decoder(self.NUM_WAYS)
+        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
+                   ~r1.write_bram))
+        comb += rwe.i.eq(replace_way)
+
+        m.submodules.rams_hit_way_e = hwe = Decoder(self.NUM_WAYS)
+        comb += hwe.i.eq(r1.hit_way)
+
+        # this one is gated with write_bram, and replace_way_e can never be
+        # set at the same time.  that means that do_write can OR the outputs
+        m.submodules.rams_hit_req_way_e = hre = Decoder(self.NUM_WAYS)
+        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
+        comb += hre.i.eq(r1.req.hit_way)
+
+        # common Signals
+        do_read  = Signal()
+        wr_addr  = Signal(self.ROW_BITS)
+        wr_data  = Signal(WB_DATA_BITS)
+        wr_sel   = Signal(self.ROW_SIZE)
+        rd_addr  = Signal(self.ROW_BITS)
+
+        comb += do_read.eq(1) # always enable
+        comb += rd_addr.eq(early_req_row)
+
+        # Write mux:
+        #
+        # Defaults to wishbone read responses (cache refill)
+        #
+        # For timing, the mux on wr_data/sel/addr is not
+        # dependent on anything other than the current state.
 
-        for i in range(NUM_WAYS):
-            do_read  = Signal(name="do_rd%d" % i)
-            rd_addr  = Signal(ROW_BITS, name="rd_addr_%d" % i)
+        with m.If(r1.write_bram):
+            # Write store data to BRAM.  This happens one
+            # cycle after the store is in r0.
+            comb += wr_data.eq(r1.req.data)
+            comb += wr_sel.eq(r1.req.byte_sel)
+            comb += wr_addr.eq(self.get_row(r1.req.real_addr))
+
+        with m.Else():
+            # Otherwise, we might be doing a reload or a DCBZ
+            with m.If(r1.dcbz):
+                comb += wr_data.eq(0)
+            with m.Else():
+                comb += wr_data.eq(bus.dat_r)
+            comb += wr_addr.eq(r1.store_row)
+            comb += wr_sel.eq(~0) # all 1s
+
+        # set up Cache Rams
+        for i in range(self.NUM_WAYS):
             do_write = Signal(name="do_wr%d" % i)
-            wr_addr  = Signal(ROW_BITS, name="wr_addr_%d" % i)
-            wr_data  = Signal(WB_DATA_BITS, name="din_%d" % i)
-            wr_sel   = Signal(ROW_SIZE)
-            wr_sel_m = Signal(ROW_SIZE)
-            _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
+            wr_sel_m = Signal(self.ROW_SIZE, name="wr_sel_m_%d" % i)
+            d_out= Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
 
-            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
-            setattr(m.submodules, "cacheram_%d" % i, way)
+            way = CacheRam(self.ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
+            m.submodules["cacheram_%d" % i] = way
 
             comb += way.rd_en.eq(do_read)
             comb += way.rd_addr.eq(rd_addr)
-            comb += _d_out.eq(way.rd_data_o)
+            comb += d_out.eq(way.rd_data_o)
             comb += way.wr_sel.eq(wr_sel_m)
             comb += way.wr_addr.eq(wr_addr)
             comb += way.wr_data.eq(wr_data)
 
             # Cache hit reads
-            comb += do_read.eq(1)
-            comb += rd_addr.eq(early_req_row)
-            with m.If(r1.hit_way == i):
-                comb += cache_out_row.eq(_d_out)
-
-            # Write mux:
-            #
-            # Defaults to wishbone read responses (cache refill)
-            #
-            # For timing, the mux on wr_data/sel/addr is not
-            # dependent on anything other than the current state.
-
-            with m.If(r1.write_bram):
-                # Write store data to BRAM.  This happens one
-                # cycle after the store is in r0.
-                comb += wr_data.eq(r1.req.data)
-                comb += wr_sel.eq(r1.req.byte_sel)
-                comb += wr_addr.eq(get_row(r1.req.real_addr))
-
-                with m.If(i == r1.req.hit_way):
-                    comb += do_write.eq(1)
-            with m.Else():
-                # Otherwise, we might be doing a reload or a DCBZ
-                with m.If(r1.dcbz):
-                    comb += wr_data.eq(0)
-                with m.Else():
-                    comb += wr_data.eq(wb_in.dat)
-                comb += wr_addr.eq(r1.store_row)
-                comb += wr_sel.eq(~0) # all 1s
+            with m.If(hwe.o[i]):
+                comb += cache_out_row.eq(d_out)
 
-                with m.If((r1.state == State.RELOAD_WAIT_ACK)
-                          & wb_in.ack & (replace_way == i)):
-                    comb += do_write.eq(1)
+            # these are mutually-exclusive via their Decoder-enablers
+            # (note: Decoder-enable is inverted)
+            comb += do_write.eq(hre.o[i] | rwe.o[i])
 
             # Mask write selects with do_write since BRAM
             # doesn't have a global write-enable
@@ -1179,8 +1352,7 @@ class DCache(Elaboratable):
     # It also handles error cases (TLB miss, cache paradox)
     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                         req_hit_way, req_index, req_tag, access_ok,
-                        tlb_hit, tlb_hit_way, tlb_req_index):
-
+                        tlb_hit, tlb_req_index):
         comb = m.d.comb
         sync = m.d.sync
 
@@ -1197,15 +1369,9 @@ class DCache(Elaboratable):
         sync += r1.hit_way.eq(req_hit_way)
         sync += r1.hit_index.eq(req_index)
 
-        with m.If(req_op == Op.OP_LOAD_HIT):
-            sync += r1.hit_load_valid.eq(1)
-        with m.Else():
-            sync += r1.hit_load_valid.eq(0)
-
-        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
-            sync += r1.cache_hit.eq(1)
-        with m.Else():
-            sync += r1.cache_hit.eq(0)
+        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
+        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
+                                (req_op == Op.OP_STORE_HIT))
 
         with m.If(req_op == Op.OP_BAD):
             sync += Display("Signalling ld/st error "
@@ -1214,20 +1380,15 @@ class DCache(Elaboratable):
             sync += r1.ls_error.eq(~r0.mmu_req)
             sync += r1.mmu_error.eq(r0.mmu_req)
             sync += r1.cache_paradox.eq(access_ok)
-
         with m.Else():
             sync += r1.ls_error.eq(0)
             sync += r1.mmu_error.eq(0)
             sync += r1.cache_paradox.eq(0)
 
-        with m.If(req_op == Op.OP_STCX_FAIL):
-            sync += r1.stcx_fail.eq(1)
-        with m.Else():
-            sync += r1.stcx_fail.eq(0)
+        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
 
         # Record TLB hit information for updating TLB PLRU
         sync += r1.tlb_hit.eq(tlb_hit)
-        sync += r1.tlb_hit_way.eq(tlb_hit_way)
         sync += r1.tlb_hit_index.eq(tlb_req_index)
 
     # Memory accesses are handled by this state machine:
@@ -1239,23 +1400,27 @@ class DCache(Elaboratable):
     # All wishbone requests generation is done here.
     # This machine operates at stage 1.
     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
-                    cache_valids, r0, replace_way,
+                    r0, replace_way,
                     req_hit_way, req_same_tag,
-                    r0_valid, req_op, cache_tags, req_go, ra):
+                    r0_valid, req_op, cache_valids, req_go, ra):
 
         comb = m.d.comb
         sync = m.d.sync
-        wb_in = self.wb_in
+        bus = self.bus
         d_in = self.d_in
 
-        req         = MemAccessRequest("mreq_ds")
+        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
+                                                    granularity=self.TAG_WIDTH)
+
+        req         = MemAccessRequest(self, "mreq_ds")
 
-        req_row = Signal(ROW_BITS)
-        req_idx = Signal(INDEX_BITS)
-        req_tag = Signal(TAG_BITS)
-        comb += req_idx.eq(get_index(req.real_addr))
-        comb += req_row.eq(get_row(req.real_addr))
-        comb += req_tag.eq(get_tag(req.real_addr))
+        r1_next_cycle = Signal()
+        req_row = Signal(self.ROW_BITS)
+        req_idx = Signal(self.INDEX_BITS)
+        req_tag = Signal(self.TAG_BITS)
+        comb += req_idx.eq(self.get_index(req.real_addr))
+        comb += req_row.eq(self.get_row(req.real_addr))
+        comb += req_tag.eq(self.get_tag(req.real_addr))
 
         sync += r1.use_forward1.eq(use_forward1_next)
         sync += r1.forward_sel.eq(0)
@@ -1270,13 +1435,13 @@ class DCache(Elaboratable):
             sync += r1.forward_data1.eq(r1.req.data)
             sync += r1.forward_sel1.eq(r1.req.byte_sel)
             sync += r1.forward_way1.eq(r1.req.hit_way)
-            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
+            sync += r1.forward_row1.eq(self.get_row(r1.req.real_addr))
             sync += r1.forward_valid1.eq(1)
         with m.Else():
             with m.If(r1.dcbz):
                 sync += r1.forward_data1.eq(0)
             with m.Else():
-                sync += r1.forward_data1.eq(wb_in.dat)
+                sync += r1.forward_data1.eq(bus.dat_r)
             sync += r1.forward_sel1.eq(~0) # all 1s
             sync += r1.forward_way1.eq(replace_way)
             sync += r1.forward_row1.eq(r1.store_row)
@@ -1293,24 +1458,21 @@ class DCache(Elaboratable):
         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
 
         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
-            with m.If(~r0.mmu_req):
-                sync += r1.ls_valid.eq(1)
-            with m.Else():
+            with m.If(r0.mmu_req):
                 sync += r1.mmu_done.eq(1)
+            with m.Else():
+                sync += r1.ls_valid.eq(1)
 
         with m.If(r1.write_tag):
             # Store new tag in selected way
-            for i in range(NUM_WAYS):
-                with m.If(i == replace_way):
-                    ct = Signal(TAG_RAM_WIDTH)
-                    comb += ct.eq(cache_tags[r1.store_index])
-                    """
-TODO: check this
-cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
-                    (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
-                    """
-                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
-                    sync += cache_tags[r1.store_index].eq(ct)
+            replace_way_onehot = Signal(self.NUM_WAYS)
+            comb += replace_way_onehot.eq(1<<replace_way)
+            ct = Signal(self.TAG_RAM_WIDTH)
+            comb += ct.eq(r1.reload_tag << (replace_way*self.TAG_WIDTH))
+            comb += wr_tag.en.eq(replace_way_onehot)
+            comb += wr_tag.addr.eq(r1.store_index)
+            comb += wr_tag.data.eq(ct)
+
             sync += r1.store_way.eq(replace_way)
             sync += r1.write_tag.eq(0)
 
@@ -1351,12 +1513,15 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                       | (req_op == Op.OP_STORE_HIT)):
                 sync += r1.req.eq(req)
                 sync += r1.full.eq(1)
+                # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
+                # destroy r1.req by overwriting r1.full back to zero
+                comb += r1_next_cycle.eq(1)
 
         # Main state machine
         with m.Switch(r1.state):
 
             with m.Case(State.IDLE):
-                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
+                sync += r1.wb.adr.eq(req.real_addr[self.ROW_OFF_BITS:])
                 sync += r1.wb.sel.eq(req.byte_sel)
                 sync += r1.wb.dat.eq(req.data)
                 sync += r1.dcbz.eq(req.dcbz)
@@ -1365,16 +1530,19 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                 # for subsequent stores.
                 sync += r1.store_index.eq(req_idx)
                 sync += r1.store_row.eq(req_row)
-                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
+                sync += r1.end_row_ix.eq(self.get_row_of_line(req_row)-1)
                 sync += r1.reload_tag.eq(req_tag)
                 sync += r1.req.same_tag.eq(1)
 
                 with m.If(req.op == Op.OP_STORE_HIT):
                     sync += r1.store_way.eq(req.hit_way)
 
+                #with m.If(r1.dec_acks):
+                #    sync += r1.acks_pending.eq(r1.acks_pending - 1)
+
                 # Reset per-row valid bits,
                 # ready for handling OP_LOAD_MISS
-                for i in range(ROW_PER_LINE):
+                for i in range(self.ROW_PER_LINE):
                     sync += r1.rows_valid[i].eq(0)
 
                 with m.If(req_op != Op.OP_NONE):
@@ -1410,12 +1578,13 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                             sync += r1.state.eq(State.STORE_WAIT_ACK)
                             sync += r1.acks_pending.eq(1)
                             sync += r1.full.eq(0)
+                            comb += r1_next_cycle.eq(0)
                             sync += r1.slow_valid.eq(1)
 
-                            with m.If(~req.mmu_req):
-                                sync += r1.ls_valid.eq(1)
-                            with m.Else():
+                            with m.If(req.mmu_req):
                                 sync += r1.mmu_done.eq(1)
+                            with m.Else():
+                                sync += r1.ls_valid.eq(1)
 
                             with m.If(req.op == Op.OP_STORE_HIT):
                                 sync += r1.write_bram.eq(1)
@@ -1442,30 +1611,25 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                         pass
 
             with m.Case(State.RELOAD_WAIT_ACK):
-                ld_stbs_done = Signal()
-                # Requests are all sent if stb is 0
-                comb += ld_stbs_done.eq(~r1.wb.stb)
 
                 # If we are still sending requests, was one accepted?
-                with m.If((~wb_in.stall) & r1.wb.stb):
-                    # That was the last word?  We are done sending.
-                    # Clear stb and set ld_stbs_done so we can handle an
-                    # eventual last ack on the same cycle.
+                with m.If((~bus.stall) & r1.wb.stb):
+                    # That was the last word?  We are done sending.  Clear stb
                     # sigh - reconstruct wb adr with 3 extra 0s at front
-                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
-                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
+                    wb_adr = Cat(Const(0, self.ROW_OFF_BITS), r1.wb.adr)
+                    with m.If(self.is_last_row_addr(wb_adr, r1.end_row_ix)):
                         sync += r1.wb.stb.eq(0)
-                        comb += ld_stbs_done.eq(1)
 
                     # Calculate the next row address in the current cache line
-                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
+                    rlen = self.LINE_OFF_BITS-self.ROW_OFF_BITS
+                    row = Signal(rlen)
                     comb += row.eq(r1.wb.adr)
-                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
+                    sync += r1.wb.adr[:rlen].eq(row+1)
 
                 # Incoming acks processing
-                sync += r1.forward_valid1.eq(wb_in.ack)
-                with m.If(wb_in.ack):
-                    srow = Signal(ROW_LINE_BITS)
+                sync += r1.forward_valid1.eq(bus.ack)
+                with m.If(bus.ack):
+                    srow = Signal(self.ROW_LINE_BITS)
                     comb += srow.eq(r1.store_row)
                     sync += r1.rows_valid[srow].eq(1)
 
@@ -1474,27 +1638,31 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                     # Compare the whole address in case the
                     # request in r1.req is not the one that
                     # started this refill.
-                    with m.If(req.valid & r1.req.same_tag &
-                              ((r1.dcbz & r1.req.dcbz) |
-                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
-                                (r1.store_row == get_row(req.real_addr))):
-                        sync += r1.full.eq(0)
+                    rowmatch = Signal()
+                    lastrow = Signal()
+                    comb += rowmatch.eq(r1.store_row ==
+                                        self.get_row(r1.req.real_addr))
+                    comb += lastrow.eq(self.is_last_row(r1.store_row,
+                                                      r1.end_row_ix))
+                    with m.If(r1.full & r1.req.same_tag &
+                              ((r1.dcbz & req.dcbz) |
+                               (r1.req.op == Op.OP_LOAD_MISS)) & rowmatch):
+                        sync += r1.full.eq(r1_next_cycle)
                         sync += r1.slow_valid.eq(1)
-                        with m.If(~r1.mmu_req):
-                            sync += r1.ls_valid.eq(1)
-                        with m.Else():
+                        with m.If(r1.mmu_req):
                             sync += r1.mmu_done.eq(1)
+                        with m.Else():
+                            sync += r1.ls_valid.eq(1)
                         sync += r1.forward_sel.eq(~0) # all 1s
                         sync += r1.use_forward1.eq(1)
 
                     # Check for completion
-                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
-                                                      r1.end_row_ix)):
+                    with m.If(lastrow):
                         # Complete wishbone cycle
                         sync += r1.wb.cyc.eq(0)
 
                         # Cache line is now valid
-                        cv = Signal(INDEX_BITS)
+                        cv = Signal(self.INDEX_BITS)
                         comb += cv.eq(cache_valids[r1.store_index])
                         comb += cv.bit_select(r1.store_way, 1).eq(1)
                         sync += cache_valids[r1.store_index].eq(cv)
@@ -1505,45 +1673,48 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                                          cv, r1.store_index, r1.store_way)
 
                     # Increment store row counter
-                    sync += r1.store_row.eq(next_row(r1.store_row))
+                    sync += r1.store_row.eq(self.next_row(r1.store_row))
 
             with m.Case(State.STORE_WAIT_ACK):
                 st_stbs_done = Signal()
-                acks        = Signal(3)
                 adjust_acks = Signal(3)
 
                 comb += st_stbs_done.eq(~r1.wb.stb)
-                comb += acks.eq(r1.acks_pending)
 
                 with m.If(r1.inc_acks != r1.dec_acks):
                     with m.If(r1.inc_acks):
-                        comb += adjust_acks.eq(acks + 1)
+                        comb += adjust_acks.eq(r1.acks_pending + 1)
                     with m.Else():
-                        comb += adjust_acks.eq(acks - 1)
+                        comb += adjust_acks.eq(r1.acks_pending - 1)
                 with m.Else():
-                    comb += adjust_acks.eq(acks)
+                    comb += adjust_acks.eq(r1.acks_pending)
 
                 sync += r1.acks_pending.eq(adjust_acks)
 
                 # Clear stb when slave accepted request
-                with m.If(~wb_in.stall):
+                with m.If(~bus.stall):
                     # See if there is another store waiting
                     # to be done which is in the same real page.
+                    # (this is when same_tsg is true)
                     with m.If(req.valid):
-                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
-                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
+                        _ra = req.real_addr[self.ROW_OFF_BITS:
+                                            self.SET_SIZE_BITS]
+                        alen = self.SET_SIZE_BITS-self.ROW_OFF_BITS
+                        sync += r1.wb.adr[0:alen].eq(_ra)
                         sync += r1.wb.dat.eq(req.data)
                         sync += r1.wb.sel.eq(req.byte_sel)
 
                     with m.If((adjust_acks < 7) & req.same_tag &
-                                ((req.op == Op.OP_STORE_MISS)
-                                 (req.op == Op.OP_STORE_HIT))):
+                                ((req.op == Op.OP_STORE_MISS) |
+                                 (req.op == Op.OP_STORE_HIT))):
                         sync += r1.wb.stb.eq(1)
                         comb += st_stbs_done.eq(0)
+                        sync += r1.store_way.eq(req.hit_way)
+                        sync += r1.store_row.eq(self.get_row(req.real_addr))
 
                         with m.If(req.op == Op.OP_STORE_HIT):
                             sync += r1.write_bram.eq(1)
-                        sync += r1.full.eq(0)
+                        sync += r1.full.eq(r1_next_cycle)
                         sync += r1.slow_valid.eq(1)
 
                         # Store requests never come from the MMU
@@ -1555,7 +1726,9 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                         comb += st_stbs_done.eq(1)
 
                 # Got ack ? See if complete.
-                with m.If(wb_in.ack):
+                sync += Display("got ack %d %d stbs %d adjust_acks %d",
+                                bus.ack, bus.ack, st_stbs_done, adjust_acks)
+                with m.If(bus.ack):
                     with m.If(st_stbs_done & (adjust_acks == 1)):
                         sync += r1.state.eq(State.IDLE)
                         sync += r1.wb.cyc.eq(0)
@@ -1564,55 +1737,51 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
 
             with m.Case(State.NC_LOAD_WAIT_ACK):
                 # Clear stb when slave accepted request
-                with m.If(~wb_in.stall):
+                with m.If(~bus.stall):
                     sync += r1.wb.stb.eq(0)
 
                 # Got ack ? complete.
-                with m.If(wb_in.ack):
+                with m.If(bus.ack):
                     sync += r1.state.eq(State.IDLE)
-                    sync += r1.full.eq(0)
+                    sync += r1.full.eq(r1_next_cycle)
                     sync += r1.slow_valid.eq(1)
 
-                    with m.If(~r1.mmu_req):
-                        sync += r1.ls_valid.eq(1)
-                    with m.Else():
+                    with m.If(r1.mmu_req):
                         sync += r1.mmu_done.eq(1)
+                    with m.Else():
+                        sync += r1.ls_valid.eq(1)
 
                     sync += r1.forward_sel.eq(~0) # all 1s
                     sync += r1.use_forward1.eq(1)
                     sync += r1.wb.cyc.eq(0)
                     sync += r1.wb.stb.eq(0)
 
-    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
+    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
 
         sync = m.d.sync
-        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
+        d_out, bus, log_out = self.d_out, self.bus, self.log_out
 
-        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
+        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                                stall_out, req_op[:3], d_out.valid, d_out.error,
-                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
+                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                                r1.real_adr[3:6]))
 
     def elaborate(self, platform):
 
         m = Module()
-        comb = m.d.comb
-        d_in = self.d_in
+        comb, sync = m.d.comb, m.d.sync
+        m_in, d_in = self.m_in, self.d_in
 
         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
-        cache_tags       = CacheTagArray()
-        cache_tag_set    = Signal(TAG_RAM_WIDTH)
-        cache_valids = CacheValidBitsArray()
+        cache_valids     = self.CacheValidsArray()
+        cache_tag_set    = Signal(self.TAG_RAM_WIDTH)
 
-        # TODO attribute ram_style : string;
-        # TODO attribute ram_style of cache_tags : signal is "distributed";
+        self.tagmem = Memory(depth=self.NUM_LINES, width=self.TAG_RAM_WIDTH,
+                             attrs={'syn_ramstyle': "block_ram"})
 
         """note: these are passed to nmigen.hdl.Memory as "attributes".
            don't know how, just that they are.
         """
-        dtlb_valid_bits = TLBValidBitsArray()
-        dtlb_tags       = TLBTagsArray()
-        dtlb_ptes       = TLBPtesArray()
         # TODO attribute ram_style of
         #  dtlb_tags : signal is "distributed";
         # TODO attribute ram_style of
@@ -1621,21 +1790,21 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
         r0      = RegStage0("r0")
         r0_full = Signal()
 
-        r1 = RegStage1("r1")
+        r1 = RegStage1(self, "r1")
 
-        reservation = Reservation()
+        reservation = Reservation(self, "rsrv")
 
         # Async signals on incoming request
-        req_index    = Signal(INDEX_BITS)
-        req_row      = Signal(ROW_BITS)
-        req_hit_way  = Signal(WAY_BITS)
-        req_tag      = Signal(TAG_BITS)
+        req_index    = Signal(self.INDEX_BITS)
+        req_row      = Signal(self.ROW_BITS)
+        req_hit_way  = Signal(self.WAY_BITS)
+        req_tag      = Signal(self.TAG_BITS)
         req_op       = Signal(Op)
         req_data     = Signal(64)
         req_same_tag = Signal()
         req_go       = Signal()
 
-        early_req_row     = Signal(ROW_BITS)
+        early_req_row     = Signal(self.ROW_BITS)
 
         cancel_store      = Signal()
         set_rsrv          = Signal()
@@ -1649,28 +1818,25 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
 
         cache_out_row     = Signal(WB_DATA_BITS)
 
-        plru_victim       = PLRUOut()
-        replace_way       = Signal(WAY_BITS)
+        plru_victim       = Signal(self.WAY_BITS)
+        replace_way       = Signal(self.WAY_BITS)
 
         # Wishbone read/write/cache write formatting signals
         bus_sel           = Signal(8)
 
         # TLB signals
-        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
-        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
-        tlb_valid_way = Signal(TLB_NUM_WAYS)
-        tlb_req_index = Signal(TLB_SET_BITS)
-        tlb_hit       = Signal()
-        tlb_hit_way   = Signal(TLB_WAY_BITS)
-        pte           = Signal(TLB_PTE_BITS)
-        ra            = Signal(REAL_ADDR_BITS)
+        tlb_way       = self.TLBRecord("tlb_way")
+        tlb_req_index = Signal(self.TLB_SET_BITS)
+        tlb_hit       = self.TLBHit("tlb_hit")
+        pte           = Signal(self.TLB_PTE_BITS)
+        ra            = Signal(self.REAL_ADDR_BITS)
         valid_ra      = Signal()
         perm_attr     = PermAttr("dc_perms")
         rc_ok         = Signal()
         perm_ok       = Signal()
         access_ok     = Signal()
 
-        tlb_plru_victim = TLBPLRUOut()
+        tlb_plru_victim = Signal(self.TLB_WAY_BITS)
 
         # we don't yet handle collisions between loadstore1 requests
         # and MMU requests
@@ -1680,37 +1846,50 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
         comb += self.stall_out.eq(r0_stall)
-
-        # Wire up wishbone request latch out of stage 1
-        comb += self.wb_out.eq(r1.wb)
+        # debugging: detect if any stall ever requested, which is fine,
+        # but if a request comes in when stall requested, that's bad.
+        with m.If(r0_stall):
+            sync += self.any_stall_out.eq(1)
+            with m.If(d_in.valid):
+                sync += self.dreq_when_stall.eq(1)
+            with m.If(m_in.valid):
+                sync += self.mreq_when_stall.eq(1)
 
         # deal with litex not doing wishbone pipeline mode
         # XXX in wrong way.  FIFOs are needed in the SRAM test
-        # so that stb/ack match up
-        comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
+        # so that stb/ack match up. same thing done in icache.py
+        if not self.microwatt_compat or self.fabric_compat:
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
+        # Wire up wishbone request latch out of stage 1
+        comb += self.bus.we.eq(r1.wb.we)
+        comb += self.bus.adr.eq(r1.wb.adr)
+        comb += self.bus.sel.eq(r1.wb.sel)
+        comb += self.bus.stb.eq(r1.wb.stb)
+        comb += self.bus.dat_w.eq(r1.wb.dat)
+        comb += self.bus.cyc.eq(r1.wb.cyc)
+
+        # create submodule TLBUpdate
+        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate(self)
 
         # call sub-functions putting everything together, using shared
         # signals established above
         self.stage_0(m, r0, r1, r0_full)
-        self.tlb_read(m, r0_stall, tlb_valid_way,
-                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
-                      dtlb_tags, dtlb_ptes)
+        self.tlb_read(m, r0_stall, tlb_way)
         self.tlb_search(m, tlb_req_index, r0, r0_valid,
-                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
-                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
-        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
-                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
-                        dtlb_tags, tlb_pte_way, dtlb_ptes)
+                        tlb_way,
+                        pte, tlb_hit, valid_ra, perm_attr, ra)
+        self.tlb_update(m, r0_valid, r0, tlb_req_index,
+                        tlb_hit, tlb_plru_victim)
         self.maybe_plrus(m, r1, plru_victim)
-        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
-        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
+        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
+        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set)
         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valids, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
-                           tlb_pte_way,
-                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+                           tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall, early_req_row)
         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                            r0_valid, r0, reservation)
@@ -1720,12 +1899,12 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                         req_hit_way, req_index, req_tag, access_ok,
-                        tlb_hit, tlb_hit_way, tlb_req_index)
+                        tlb_hit, tlb_req_index)
         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
-                    cache_valids, r0, replace_way,
+                    r0, replace_way,
                     req_hit_way, req_same_tag,
-                         r0_valid, req_op, cache_tags, req_go, ra)
-        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
+                         r0_valid, req_op, cache_valids, req_go, ra)
+        #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
 
         return m
 
index 97f36a8f4529ab9515acea560cb33df73f0bbf51..8883a985f480ae8b2ea5a732ca1f4463a129665d 100644 (file)
@@ -39,19 +39,19 @@ class Driver(Elaboratable):
         # liveness counter
         live_cnt = Signal(5)
         # keep data and valid stable, until accepted
-        with m.If(Past(dut.p.valid_i) & ~Past(dut.p.ready_o)):
+        with m.If(Past(dut.p.i_valid) & ~Past(dut.p.o_ready)):
             comb += [
                 Assume(Stable(dut.op.sdir)),
                 Assume(Stable(dut.p.data_i.data)),
                 Assume(Stable(dut.p.data_i.shift)),
-                Assume(Stable(dut.p.valid_i)),
+                Assume(Stable(dut.p.i_valid)),
             ]
         # force reading the output in a reasonable time,
         # necessary to pass induction
-        with m.If(Past(dut.n.valid_o) & ~Past(dut.n.ready_i)):
-            comb += Assume(dut.n.ready_i)
+        with m.If(Past(dut.n.o_valid) & ~Past(dut.n.i_ready)):
+            comb += Assume(dut.n.i_ready)
         # capture transferred input data
-        with m.If(dut.p.ready_o & dut.p.valid_i):
+        with m.If(dut.p.o_ready & dut.p.i_valid):
             sync += [
                 data_i.eq(dut.p.data_i.data),
                 shift_i.eq(dut.p.data_i.shift),
@@ -71,18 +71,18 @@ class Driver(Elaboratable):
         # one work item ever in flight at any given time.
         # Whenever the unit is busy (not ready) the read and write counters
         # will differ by exactly one unit.
-        m.d.comb += Assert((read_cnt + ~dut.p.ready_o) & 0xF == write_cnt)
+        m.d.comb += Assert((read_cnt + ~dut.p.o_ready) & 0xF == write_cnt)
         # Check for liveness. It will ensure that the FSM is not stuck, and
         # will eventually produce some result.
-        # In this case, the delay between ready_o being negated and valid_o
+        # In this case, the delay between o_ready being negated and o_valid
         # being asserted has to be less than 16 cycles.
-        with m.If(~dut.p.ready_o & ~dut.n.valid_o):
+        with m.If(~dut.p.o_ready & ~dut.n.o_valid):
             m.d.sync += live_cnt.eq(live_cnt + 1)
         with m.Else():
             m.d.sync += live_cnt.eq(0)
         m.d.comb += Assert(live_cnt < 16)
         # check coverage as output data is accepted
-        with m.If(dut.n.ready_i & dut.n.valid_o):
+        with m.If(dut.n.i_ready & dut.n.o_valid):
             # increment read counter
             sync += read_cnt.eq(read_cnt + 1)
             # check result
@@ -123,9 +123,9 @@ class ALUFSMTestCase(FHDLTestCase):
         traces = [
             'clk',
             'p_data_i[7:0]', 'p_shift_i[7:0]', 'op__sdir',
-            'p_valid_i', 'p_ready_o',
+            'p_i_valid', 'p_o_ready',
             'n_data_o[7:0]',
-            'n_valid_o', 'n_ready_i',
+            'n_o_valid', 'n_i_ready',
             ('formal', {'module': 'top'}, [
                 'write_cnt[3:0]', 'read_cnt[3:0]', 'cov[7:0]'
             ])
diff --git a/src/soc/experiment/formal/proof_compalu_multi.py b/src/soc/experiment/formal/proof_compalu_multi.py
new file mode 100644 (file)
index 0000000..96b61a2
--- /dev/null
@@ -0,0 +1,211 @@
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Cesar Strauss <cestrauss@gmail.com>
+# Sponsored by NLnet under EU Grant and 957073
+# Part of the Libre-SOC Project.
+
+"""
+Formal proof of soc.experiment.compalu_multi.MultiCompUnit
+
+In short, MultiCompUnit:
+
+1) stores an opcode from Issue, when not "busy", and "issue" is pulsed
+2) signals "busy" high
+3) fetches its operand(s), if any (which are not masked or zero) from the
+Scoreboard (REL/GO protocol)
+4) starts the ALU (ready/valid protocol), as soon as all inputs are available
+5) captures result from ALU (again ready/valid)
+5) sends the result(s) back to the Scoreboard (again REL/GO)
+6) drops "busy"
+
+Note that, if the conditions are right, many of the above can occur together,
+on a single cycle.
+
+The formal proof involves ensuring that:
+1) the ALU gets the right opcode from Issue
+2) the ALU gets the right operands from the Scoreboard
+3) the Scoreboard receives the right result from the ALU
+4) no transactions are dropped or repeated
+
+This can be checked using holding registers and transaction counters.
+
+See https://bugs.libre-soc.org/show_bug.cgi?id=879 and
+https://bugs.libre-soc.org/show_bug.cgi?id=197
+"""
+
+import unittest
+
+from nmigen import Signal, Module
+from nmigen.hdl.ast import Cover, Const, Assume, Assert
+from nmutil.formaltest import FHDLTestCase
+from nmutil.singlepipe import ControlBase
+
+from soc.experiment.compalu_multi import MultiCompUnit
+from soc.fu.alu.alu_input_record import CompALUOpSubset
+
+
+# Formal model of a simple ALU, whose inputs and outputs are randomly
+# generated by the formal engine
+
+class ALUCtx:
+    def __init__(self):
+        self.op = CompALUOpSubset(name="op")
+
+
+class ALUInput:
+    def __init__(self):
+        self.a = Signal(16)
+        self.b = Signal(16)
+        self.ctx = ALUCtx()
+
+    def eq(self, i):
+        return [self.a.eq(i.a), self.b.eq(i.b)]
+
+
+class ALUOutput:
+    def __init__(self):
+        self.o1 = Signal(16)
+        self.o2 = Signal(16)
+
+    def eq(self, i):
+        return [self.o1.eq(i.o1), self.o2.eq(i.o2)]
+
+
+class ALU(ControlBase):
+    def __init__(self):
+        super().__init__(stage=self)
+        self.p.i_data, self.n.o_data = self.new_specs(None)
+        self.i, self.o = self.p.i_data, self.n.o_data
+
+    def setup(self, m, i):
+        pass
+
+    def ispec(self, name=None):
+        return ALUInput()
+
+    def ospec(self, name=None):
+        return ALUOutput()
+
+    def elaborate(self, platform):
+        m = super().elaborate(platform)
+        return m
+
+
+class CompALUMultiTestCase(FHDLTestCase):
+    def test_formal(self):
+        inspec = [('INT', 'a', '0:15'),
+                  ('INT', 'b', '0:15')]
+        outspec = [('INT', 'o1', '0:15'),
+                   ('INT', 'o2', '0:15')]
+        regspec = (inspec, outspec)
+        m = Module()
+        # Instantiate "random" ALU
+        alu = ALU()
+        m.submodules.dut = dut = MultiCompUnit(regspec, alu, CompALUOpSubset)
+        # TODO Test shadow / die
+        m.d.comb += [dut.shadown_i.eq(1), dut.go_die_i.eq(0)]
+        # Don't issue while busy
+        issue = Signal()
+        m.d.comb += dut.issue_i.eq(issue & ~dut.busy_o)
+        # Avoid toggling go_i when rel_o is low (rel / go protocol)
+        rd_go = Signal(dut.n_src)
+        m.d.comb += dut.cu.rd.go_i.eq(rd_go & dut.cu.rd.rel_o)
+        wr_go = Signal(dut.n_dst)
+        m.d.comb += dut.cu.wr.go_i.eq(wr_go & dut.cu.wr.rel_o)
+        # Transaction counters
+        do_issue = Signal()
+        m.d.comb += do_issue.eq(dut.issue_i & ~dut.busy_o)
+        cnt_issue = Signal(4)
+        m.d.sync += cnt_issue.eq(cnt_issue + do_issue)
+        do_read = Signal(dut.n_src)
+        m.d.comb += do_read.eq(dut.cu.rd.rel_o & dut.cu.rd.go_i)
+        cnt_read = []
+        for i in range(dut.n_src):
+            cnt = Signal(4, name="cnt_read_%d" % i)
+            m.d.sync += cnt.eq(cnt + do_read[i])
+            cnt_read.append(cnt)
+        do_write = Signal(dut.n_dst)
+        m.d.comb += do_write.eq(dut.cu.wr.rel_o & dut.cu.wr.go_i)
+        cnt_write = []
+        for i in range(dut.n_dst):
+            cnt = Signal(4, name="cnt_write_%d" % i)
+            m.d.sync += cnt.eq(cnt + do_write[i])
+            cnt_write.append(cnt)
+        do_alu_write = Signal()
+        m.d.comb += do_alu_write.eq(alu.p.i_valid & alu.p.o_ready)
+        cnt_alu_write = Signal(4)
+        m.d.sync += cnt_alu_write.eq(cnt_alu_write + do_alu_write)
+        do_alu_read = Signal()
+        m.d.comb += do_alu_read.eq(alu.n.o_valid & alu.n.i_ready)
+        cnt_alu_read = Signal(4)
+        m.d.sync += cnt_alu_read.eq(cnt_alu_read + do_alu_read)
+        cnt_masked_read = []
+        do_masked_read = Signal(dut.n_src)
+        for i in range(dut.n_src):
+            cnt = Signal(4, name="cnt_masked_read_%d" % i)
+            if i == 0:
+                extra = dut.oper_i.zero_a
+            elif i == 1:
+                extra = dut.oper_i.imm_data.ok
+            else:
+                extra = Const(0, 1)
+            m.d.comb += do_masked_read[i].eq(do_issue &
+                                             (dut.rdmaskn[i] | extra))
+            m.d.sync += cnt.eq(cnt + do_masked_read[i])
+            cnt_masked_read.append(cnt)
+        # If the ALU is idle, do not assert valid
+        with m.If((cnt_alu_read == cnt_alu_write) & ~do_alu_write):
+            m.d.comb += Assume(~alu.n.o_valid)
+        # Keep ALU valid high, until read
+        last_alu_valid = Signal()
+        m.d.sync += last_alu_valid.eq(alu.n.o_valid & ~alu.n.i_ready)
+        with m.If(last_alu_valid):
+            m.d.comb += Assume(alu.n.o_valid)
+
+        # Invariant checks
+
+        # For every instruction issued, at any point in time,
+        # each operand was either:
+        # 1) Already read
+        # 2) Not read yet, but the read is pending (rel_o high)
+        # 3) Masked
+        for i in range(dut.n_src):
+            sum_read = Signal(4)
+            m.d.comb += sum_read.eq(
+                cnt_read[i] + cnt_masked_read[i] + dut.cu.rd.rel_o[i])
+            m.d.comb += Assert(sum_read == cnt_issue)
+
+        # For every instruction, either:
+        # 1) The ALU is executing the instruction
+        # 2) Otherwise, execution is pending (alu.p.i_valid is high)
+        # 3) Otherwise, it is waiting for operands
+        #    (some dut.cu.rd.rel_o are still high)
+        # 4) ... unless all operands are masked, in which case there is a one
+        #    cycle delay
+        all_masked = Signal()
+        m.d.sync += all_masked.eq(do_masked_read.all())
+        sum_alu_write = Signal(4)
+        m.d.comb += sum_alu_write.eq(
+            cnt_alu_write +
+            (dut.cu.rd.rel_o.any() | all_masked | alu.p.i_valid))
+        m.d.comb += Assert(sum_alu_write == cnt_issue)
+
+        # Ask the formal engine to give an example
+        m.d.comb += Cover((cnt_issue == 2)
+                          & (cnt_read[0] == 1)
+                          & (cnt_read[1] == 0)
+                          & (cnt_write[0] == 1)
+                          & (cnt_write[1] == 1)
+                          & (cnt_alu_write == 1)
+                          & (cnt_alu_read == 1)
+                          & (cnt_masked_read[0] == 1)
+                          & (cnt_masked_read[1] == 1))
+        with self.subTest("cover"):
+            self.assertFormal(m, mode="cover", depth=10)
+
+        # Check assertions
+        with self.subTest("bmc"):
+            self.assertFormal(m, mode="bmc", depth=10)
+
+
+if __name__ == "__main__":
+    unittest.main()
index 1b8aa8586a761337cf5cb09359b807cd66576516..064f39b629e2388616a47be04726cd1c290b1853 100644 (file)
@@ -17,18 +17,28 @@ TODO (in no specific order):
   write TAG_BITS width which may not match full ram blocks and might
   cause muxes to be inferred for "partial writes".
 * Check if making the read size of PLRU a ROM helps utilization
+
+Links:
+
+* https://bugs.libre-soc.org/show_bug.cgi?id=485
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+  (discussion about brams for ECP5)
+
 """
 
 from enum import (Enum, unique)
-from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
+from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
+                    Record)
 from nmigen.cli import main, rtlil
 from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
+from nmigen.lib.coding import Decoder
 from nmutil.util import Display
+from nmutil.latch import SRLatch
 
 #from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
 from soc.experiment.cache_ram import CacheRam
-from soc.experiment.plru import PLRU
 
 from soc.experiment.mem_types import (Fetch1ToICacheType,
                                       ICacheToDecode1Type,
@@ -37,8 +47,11 @@ from soc.experiment.mem_types import (Fetch1ToICacheType,
 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                      WB_SEL_BITS, WBAddrType, WBDataType,
                                      WBSelType, WBMasterOut, WBSlaveOut,
-                                     WBMasterOutVector, WBSlaveOutVector,
-                                     WBIOMasterOut, WBIOSlaveOut)
+                                     )
+
+from nmigen_soc.wishbone.bus import Interface
+from soc.minerva.units.fetch import FetchUnitInterface
+
 
 # for test
 from soc.bus.sram import SRAM
@@ -50,225 +63,216 @@ from nmigen.cli import main, rtlil
 # Also, check out the cxxsim nmigen branch, and latest yosys from git
 from nmutil.sim_tmp_alternative import Simulator, Settle
 
+# from microwatt/utils.vhdl
+def ispow2(n):
+    return n != 0 and (n & (n - 1)) == 0
 
 SIM            = 0
-LINE_SIZE      = 64
-# BRAM organisation: We never access more than wishbone_data_bits
-# at a time so to save resources we make the array only that wide,
-# and use consecutive indices for to make a cache "line"
-#
-# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
-ROW_SIZE       = WB_DATA_BITS // 8
-# Number of lines in a set
-NUM_LINES      = 16
-# Number of ways
-NUM_WAYS       = 4
-# L1 ITLB number of entries (direct mapped)
-TLB_SIZE       = 64
-# L1 ITLB log_2(page_size)
-TLB_LG_PGSZ    = 12
-# Number of real address bits that we store
-REAL_ADDR_BITS = 56
 # Non-zero to enable log data collection
 LOG_LENGTH     = 0
 
-ROW_SIZE_BITS  = ROW_SIZE * 8
-# ROW_PER_LINE is the number of row (wishbone) transactions in a line
-ROW_PER_LINE   = LINE_SIZE // ROW_SIZE
-# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
-BRAM_ROWS      = NUM_LINES * ROW_PER_LINE
-# INSN_PER_ROW is the number of 32bit instructions per BRAM row
-INSN_PER_ROW   = ROW_SIZE_BITS // 32
-
-# Bit fields counts in the address
-#
-# INSN_BITS is the number of bits to select an instruction in a row
-INSN_BITS      = log2_int(INSN_PER_ROW)
-# ROW_BITS is the number of bits to select a row
-ROW_BITS       = log2_int(BRAM_ROWS)
-# ROW_LINE_BITS is the number of bits to select a row within a line
-ROW_LINE_BITS  = log2_int(ROW_PER_LINE)
-# LINE_OFF_BITS is the number of bits for the offset in a cache line
-LINE_OFF_BITS  = log2_int(LINE_SIZE)
-# ROW_OFF_BITS is the number of bits for the offset in a row
-ROW_OFF_BITS   = log2_int(ROW_SIZE)
-# INDEX_BITS is the number of bits to select a cache line
-INDEX_BITS     = log2_int(NUM_LINES)
-# SET_SIZE_BITS is the log base 2 of the set size
-SET_SIZE_BITS  = LINE_OFF_BITS + INDEX_BITS
-# TAG_BITS is the number of bits of the tag part of the address
-TAG_BITS       = REAL_ADDR_BITS - SET_SIZE_BITS
-# TAG_WIDTH is the width in bits of each way of the tag RAM
-TAG_WIDTH      = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
-
-# WAY_BITS is the number of bits to select a way
-WAY_BITS       = log2_int(NUM_WAYS)
-TAG_RAM_WIDTH  = TAG_BITS * NUM_WAYS
-
-# L1 ITLB
-TLB_BITS        = log2_int(TLB_SIZE)
-TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
-TLB_PTE_BITS    = 64
-
-print("BRAM_ROWS       =", BRAM_ROWS)
-print("INDEX_BITS      =", INDEX_BITS)
-print("INSN_BITS       =", INSN_BITS)
-print("INSN_PER_ROW    =", INSN_PER_ROW)
-print("LINE_SIZE       =", LINE_SIZE)
-print("LINE_OFF_BITS   =", LINE_OFF_BITS)
-print("LOG_LENGTH      =", LOG_LENGTH)
-print("NUM_LINES       =", NUM_LINES)
-print("NUM_WAYS        =", NUM_WAYS)
-print("REAL_ADDR_BITS  =", REAL_ADDR_BITS)
-print("ROW_BITS        =", ROW_BITS)
-print("ROW_OFF_BITS    =", ROW_OFF_BITS)
-print("ROW_LINE_BITS   =", ROW_LINE_BITS)
-print("ROW_PER_LINE    =", ROW_PER_LINE)
-print("ROW_SIZE        =", ROW_SIZE)
-print("ROW_SIZE_BITS   =", ROW_SIZE_BITS)
-print("SET_SIZE_BITS   =", SET_SIZE_BITS)
-print("SIM             =", SIM)
-print("TAG_BITS        =", TAG_BITS)
-print("TAG_RAM_WIDTH   =", TAG_RAM_WIDTH)
-print("TAG_BITS        =", TAG_BITS)
-print("TLB_BITS        =", TLB_BITS)
-print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
-print("TLB_LG_PGSZ     =", TLB_LG_PGSZ)
-print("TLB_PTE_BITS    =", TLB_PTE_BITS)
-print("TLB_SIZE        =", TLB_SIZE)
-print("WAY_BITS        =", WAY_BITS)
-
-# from microwatt/utils.vhdl
-def ispow2(n):
-    return n != 0 and (n & (n - 1)) == 0
-
-assert LINE_SIZE % ROW_SIZE == 0
-assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
-assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
-assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
-assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
-assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
-    "geometry bits don't add up"
-assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
-   "geometry bits don't add up"
-assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
-    "geometry bits don't add up"
-assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
-    "geometry bits don't add up"
-
-# Example of layout for 32 lines of 64 bytes:
-#
-# ..  tag    |index|  line  |
-# ..         |   row   |    |
-# ..         |     |   | |00| zero          (2)
-# ..         |     |   |-|  | INSN_BITS     (1)
-# ..         |     |---|    | ROW_LINE_BITS  (3)
-# ..         |     |--- - --| LINE_OFF_BITS (6)
-# ..         |         |- --| ROW_OFF_BITS  (3)
-# ..         |----- ---|    | ROW_BITS      (8)
-# ..         |-----|        | INDEX_BITS    (5)
-# .. --------|              | TAG_BITS      (53)
-
-# The cache data BRAM organized as described above for each way
-#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
-#
-# The cache tags LUTRAM has a row per set. Vivado is a pain and will
-# not handle a clean (commented) definition of the cache tags as a 3d
-# memory. For now, work around it by putting all the tags
-def CacheTagArray():
-    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" %x) \
-                 for x in range(NUM_LINES))
-
-# The cache valid bits
-def CacheValidBitsArray():
-    return Array(Signal(NUM_WAYS, name="cachevalid_%d" %x) \
-                 for x in range(NUM_LINES))
-
-def RowPerLineValidArray():
-    return Array(Signal(name="rows_valid_%d" %x) \
-                 for x in range(ROW_PER_LINE))
-
-
-# TODO to be passed to nigmen as ram attributes
-# attribute ram_style : string;
-# attribute ram_style of cache_tags : signal is "distributed";
-
-
-def TLBValidBitsArray():
-    return Array(Signal(name="tlbvalid_%d" %x) \
-                 for x in range(TLB_SIZE))
-
-def TLBTagArray():
-    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" %x) \
-                 for x in range(TLB_SIZE))
-
-def TLBPtesArray():
-    return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" %x) \
-                 for x in range(TLB_SIZE))
-
-# Cache RAM interface
-def CacheRamOut():
-    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
-                 for x in range(NUM_WAYS))
-
-# PLRU output interface
-def PLRUOut():
-    return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
-                 for x in range(NUM_LINES))
-
-# Return the cache line index (tag index) for an address
-def get_index(addr):
-    return addr[LINE_OFF_BITS:SET_SIZE_BITS]
-
-# Return the cache row index (data memory) for an address
-def get_row(addr):
-    return addr[ROW_OFF_BITS:SET_SIZE_BITS]
-
-# Return the index of a row within a line
-def get_row_of_line(row):
-    return row[:ROW_LINE_BITS]
-
-# Returns whether this is the last row of a line
-def is_last_row_addr(addr, last):
-    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
-
-# Returns whether this is the last row of a line
-def is_last_row(row, last):
-    return get_row_of_line(row) == last
-
-# Return the next row in the current cache line. We use a dedicated
-# function in order to limit the size of the generated adder to be
-# only the bits within a cache line (3 bits with default settings)
-def next_row(row):
-    row_v = row[0:ROW_LINE_BITS] + 1
-    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
-
-# Read the instruction word for the given address
-# in the current cache row
-def read_insn_word(addr, data):
-    word = addr[2:INSN_BITS+2]
-    return data.word_select(word, 32)
-
-# Get the tag value from the address
-def get_tag(addr):
-    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
-
-# Read a tag from a tag memory row
-def read_tag(way, tagset):
-    return tagset.word_select(way, TAG_BITS)
-
-# Write a tag to tag memory row
-def write_tag(way, tagset, tag):
-    return read_tag(way, tagset).eq(tag)
-
-# Simple hash for direct-mapped TLB index
-def hash_ea(addr):
-    hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
-           TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
-          ] ^ addr[
-           TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
-          ]
-    return hsh
+class ICacheConfig:
+    def __init__(self, XLEN          = 64,
+                       LINE_SIZE     = 64,
+                       NUM_LINES     = 64,  # Number of lines in a set
+                       NUM_WAYS      = 2,  # Number of ways
+                       TLB_SIZE      = 64,  # L1 ITLB number of entries
+                       TLB_LG_PGSZ   = 12): # L1 ITLB log_2(page_size)
+        self.XLEN           = XLEN
+        self.LINE_SIZE      = LINE_SIZE
+        self.NUM_LINES      = NUM_LINES
+        self.NUM_WAYS       = NUM_WAYS
+        self.TLB_SIZE       = TLB_SIZE
+        self.TLB_LG_PGSZ    = TLB_LG_PGSZ
+
+        # BRAM organisation: We never access more than wishbone_data_bits
+        # at a time so to save resources we make the array only that wide,
+        # and use consecutive indices for to make a cache "line"
+        #
+        # self.ROW_SIZE is the width in bytes of the BRAM
+        # (based on WB, so 64-bits)
+        self.ROW_SIZE       = WB_DATA_BITS // 8
+        # Number of real address bits that we store
+        self.REAL_ADDR_BITS = XLEN-8 # 56 for XLEN=64
+
+        self.ROW_SIZE_BITS  = self.ROW_SIZE * 8
+        # ROW_PER_LINE is the number of row (wishbone) transactions in a line
+        self.ROW_PER_LINE   = self.LINE_SIZE // self.ROW_SIZE
+        # BRAM_ROWS is the number of rows in BRAM
+        # needed to represent the full icache
+        self.BRAM_ROWS      = self.NUM_LINES * self.ROW_PER_LINE
+        # INSN_PER_ROW is the number of 32bit instructions per BRAM row
+        self.INSN_PER_ROW   = self.ROW_SIZE_BITS // 32
+
+        # Bit fields counts in the address
+        #
+        # INSN_BITS is the number of bits to select an instruction in a row
+        self.INSN_BITS      = log2_int(self.INSN_PER_ROW)
+        # ROW_BITS is the number of bits to select a row
+        self.ROW_BITS       = log2_int(self.BRAM_ROWS)
+        # ROW_LINE_BITS is the number of bits to select a row within a line
+        self.ROW_LINE_BITS  = log2_int(self.ROW_PER_LINE)
+        # LINE_OFF_BITS is the number of bits for the offset in a cache line
+        self.LINE_OFF_BITS  = log2_int(self.LINE_SIZE)
+        # ROW_OFF_BITS is the number of bits for the offset in a row
+        self.ROW_OFF_BITS   = log2_int(self.ROW_SIZE)
+        # INDEX_BITS is the number of bits to select a cache line
+        self.INDEX_BITS     = log2_int(self.NUM_LINES)
+        # SET_SIZE_BITS is the log base 2 of the set size
+        self.SET_SIZE_BITS  = self.LINE_OFF_BITS + self.INDEX_BITS
+        # TAG_BITS is the number of bits of the tag part of the address
+        self.TAG_BITS       = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
+        # TAG_WIDTH is the width in bits of each way of the tag RAM
+        self.TAG_WIDTH      = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
+
+        # WAY_BITS is the number of bits to select a way
+        self.WAY_BITS       = log2_int(self.NUM_WAYS)
+        self.TAG_RAM_WIDTH  = self.TAG_BITS * self.NUM_WAYS
+
+        # L1 ITLB
+        self.TL_BITS        = log2_int(self.TLB_SIZE)
+        self.TLB_EA_TAG_BITS = XLEN - (self.TLB_LG_PGSZ + self.TL_BITS)
+        self.TLB_PTE_BITS    = XLEN
+
+        print("self.XLEN            =", self.XLEN)
+        print("self.BRAM_ROWS       =", self.BRAM_ROWS)
+        print("self.INDEX_BITS      =", self.INDEX_BITS)
+        print("self.INSN_BITS       =", self.INSN_BITS)
+        print("self.INSN_PER_ROW    =", self.INSN_PER_ROW)
+        print("self.LINE_SIZE       =", self.LINE_SIZE)
+        print("self.LINE_OFF_BITS   =", self.LINE_OFF_BITS)
+        print("LOG_LENGTH      =", LOG_LENGTH)
+        print("self.NUM_LINES       =", self.NUM_LINES)
+        print("self.NUM_WAYS        =", self.NUM_WAYS)
+        print("self.REAL_ADDR_BITS  =", self.REAL_ADDR_BITS)
+        print("self.ROW_BITS        =", self.ROW_BITS)
+        print("self.ROW_OFF_BITS    =", self.ROW_OFF_BITS)
+        print("self.ROW_LINE_BITS   =", self.ROW_LINE_BITS)
+        print("self.ROW_PER_LINE    =", self.ROW_PER_LINE)
+        print("self.ROW_SIZE        =", self.ROW_SIZE)
+        print("self.ROW_SIZE_BITS   =", self.ROW_SIZE_BITS)
+        print("self.SET_SIZE_BITS   =", self.SET_SIZE_BITS)
+        print("SIM             =", SIM)
+        print("self.TAG_BITS        =", self.TAG_BITS)
+        print("self.TAG_RAM_WIDTH   =", self.TAG_RAM_WIDTH)
+        print("self.TAG_BITS        =", self.TAG_BITS)
+        print("self.TL_BITS        =", self.TL_BITS)
+        print("self.TLB_EA_TAG_BITS =", self.TLB_EA_TAG_BITS)
+        print("self.TLB_LG_PGSZ     =", self.TLB_LG_PGSZ)
+        print("self.TLB_PTE_BITS    =", self.TLB_PTE_BITS)
+        print("self.TLB_SIZE        =", self.TLB_SIZE)
+        print("self.WAY_BITS        =", self.WAY_BITS)
+        print()
+
+        assert self.LINE_SIZE % self.ROW_SIZE == 0
+        assert ispow2(self.LINE_SIZE), "self.LINE_SIZE not power of 2"
+        assert ispow2(self.NUM_LINES), "self.NUM_LINES not power of 2"
+        assert ispow2(self.ROW_PER_LINE), "self.ROW_PER_LINE not power of 2"
+        assert ispow2(self.INSN_PER_ROW), "self.INSN_PER_ROW not power of 2"
+        assert (self.ROW_BITS == (self.INDEX_BITS + self.ROW_LINE_BITS)), \
+            "geometry bits don't add up"
+        assert (self.LINE_OFF_BITS ==
+            (self.ROW_OFF_BITS + self.ROW_LINE_BITS)), \
+           "geometry bits don't add up"
+        assert (self.REAL_ADDR_BITS ==
+            (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS)), \
+            "geometry bits don't add up"
+        assert (self.REAL_ADDR_BITS ==
+            (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS)), \
+            "geometry bits don't add up"
+
+        # Example of layout for 32 lines of 64 bytes:
+        #
+        # ..  tag    |index|  line  |
+        # ..         |   row   |    |
+        # ..         |     |   | |00| zero          (2)
+        # ..         |     |   |-|  | self.INSN_BITS     (1)
+        # ..         |     |---|    | self.ROW_LINE_BITS  (3)
+        # ..         |     |--- - --| self.LINE_OFF_BITS (6)
+        # ..         |         |- --| self.ROW_OFF_BITS  (3)
+        # ..         |----- ---|    | self.ROW_BITS      (8)
+        # ..         |-----|        | self.INDEX_BITS    (5)
+        # .. --------|              | self.TAG_BITS      (53)
+
+    # The cache data BRAM organized as described above for each way
+    #subtype cache_row_t is std_ulogic_vector(self.ROW_SIZE_BITS-1 downto 0);
+    #
+    def RowPerLineValidArray(self):
+        return Array(Signal(name="rows_valid_%d" %x) \
+                     for x in range(self.ROW_PER_LINE))
+
+
+    # TODO to be passed to nigmen as ram attributes
+    # attribute ram_style : string;
+    # attribute ram_style of cache_tags : signal is "distributed";
+
+    def TLBRecord(self, name):
+        tlb_layout = [ ('tag', self.TLB_EA_TAG_BITS),
+                      ('pte', self.TLB_PTE_BITS)
+                     ]
+        return Record(tlb_layout, name=name)
+
+    def TLBArray(self):
+        return Array(self.TLBRecord("tlb%d" % x) for x in range(self.TLB_SIZE))
+
+    # PLRU output interface
+    def PLRUOut(self):
+        return Array(Signal(self.WAY_BITS, name="plru_out_%d" %x) \
+                     for x in range(self.NUM_LINES))
+
+    # Return the cache line index (tag index) for an address
+    def get_index(self, addr):
+        return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
+
+    # Return the cache row index (data memory) for an address
+    def get_row(self, addr):
+        return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
+
+    # Return the index of a row within a line
+    def get_row_of_line(self, row):
+        return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
+
+    # Returns whether this is the last row of a line
+    def is_last_row_addr(self, addr, last):
+        return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
+
+    # Returns whether this is the last row of a line
+    def is_last_row(self, row, last):
+        return self.get_row_of_line(row) == last
+
+    # Return the next row in the current cache line. We use a dedicated
+    # function in order to limit the size of the generated adder to be
+    # only the bits within a cache line (3 bits with default settings)
+    def next_row(self, row):
+        row_v = row[0:self.ROW_LINE_BITS] + 1
+        return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
+
+    # Read the instruction word for the given address
+    # in the current cache row
+    def read_insn_word(self, addr, data):
+        word = addr[2:self.INSN_BITS+2]
+        return data.word_select(word, 32)
+
+    # Get the tag value from the address
+    def get_tag(self, addr):
+        return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
+
+    # Read a tag from a tag memory row
+    def read_tag(self, way, tagset):
+        return tagset.word_select(way, self.TAG_BITS)
+
+    # Write a tag to tag memory row
+    def write_tag(self, way, tagset, tag):
+        return self.read_tag(way, tagset).eq(tag)
+
+    # Simple hash for direct-mapped TLB index
+    def hash_ea(self, addr):
+        hsh = (addr[self.TLB_LG_PGSZ:self.TLB_LG_PGSZ + self.TL_BITS] ^
+               addr[self.TLB_LG_PGSZ + self.TL_BITS:
+                    self.TLB_LG_PGSZ + 2 * self.TL_BITS ] ^
+               addr[self.TLB_LG_PGSZ + 2 * self.TL_BITS:
+                    self.TLB_LG_PGSZ + 3 * self.TL_BITS])
+        return hsh
 
 
 # Cache reload state machine
@@ -280,10 +284,10 @@ class State(Enum):
 
 
 class RegInternal(RecordObject):
-    def __init__(self):
+    def __init__(self, cfg):
         super().__init__()
         # Cache hit state (Latches for 1 cycle BRAM access)
-        self.hit_way      = Signal(NUM_WAYS)
+        self.hit_way      = Signal(cfg.WAY_BITS)
         self.hit_nia      = Signal(64)
         self.hit_smark    = Signal()
         self.hit_valid    = Signal()
@@ -292,21 +296,22 @@ class RegInternal(RecordObject):
         self.state        = Signal(State, reset=State.IDLE)
         self.wb           = WBMasterOut("wb")
         self.req_adr      = Signal(64)
-        self.store_way    = Signal(NUM_WAYS)
-        self.store_index  = Signal(NUM_LINES)
-        self.store_row    = Signal(BRAM_ROWS)
-        self.store_tag    = Signal(TAG_BITS)
+        self.store_way    = Signal(cfg.WAY_BITS)
+        self.store_index  = Signal(cfg.INDEX_BITS)
+        self.store_row    = Signal(cfg.ROW_BITS)
+        self.store_tag    = Signal(cfg.TAG_BITS)
         self.store_valid  = Signal()
-        self.end_row_ix   = Signal(ROW_LINE_BITS)
-        self.rows_valid   = RowPerLineValidArray()
+        self.end_row_ix   = Signal(cfg.ROW_LINE_BITS)
+        self.rows_valid   = cfg.RowPerLineValidArray()
 
         # TLB miss state
         self.fetch_failed = Signal()
 
 
-class ICache(Elaboratable):
+class ICache(FetchUnitInterface, Elaboratable, ICacheConfig):
     """64 bit direct mapped icache. All instructions are 4B aligned."""
-    def __init__(self):
+    def __init__(self, pspec):
+        FetchUnitInterface.__init__(self, pspec)
         self.i_in           = Fetch1ToICacheType(name="i_in")
         self.i_out          = ICacheToDecode1Type(name="i_out")
 
@@ -317,11 +322,52 @@ class ICache(Elaboratable):
         self.flush_in       = Signal()
         self.inval_in       = Signal()
 
-        self.wb_out         = WBMasterOut(name="wb_out")
-        self.wb_in          = WBSlaveOut(name="wb_in")
+        # standard naming (wired to non-standard for compatibility)
+        self.bus = Interface(addr_width=32,
+                            data_width=64,
+                            granularity=8,
+                            features={'stall'},
+                            #alignment=0,
+                            name="icache_wb")
 
         self.log_out        = Signal(54)
 
+        # use FetchUnitInterface, helps keep some unit tests running
+        self.use_fetch_iface = False
+
+        # test if small cache to be enabled
+        self.small_cache = (hasattr(pspec, "small_cache") and
+                                 (pspec.small_cache == True))
+        # test if microwatt compatibility to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+        # test if fabric compatibility is to be enabled
+        self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+                                 (pspec.fabric_compat == True))
+
+        XLEN = pspec.XLEN
+        LINE_SIZE = 64
+        TLB_SIZE = 8
+        NUM_LINES = 8
+        NUM_WAYS = 2
+        if self.small_cache:
+            # reduce way sizes and num lines to ridiculously small
+            NUM_LINES = 2
+            NUM_WAYS = 1
+            TLB_SIZE = 2
+        if self.microwatt_compat or self.fabric_compat:
+            # reduce way sizes
+            NUM_WAYS = 1
+
+        ICacheConfig.__init__(self, LINE_SIZE=LINE_SIZE,
+                                    XLEN=XLEN,
+                                    NUM_LINES = NUM_LINES,
+                                    NUM_WAYS = NUM_WAYS,
+                                    TLB_SIZE=TLB_SIZE
+                             )
+
+    def use_fetch_interface(self):
+        self.use_fetch_iface = True
 
     # Generate a cache RAM for each way
     def rams(self, m, r, cache_out_row, use_previous,
@@ -330,93 +376,100 @@ class ICache(Elaboratable):
         comb = m.d.comb
         sync = m.d.sync
 
-        wb_in, stall_in = self.wb_in, self.stall_in
+        bus, stall_in = self.bus, self.stall_in
+
+        # read condition (for every cache ram)
+        do_read  = Signal()
+        comb += do_read.eq(~(stall_in | use_previous))
+
+        rd_addr  = Signal(self.ROW_BITS)
+        wr_addr  = Signal(self.ROW_BITS)
+        comb += rd_addr.eq(req_row)
+        comb += wr_addr.eq(r.store_row)
 
-        for i in range(NUM_WAYS):
-            do_read  = Signal(name="do_rd_%d" % i)
+        # binary-to-unary converters: replace-way enabled by bus.ack,
+        # hit-way left permanently enabled
+        m.submodules.replace_way_e = re = Decoder(self.NUM_WAYS)
+        m.submodules.hit_way_e = he = Decoder(self.NUM_WAYS)
+        comb += re.i.eq(replace_way)
+        comb += re.n.eq(~bus.ack)
+        comb += he.i.eq(r.hit_way)
+
+        for i in range(self.NUM_WAYS):
             do_write = Signal(name="do_wr_%d" % i)
-            rd_addr  = Signal(ROW_BITS)
-            wr_addr  = Signal(ROW_BITS)
-            d_out    = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
-            wr_sel   = Signal(ROW_SIZE)
+            d_out    = Signal(self.ROW_SIZE_BITS, name="d_out_%d" % i)
+            wr_sel   = Signal(self.ROW_SIZE, name="wr_sel_%d" % i)
 
-            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
-            setattr(m.submodules, "cacheram_%d" % i, way)
+            way = CacheRam(self.ROW_BITS, self.ROW_SIZE_BITS,
+                           TRACE=True, ram_num=i)
+            m.submodules["cacheram_%d" % i] =  way
 
             comb += way.rd_en.eq(do_read)
             comb += way.rd_addr.eq(rd_addr)
             comb += d_out.eq(way.rd_data_o)
             comb += way.wr_sel.eq(wr_sel)
             comb += way.wr_addr.eq(wr_addr)
-            comb += way.wr_data.eq(wb_in.dat)
+            comb += way.wr_data.eq(bus.dat_r)
 
-            comb += do_read.eq(~(stall_in | use_previous))
-            comb += do_write.eq(wb_in.ack & (replace_way == i))
+            comb += do_write.eq(re.o[i])
 
             with m.If(do_write):
                 sync += Display("cache write adr: %x data: %lx",
                                 wr_addr, way.wr_data)
 
-            with m.If(r.hit_way == i):
+            with m.If(he.o[i]):
                 comb += cache_out_row.eq(d_out)
                 with m.If(do_read):
                     sync += Display("cache read adr: %x data: %x",
                                      req_row, d_out)
 
-            comb += rd_addr.eq(req_row)
-            comb += wr_addr.eq(r.store_row)
-            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
+            comb += wr_sel.eq(Repl(do_write, self.ROW_SIZE))
 
     # Generate PLRUs
     def maybe_plrus(self, m, r, plru_victim):
         comb = m.d.comb
 
-        with m.If(NUM_WAYS > 1):
-            for i in range(NUM_LINES):
-                plru_acc_i  = Signal(WAY_BITS)
-                plru_acc_en = Signal()
-                plru        = PLRU(WAY_BITS)
-                setattr(m.submodules, "plru_%d" % i, plru)
-
-                comb += plru.acc_i.eq(plru_acc_i)
-                comb += plru.acc_en.eq(plru_acc_en)
+        if self.NUM_WAYS == 0:
+            return
 
-                # PLRU interface
-                with m.If(get_index(r.hit_nia) == i):
-                    comb += plru.acc_en.eq(r.hit_valid)
 
-                comb += plru.acc_i.eq(r.hit_way)
-                comb += plru_victim[i].eq(plru.lru_o)
+        m.submodules.plrus = plru = PLRUs("itag", self.NUM_LINES,
+                                                  self.WAY_BITS)
+        comb += plru.way.eq(r.hit_way)
+        comb += plru.valid.eq(r.hit_valid)
+        comb += plru.index.eq(self.get_index(r.hit_nia))
+        comb += plru.isel.eq(r.store_index) # select victim
+        comb += plru_victim.eq(plru.o_index) # selected victim
 
     # TLB hit detection and real address generation
-    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
-                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
+    def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
+                    real_addr, ra_valid, eaa_priv,
                     priv_fault, access_ok):
 
         comb = m.d.comb
 
         i_in = self.i_in
 
-        pte  = Signal(TLB_PTE_BITS)
-        ttag = Signal(TLB_EA_TAG_BITS)
+        # use an *asynchronous* Memory read port here (combinatorial)
+        m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
+        tlb = self.TLBRecord("tlb_rdport")
+        pte, ttag = tlb.pte, tlb.tag
 
-        comb += tlb_req_index.eq(hash_ea(i_in.nia))
-        comb += pte.eq(itlb_ptes[tlb_req_index])
-        comb += ttag.eq(itlb_tags[tlb_req_index])
+        comb += tlb_req_index.eq(self.hash_ea(i_in.nia))
+        comb += rd_tlb.addr.eq(tlb_req_index)
+        comb += tlb.eq(rd_tlb.data)
 
         with m.If(i_in.virt_mode):
-            comb += real_addr.eq(Cat(
-                     i_in.nia[:TLB_LG_PGSZ],
-                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
-                    ))
+            comb += real_addr.eq(Cat(i_in.nia[:self.TLB_LG_PGSZ],
+                                     pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))
 
-            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
-                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
+            with m.If(ttag == i_in.nia[self.TLB_LG_PGSZ + self.TL_BITS:64]):
+                comb += ra_valid.eq(itlb_valid.q.bit_select(tlb_req_index, 1))
 
             comb += eaa_priv.eq(pte[3])
 
         with m.Else():
-            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
+            comb += real_addr.eq(i_in.nia[:self.REAL_ADDR_BITS])
             comb += ra_valid.eq(1)
             comb += eaa_priv.eq(1)
 
@@ -425,85 +478,101 @@ class ICache(Elaboratable):
         comb += access_ok.eq(ra_valid & ~priv_fault)
 
     # iTLB update
-    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
+    def itlb_update(self, m, itlb, itlb_valid):
         comb = m.d.comb
         sync = m.d.sync
 
         m_in = self.m_in
 
-        wr_index = Signal(TLB_SIZE)
-        comb += wr_index.eq(hash_ea(m_in.addr))
+        wr_index = Signal(self.TL_BITS)
+        wr_unary = Signal(self.TLB_SIZE)
+        comb += wr_index.eq(self.hash_ea(m_in.addr))
+        comb += wr_unary.eq(1<<wr_index)
+
+        m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()
+        sync += itlb_valid.s.eq(0)
+        sync += itlb_valid.r.eq(0)
 
         with m.If(m_in.tlbie & m_in.doall):
             # Clear all valid bits
-            for i in range(TLB_SIZE):
-                sync += itlb_valid_bits[i].eq(0)
+            sync += itlb_valid.r.eq(-1)
 
         with m.Elif(m_in.tlbie):
             # Clear entry regardless of hit or miss
-            sync += itlb_valid_bits[wr_index].eq(0)
+            sync += itlb_valid.r.eq(wr_unary)
 
         with m.Elif(m_in.tlbld):
-            sync += itlb_tags[wr_index].eq(
-                     m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
-                    )
-            sync += itlb_ptes[wr_index].eq(m_in.pte)
-            sync += itlb_valid_bits[wr_index].eq(1)
+            tlb = self.TLBRecord("tlb_wrport")
+            comb += tlb.tag.eq(m_in.addr[self.TLB_LG_PGSZ + self.TL_BITS:64])
+            comb += tlb.pte.eq(m_in.pte)
+            comb += wr_tlb.en.eq(1)
+            comb += wr_tlb.addr.eq(wr_index)
+            comb += wr_tlb.data.eq(tlb)
+            sync += itlb_valid.s.eq(wr_unary)
 
     # Cache hit detection, output to fetch2 and other misc logic
     def icache_comb(self, m, use_previous, r, req_index, req_row,
                     req_hit_way, req_tag, real_addr, req_laddr,
-                    cache_valid_bits, cache_tags, access_ok,
+                    cache_valids, access_ok,
                     req_is_hit, req_is_miss, replace_way,
                     plru_victim, cache_out_row):
 
         comb = m.d.comb
+        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")
 
-        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
+        i_in, i_out, bus = self.i_in, self.i_out, self.bus
         flush_in, stall_out = self.flush_in, self.stall_out
 
         is_hit  = Signal()
-        hit_way = Signal(NUM_WAYS)
+        hit_way = Signal(self.WAY_BITS)
 
         # i_in.sequential means that i_in.nia this cycle is 4 more than
         # last cycle.  If we read more than 32 bits at a time, had a
         # cache hit last cycle, and we don't want the first 32-bit chunk
         # then we can keep the data we read last cycle and just use that.
-        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
+        with m.If(i_in.nia[2:self.INSN_BITS+2] != 0):
             comb += use_previous.eq(i_in.sequential & r.hit_valid)
 
         # Extract line, row and tag from request
-        comb += req_index.eq(get_index(i_in.nia))
-        comb += req_row.eq(get_row(i_in.nia))
-        comb += req_tag.eq(get_tag(real_addr))
+        comb += req_index.eq(self.get_index(i_in.nia))
+        comb += req_row.eq(self.get_row(i_in.nia))
+        comb += req_tag.eq(self.get_tag(real_addr))
 
         # Calculate address of beginning of cache row, will be
         # used for cache miss processing if needed
         comb += req_laddr.eq(Cat(
-                 Const(0, ROW_OFF_BITS),
-                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
+                 Const(0, self.ROW_OFF_BITS),
+                 real_addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS],
                 ))
 
         # Test if pending request is a hit on any way
         hitcond = Signal()
-        comb += hitcond.eq((r.state == State.WAIT_ACK)
-                 & (req_index == r.store_index)
-                 & r.rows_valid[req_row % ROW_PER_LINE]
+        rowvalid = Signal()
+        comb += rowvalid.eq(r.rows_valid[req_row % self.ROW_PER_LINE])
+        comb += hitcond.eq((r.state == State.WAIT_ACK) &
+                            (req_index == r.store_index) &
+                             rowvalid
                 )
-        with m.If(i_in.req):
-            cvb = Signal(NUM_WAYS)
-            ctag = Signal(TAG_RAM_WIDTH)
-            comb += ctag.eq(cache_tags[req_index])
-            comb += cvb.eq(cache_valid_bits[req_index])
-            for i in range(NUM_WAYS):
-                tagi = Signal(TAG_BITS, name="tag_i%d" % i)
-                comb += tagi.eq(read_tag(i, ctag))
-                hit_test = Signal(name="hit_test%d" % i)
-                comb += hit_test.eq(i == r.store_way)
-                with m.If((cvb[i] | (hitcond & hit_test))
-                          & (tagi == req_tag)):
-                    comb += hit_way.eq(i)
-                    comb += is_hit.eq(1)
+        # i_in.req asserts Decoder active
+        cvb = Signal(self.NUM_WAYS)
+        ctag = Signal(self.TAG_RAM_WIDTH)
+        comb += rd_tag.addr.eq(req_index)
+        comb += ctag.eq(rd_tag.data)
+        comb += cvb.eq(cache_valids.q.word_select(req_index, self.NUM_WAYS))
+        m.submodules.store_way_e = se = Decoder(self.NUM_WAYS)
+        comb += se.i.eq(r.store_way)
+        comb += se.n.eq(~i_in.req)
+        for i in range(self.NUM_WAYS):
+            tagi = Signal(self.TAG_BITS, name="tag_i%d" % i)
+            hit_test = Signal(name="hit_test%d" % i)
+            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
+            comb += tagi.eq(self.read_tag(i, ctag))
+            comb += hit_test.eq(se.o[i])
+            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
+                                  (tagi == req_tag))
+            with m.If(is_tag_hit):
+                comb += hit_way.eq(i)
+                comb += is_hit.eq(1)
 
         # Generate the "hit" and "miss" signals
         # for the synchronous blocks
@@ -511,15 +580,11 @@ class ICache(Elaboratable):
             comb += req_is_hit.eq(is_hit)
             comb += req_is_miss.eq(~is_hit)
 
-        with m.Else():
-            comb += req_is_hit.eq(0)
-            comb += req_is_miss.eq(0)
-
         comb += req_hit_way.eq(hit_way)
 
         # The way to replace on a miss
         with m.If(r.state == State.CLR_TAG):
-            comb += replace_way.eq(plru_victim[r.store_index])
+            comb += replace_way.eq(plru_victim)
         with m.Else():
             comb += replace_way.eq(r.store_way)
 
@@ -531,7 +596,7 @@ class ICache(Elaboratable):
         # be output an entire row which I prefer not to do just yet
         # as it would force fetch2 to know about some of the cache
         # geometry information.
-        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
+        comb += i_out.insn.eq(self.read_insn_word(r.hit_nia, cache_out_row))
         comb += i_out.valid.eq(r.hit_valid)
         comb += i_out.nia.eq(r.hit_nia)
         comb += i_out.stop_mark.eq(r.hit_smark)
@@ -542,7 +607,12 @@ class ICache(Elaboratable):
         comb += stall_out.eq(~(is_hit & access_ok))
 
         # Wishbone requests output (from the cache miss reload machine)
-        comb += wb_out.eq(r.wb)
+        comb += bus.we.eq(r.wb.we)
+        comb += bus.adr.eq(r.wb.adr)
+        comb += bus.sel.eq(r.wb.sel)
+        comb += bus.stb.eq(r.wb.stb)
+        comb += bus.dat_w.eq(r.wb.dat)
+        comb += bus.cyc.eq(r.wb.cyc)
 
     # Cache hit synchronous machine
     def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
@@ -567,14 +637,10 @@ class ICache(Elaboratable):
 
             with m.If(req_is_hit):
                 sync += r.hit_way.eq(req_hit_way)
-                sync += Display(
-                         "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x " \
-                         "way:%x RA:%x", i_in.nia, i_in.virt_mode, \
-                         i_in.stop_mark, req_index, req_tag, \
-                         req_hit_way, real_addr
-                        )
-
-
+                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
+                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
+                                 i_in.stop_mark, req_index, req_tag,
+                                 req_hit_way, real_addr)
 
         with m.If(~stall_in):
             # Send stop marks and NIA down regardless of validity
@@ -589,7 +655,7 @@ class ICache(Elaboratable):
         i_in = self.i_in
 
         # Reset per-row valid flags, only used in WAIT_ACK
-        for i in range(ROW_PER_LINE):
+        for i in range(self.ROW_PER_LINE):
             sync += r.rows_valid[i].eq(0)
 
         # We need to read a cache line
@@ -598,17 +664,16 @@ class ICache(Elaboratable):
                      "cache miss nia:%x IR:%x SM:%x idx:%x "
                      " way:%x tag:%x RA:%x", i_in.nia,
                      i_in.virt_mode, i_in.stop_mark, req_index,
-                     replace_way, req_tag, real_addr
-                    )
+                     replace_way, req_tag, real_addr)
 
             # Keep track of our index and way for subsequent stores
-            st_row = Signal(BRAM_ROWS)
-            comb += st_row.eq(get_row(req_laddr))
+            st_row = Signal(self.ROW_BITS)
+            comb += st_row.eq(self.get_row(req_laddr))
             sync += r.store_index.eq(req_index)
             sync += r.store_row.eq(st_row)
             sync += r.store_tag.eq(req_tag)
             sync += r.store_valid.eq(1)
-            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
+            sync += r.end_row_ix.eq(self.get_row_of_line(st_row) - 1)
 
             # Prep for first wishbone read.  We calculate the address
             # of the start of the cache line and start the WB cycle.
@@ -620,144 +685,113 @@ class ICache(Elaboratable):
             sync += r.state.eq(State.CLR_TAG)
 
     def icache_miss_clr_tag(self, m, r, replace_way,
-                            cache_valid_bits, req_index,
-                            tagset, cache_tags):
-
+                            req_index,
+                            cache_valids):
         comb = m.d.comb
         sync = m.d.sync
+        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
+                                                    granularity=self.TAG_BITS)
 
         # Get victim way from plru
         sync += r.store_way.eq(replace_way)
+
         # Force misses on that way while reloading that line
-        cv = Signal(INDEX_BITS)
-        comb += cv.eq(cache_valid_bits[req_index])
-        comb += cv.bit_select(replace_way, 1).eq(0)
-        sync += cache_valid_bits[req_index].eq(cv)
+        idx = req_index*self.NUM_WAYS + replace_way # 2D index, 1st dim: self.NUM_WAYS
+        comb += cache_valids.r.eq(1<<idx)
 
-        for i in range(NUM_WAYS):
-            with m.If(i == replace_way):
-                comb += tagset.eq(cache_tags[r.store_index])
-                comb += write_tag(i, tagset, r.store_tag)
-                sync += cache_tags[r.store_index].eq(tagset)
+        # use write-port "granularity" to select the tag to write to
+        # TODO: the Memory should be multipled-up (by NUM_TAGS)
+        tagset = Signal(self.TAG_RAM_WIDTH)
+        comb += tagset.eq(r.store_tag << (replace_way*self.TAG_BITS))
+        comb += wr_tag.en.eq(1<<replace_way)
+        comb += wr_tag.addr.eq(r.store_index)
+        comb += wr_tag.data.eq(tagset)
 
         sync += r.state.eq(State.WAIT_ACK)
 
     def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
-                             stbs_done, cache_valid_bits):
+                             cache_valids):
         comb = m.d.comb
         sync = m.d.sync
 
-        wb_in = self.wb_in
-
-        # Requests are all sent if stb is 0
-        stbs_zero = Signal()
-        comb += stbs_zero.eq(r.wb.stb == 0)
-        comb += stbs_done.eq(stbs_zero)
+        bus = self.bus
 
         # If we are still sending requests, was one accepted?
-        with m.If(~wb_in.stall & ~stbs_zero):
-            # That was the last word? We are done sending.
-            # Clear stb and set stbs_done so we can handle
-            # an eventual last ack on the same cycle.
-            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
-                sync += Display(
-                         "IS_LAST_ROW_ADDR r.wb.addr:%x " \
-                         "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x " \
-                         "stbs_done:%x", r.wb.adr, r.end_row_ix,
-                         r.wb.stb, stbs_zero, stbs_done
-                        )
+        with m.If(~bus.stall & r.wb.stb):
+            # That was the last word? We are done sending.  Clear stb
+            with m.If(self.is_last_row_addr(r.req_adr, r.end_row_ix)):
+                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
+                         "r.end_row_ix:%x r.wb.stb:%x",
+                         r.wb.adr, r.end_row_ix, r.wb.stb)
                 sync += r.wb.stb.eq(0)
-                comb += stbs_done.eq(1)
 
             # Calculate the next row address
-            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
-            comb += rarange.eq(
-                     r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
-                    )
-            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
-                     rarange
-                    )
+            rarange = Signal(self.LINE_OFF_BITS - self.ROW_OFF_BITS)
+            comb += rarange.eq(r.req_adr[self.ROW_OFF_BITS:
+                                         self.LINE_OFF_BITS] + 1)
+            sync += r.req_adr[self.ROW_OFF_BITS:self.LINE_OFF_BITS].eq(rarange)
             sync += Display("RARANGE r.req_adr:%x rarange:%x "
-                            "stbs_zero:%x stbs_done:%x",
-                            r.req_adr, rarange, stbs_zero, stbs_done)
+                            "r.wb.stb:%x",
+                            r.req_adr, rarange, r.wb.stb)
 
         # Incoming acks processing
-        with m.If(wb_in.ack):
-            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
-                            "stbs_done:%x",
-                            wb_in.dat, stbs_zero, stbs_done)
+        with m.If(bus.ack):
+            sync += Display("WB_IN_ACK data:%x", bus.dat_r)
 
-            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
+            sync += r.rows_valid[r.store_row % self.ROW_PER_LINE].eq(1)
 
             # Check for completion
-            with m.If(stbs_done &
-                      is_last_row(r.store_row, r.end_row_ix)):
+            with m.If(self.is_last_row(r.store_row, r.end_row_ix)):
                 # Complete wishbone cycle
                 sync += r.wb.cyc.eq(0)
                 # be nice, clear addr
                 sync += r.req_adr.eq(0)
 
                 # Cache line is now valid
-                cv = Signal(INDEX_BITS)
-                comb += cv.eq(cache_valid_bits[r.store_index])
-                comb += cv.bit_select(replace_way, 1).eq(
-                         r.store_valid & ~inval_in
-                        )
-                sync += cache_valid_bits[r.store_index].eq(cv)
-
+                idx = r.store_index*self.NUM_WAYS + replace_way # 2D index again
+                valid = r.store_valid & ~inval_in
+                comb += cache_valids.s.eq(1<<idx)
                 sync += r.state.eq(State.IDLE)
 
-            # not completed, move on to next request in row
-            with m.Else():
-                # Increment store row counter
-                sync += r.store_row.eq(next_row(r.store_row))
-
+            # move on to next request in row
+            # Increment store row counter
+            sync += r.store_row.eq(self.next_row(r.store_row))
 
     # Cache miss/reload synchronous machine
-    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
+    def icache_miss(self, m, r, req_is_miss,
                     req_index, req_laddr, req_tag, replace_way,
-                    cache_tags, access_ok, real_addr):
+                    cache_valids, access_ok, real_addr):
         comb = m.d.comb
         sync = m.d.sync
 
-        i_in, wb_in, m_in  = self.i_in, self.wb_in, self.m_in
+        i_in, bus, m_in  = self.i_in, self.bus, self.m_in
         stall_in, flush_in = self.stall_in, self.flush_in
         inval_in           = self.inval_in
 
-        tagset    = Signal(TAG_RAM_WIDTH)
-        stbs_done = Signal()
-
         comb += r.wb.sel.eq(-1)
         comb += r.wb.adr.eq(r.req_adr[3:])
 
         # Process cache invalidations
         with m.If(inval_in):
-            for i in range(NUM_LINES):
-                sync += cache_valid_bits[i].eq(0)
+            comb += cache_valids.r.eq(-1)
             sync += r.store_valid.eq(0)
 
         # Main state machine
         with m.Switch(r.state):
 
             with m.Case(State.IDLE):
-                self.icache_miss_idle(
-                    m, r, req_is_miss, req_laddr,
-                    req_index, req_tag, replace_way,
-                    real_addr
-                )
+                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
+                                      req_index, req_tag, replace_way,
+                                      real_addr)
 
             with m.Case(State.CLR_TAG, State.WAIT_ACK):
                 with m.If(r.state == State.CLR_TAG):
-                    self.icache_miss_clr_tag(
-                        m, r, replace_way,
-                        cache_valid_bits, req_index,
-                        tagset, cache_tags
-                    )
-
-                self.icache_miss_wait_ack(
-                    m, r, replace_way, inval_in,
-                    stbs_done, cache_valid_bits
-                )
+                    self.icache_miss_clr_tag(m, r, replace_way,
+                                             req_index,
+                                             cache_valids)
+
+                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
+                                          cache_valids)
 
         # TLB miss and protection fault processing
         with m.If(flush_in | m_in.tlbld):
@@ -771,13 +805,13 @@ class ICache(Elaboratable):
         comb = m.d.comb
         sync = m.d.sync
 
-        wb_in, i_out       = self.wb_in, self.i_out
+        bus, i_out       = self.bus, self.i_out
         log_out, stall_out = self.log_out, self.stall_out
 
         # Output data to logger
         for i in range(LOG_LENGTH):
             log_data = Signal(54)
-            lway     = Signal(NUM_WAYS)
+            lway     = Signal(self.WAY_BITS)
             wstate   = Signal()
 
             sync += lway.eq(req_hit_way)
@@ -789,8 +823,8 @@ class ICache(Elaboratable):
             sync += log_data.eq(Cat(
                      ra_valid, access_ok, req_is_miss, req_is_hit,
                      lway, wstate, r.hit_nia[2:6], r.fetch_failed,
-                     stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
-                     r.real_addr[3:6], wb_in.ack, i_out.insn, i_out.valid
+                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
+                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                     ))
             comb += log_out.eq(log_data)
 
@@ -799,13 +833,17 @@ class ICache(Elaboratable):
         m                = Module()
         comb             = m.d.comb
 
-        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
-        cache_tags       = CacheTagArray()
-        cache_valid_bits = CacheValidBitsArray()
+        # Cache-Ways "valid" indicators.  this is a 2D Signal, by the
+        # number of ways and the number of lines.
+        vec = SRLatch(sync=True, llen=self.NUM_WAYS*self.NUM_LINES,
+                      name="cachevalids")
+        m.submodules.cache_valids = cache_valids = vec
+
+        # TLB Array
+        itlb            = self.TLBArray()
+        vec = SRLatch(sync=False, llen=self.TLB_SIZE, name="tlbvalids")
+        m.submodules.itlb_valids = itlb_valid = vec
 
-        itlb_valid_bits  = TLBValidBitsArray()
-        itlb_tags        = TLBTagArray()
-        itlb_ptes        = TLBPtesArray()
         # TODO to be passed to nmigen as ram attributes
         # attribute ram_style of itlb_tags : signal is "distributed";
         # attribute ram_style of itlb_ptes : signal is "distributed";
@@ -813,62 +851,106 @@ class ICache(Elaboratable):
         # Privilege bit from PTE EAA field
         eaa_priv         = Signal()
 
-        r                = RegInternal()
+        r                = RegInternal(self)
 
         # Async signal on incoming request
-        req_index        = Signal(NUM_LINES)
-        req_row          = Signal(BRAM_ROWS)
-        req_hit_way      = Signal(NUM_WAYS)
-        req_tag          = Signal(TAG_BITS)
+        req_index        = Signal(self.INDEX_BITS)
+        req_row          = Signal(self.ROW_BITS)
+        req_hit_way      = Signal(self.WAY_BITS)
+        req_tag          = Signal(self.TAG_BITS)
         req_is_hit       = Signal()
         req_is_miss      = Signal()
         req_laddr        = Signal(64)
 
-        tlb_req_index    = Signal(TLB_SIZE)
-        real_addr        = Signal(REAL_ADDR_BITS)
+        tlb_req_index    = Signal(self.TL_BITS)
+        real_addr        = Signal(self.REAL_ADDR_BITS)
         ra_valid         = Signal()
         priv_fault       = Signal()
         access_ok        = Signal()
         use_previous     = Signal()
 
-        cache_out_row    = Signal(ROW_SIZE_BITS)
+        cache_out_row    = Signal(self.ROW_SIZE_BITS)
+
+        plru_victim      = Signal(self.WAY_BITS)
+        replace_way      = Signal(self.WAY_BITS)
 
-        plru_victim      = PLRUOut()
-        replace_way      = Signal(NUM_WAYS)
+        self.tlbmem = Memory(depth=self.TLB_SIZE,
+                             width=self.TLB_EA_TAG_BITS+self.TLB_PTE_BITS,
+                             #attrs={'syn_ramstyle': "block_ram"}
+                            )
+        self.tagmem = Memory(depth=self.NUM_LINES,
+                             width=self.TAG_RAM_WIDTH,
+                             #attrs={'syn_ramstyle': "block_ram"}
+                            )
 
         # call sub-functions putting everything together,
         # using shared signals established above
         self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
         self.maybe_plrus(m, r, plru_victim)
-        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
-                         itlb_valid_bits, ra_valid, eaa_priv, priv_fault,
+        self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
+                         ra_valid, eaa_priv, priv_fault,
                          access_ok)
-        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
+        self.itlb_update(m, itlb, itlb_valid)
         self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
-                         req_tag, real_addr, req_laddr, cache_valid_bits,
-                         cache_tags, access_ok, req_is_hit, req_is_miss,
+                         req_tag, real_addr, req_laddr,
+                         cache_valids,
+                         access_ok, req_is_hit, req_is_miss,
                          replace_way, plru_victim, cache_out_row)
         self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                         req_index, req_tag, real_addr)
-        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
-                         req_laddr, req_tag, replace_way, cache_tags,
+        self.icache_miss(m, r, req_is_miss, req_index,
+                         req_laddr, req_tag, replace_way,
+                         cache_valids,
                          access_ok, real_addr)
         #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
         #                req_is_miss, req_is_hit, lway, wstate, r)
 
+        # don't connect up to FetchUnitInterface so that some unit tests
+        # can continue to operate
+        if not self.use_fetch_iface:
+            return m
+
+        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
+        # so needs checking and iterative revising
+        i_in, bus, i_out = self.i_in, self.bus, self.i_out
+        comb += i_in.req.eq(self.a_i_valid)
+        comb += i_in.nia.eq(self.a_pc_i)
+        comb += self.stall_in.eq(self.a_stall_i)
+        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
+        comb += self.f_badaddr_o.eq(i_out.nia)
+        comb += self.f_instr_o.eq(i_out.insn)
+        comb += self.f_busy_o.eq(~i_out.valid) # probably
+
+        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
+        ibus = self.ibus
+        comb += ibus.adr.eq(self.bus.adr)
+        comb += ibus.dat_w.eq(self.bus.dat_w)
+        comb += ibus.sel.eq(self.bus.sel)
+        comb += ibus.cyc.eq(self.bus.cyc)
+        comb += ibus.stb.eq(self.bus.stb)
+        comb += ibus.we.eq(self.bus.we)
+
+        comb += self.bus.dat_r.eq(ibus.dat_r)
+        comb += self.bus.ack.eq(ibus.ack)
+        if hasattr(ibus, "stall"):
+            comb += self.bus.stall.eq(ibus.stall)
+        else:
+            # fake-up the wishbone stall signal to comply with pipeline mode
+            # same thing is done in dcache.py
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
         return m
 
 
 def icache_sim(dut):
-    i_out = dut.i_in
-    i_in  = dut.i_out
+    i_in = dut.i_in
+    i_out  = dut.i_out
     m_out = dut.m_in
 
-    yield i_in.valid.eq(0)
-    yield i_out.priv_mode.eq(1)
-    yield i_out.req.eq(0)
-    yield i_out.nia.eq(0)
-    yield i_out.stop_mark.eq(0)
+    yield i_in.priv_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(0)
+    yield i_in.stop_mark.eq(0)
     yield m_out.tlbld.eq(0)
     yield m_out.tlbie.eq(0)
     yield m_out.addr.eq(0)
@@ -877,107 +959,126 @@ def icache_sim(dut):
     yield
     yield
     yield
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000004, 64))
-    for i in range(30):
-        yield
+
+    # miss, stalls for a bit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000004, 64))
     yield
-    valid = yield i_in.valid
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    insn  = yield i_out.insn
     nia   = yield i_out.nia
-    insn  = yield i_in.insn
-    print(f"valid? {valid}")
-    assert valid
     assert insn == 0x00000001, \
         "insn @%x=%x expected 00000001" % (nia, insn)
-    yield i_out.req.eq(0)
+    yield i_in.req.eq(0)
     yield
 
     # hit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000008, 64))
     yield
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
     yield
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000008, 64))
-    yield
-    yield
-    valid = yield i_in.valid
-    nia   = yield i_in.nia
-    insn  = yield i_in.insn
-    assert valid
     assert insn == 0x00000002, \
         "insn @%x=%x expected 00000002" % (nia, insn)
-    yield
 
     # another miss
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000040, 64))
-    for i in range(30):
-        yield
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000040, 64))
     yield
-    valid = yield i_in.valid
-    nia   = yield i_out.nia
-    insn  = yield i_in.insn
-    assert valid
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    nia   = yield i_in.nia
+    insn  = yield i_out.insn
     assert insn == 0x00000010, \
         "insn @%x=%x expected 00000010" % (nia, insn)
 
-    # test something that aliases
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000100, 64))
+    # test something that aliases (this only works because
+    # the unit test SRAM is a depth of 512)
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000100, 64))
     yield
     yield
-    valid = yield i_in.valid
+    valid = yield i_out.valid
     assert ~valid
     for i in range(30):
         yield
     yield
-    insn  = yield i_in.insn
-    valid = yield i_in.valid
-    insn  = yield i_in.insn
+    insn  = yield i_out.insn
+    valid = yield i_out.valid
+    insn  = yield i_out.insn
     assert valid
     assert insn == 0x00000040, \
          "insn @%x=%x expected 00000040" % (nia, insn)
-    yield i_out.req.eq(0)
-
+    yield i_in.req.eq(0)
 
 
 def test_icache(mem):
-     dut    = ICache()
-
-     memory = Memory(width=64, depth=512, init=mem)
-     sram   = SRAM(memory=memory, granularity=8)
-
-     m      = Module()
-
-     m.submodules.icache = dut
-     m.submodules.sram   = sram
-
-     m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
-     m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
-     m.d.comb += sram.bus.we.eq(dut.wb_out.we)
-     m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
-     m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
-     m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
-
-     m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
-     m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
-
-     # nmigen Simulation
-     sim = Simulator(m)
-     sim.add_clock(1e-6)
-
-     sim.add_sync_process(wrap(icache_sim(dut)))
-     with sim.write_vcd('test_icache.vcd'):
+    from soc.config.test.test_loadstore import TestMemPspec
+    pspec = TestMemPspec(addr_wid=32,
+                         mask_wid=8,
+                         reg_wid=64,
+                         XLEN=32,
+                         )
+    dut    = ICache(pspec)
+
+    memory = Memory(width=64, depth=512, init=mem)
+    sram   = SRAM(memory=memory, granularity=8)
+
+    m      = Module()
+
+    m.submodules.icache = dut
+    m.submodules.sram   = sram
+
+    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+    m.d.comb += sram.bus.we.eq(dut.bus.we)
+    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
+
+    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(icache_sim(dut)))
+    with sim.write_vcd('test_icache.vcd'):
          sim.run()
 
+
 if __name__ == '__main__':
-    dut = ICache()
+    from soc.config.test.test_loadstore import TestMemPspec
+    pspec = TestMemPspec(addr_wid=64,
+                         mask_wid=8,
+                         XLEN=32,
+                         reg_wid=64,
+                         )
+    dut = ICache(pspec)
     vl = rtlil.convert(dut, ports=[])
     with open("test_icache.il", "w") as f:
         f.write(vl)
 
+    # set up memory every 32-bits with incrementing values 0 1 2 ...
     mem = []
     for i in range(512):
         mem.append((i*2) | ((i*2+1)<<32))
 
     test_icache(mem)
-
index 3a9a1bc8e97b6f3cd7d49ae2734ae4a706b810d6..177e238c781205c813b8deec6173970b16ccce00 100644 (file)
@@ -23,7 +23,7 @@ class TestMemFetchUnit(FetchUnitInterface, Elaboratable):
         m.submodules.mem = mem = self.mem
 
         do_fetch = Signal()  # set when fetch while valid and not stalled
-        m.d.comb += do_fetch.eq(self.a_valid_i & ~self.a_stall_i)
+        m.d.comb += do_fetch.eq(self.a_i_valid & ~self.a_stall_i)
 
         # bit of a messy FSM that progresses from idle to in progress
         # to done.
@@ -37,7 +37,7 @@ class TestMemFetchUnit(FetchUnitInterface, Elaboratable):
         with m.If(~do_fetch):               # done
             m.d.sync += op_in_progress.eq(0)
 
-        m.d.comb += self.a_busy_o.eq(op_actioned & self.a_valid_i)
+        m.d.comb += self.a_busy_o.eq(op_actioned & self.a_i_valid)
         # fetch
         m.d.comb += mem.rdport.addr.eq(self.a_pc_i[adr_lsb:])
         m.d.comb += self.f_instr_o.eq(mem.rdport.data)
index 8414f77f75631691df9c358646fac454de950040..42ef061072d6b6b1511fa9e16061286744b27153 100644 (file)
@@ -43,7 +43,7 @@ import unittest
 
 class L0CacheBuffer2(Elaboratable):
     """L0CacheBuffer2"""
-    def __init__(self, n_units=8, regwid=64, addrwid=48):
+    def __init__(self, n_units=8, regwid=64, addrwid=64):
         self.n_units = n_units
         self.regwid = regwid
         self.addrwid = addrwid
@@ -59,7 +59,7 @@ class L0CacheBuffer2(Elaboratable):
         # connect the ports as modules
 
         for i in range(self.n_units):
-            d = LDSTSplitter(64, 48, 4, self.dports[i])
+            d = LDSTSplitter(64, 64, 4, self.dports[i])
             setattr(m.submodules, "ldst_splitter%d" % i, d)
 
         # state-machine latches TODO
@@ -126,8 +126,8 @@ class DataMerger(Elaboratable):
         :addr_array_i: an NxN Array of Signals with bits set indicating address
                        match.  bits across the diagonal (addr_array_i[x][x])
                        will always be set, to indicate "active".
-        :data_i: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
-        :data_o: an Output Record of same type
+        :i_data: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
+        :o_data: an Output Record of same type
                  {data: 128 bit, byte_enable: 16 bit}
         """
         self.array_size = array_size
@@ -141,8 +141,8 @@ class DataMerger(Elaboratable):
         ul = []
         for i in range(array_size):
             ul.append(DataMergerRecord())
-        self.data_i = Array(ul)
-        self.data_o = DataMergerRecord()
+        self.i_data = Array(ul)
+        self.o_data = DataMergerRecord()
 
     def elaborate(self, platform):
         m = Module()
@@ -160,10 +160,10 @@ class DataMerger(Elaboratable):
                 select = self.addr_array_i[idx][j]
                 r = DataMergerRecord()
                 with m.If(select):
-                    comb += r.eq(self.data_i[j])
+                    comb += r.eq(self.i_data[j])
                 l.append(r)
-            comb += self.data_o.data.eq(ortreereduce(l, "data"))
-            comb += self.data_o.en.eq(ortreereduce(l, "en"))
+            comb += self.o_data.data.eq(ortreereduce(l, "data"))
+            comb += self.o_data.en.eq(ortreereduce(l, "en"))
 
         return m
 
@@ -197,15 +197,15 @@ class TstDataMerger2(Elaboratable):
 
         for j in range(self.n_units):
             inp = self.input_array[j]
-            m.d.comb += dm_even.data_i[j].en.eq(inp.bytemask_even)
-            m.d.comb += dm_odd.data_i[j].en.eq(inp.bytemask_odd)
-            m.d.comb += dm_even.data_i[j].data.eq(inp.data_even)
-            m.d.comb += dm_odd.data_i[j].data.eq(inp.data_odd)
+            m.d.comb += dm_even.i_data[j].en.eq(inp.bytemask_even)
+            m.d.comb += dm_odd.i_data[j].en.eq(inp.bytemask_odd)
+            m.d.comb += dm_even.i_data[j].data.eq(inp.data_even)
+            m.d.comb += dm_odd.i_data[j].data.eq(inp.data_odd)
             m.d.comb += dm_even.addr_array_i[j].eq(self.addr_match(j,addr_even))
             m.d.comb += dm_odd.addr_array_i[j].eq(self.addr_match(j,addr_odd))
 
-        m.d.comb += self.data_odd.eq(dm_odd.data_o.data)
-        m.d.comb += self.data_even.eq(dm_even.data_o.data)
+        m.d.comb += self.data_odd.eq(dm_odd.o_data.data)
+        m.d.comb += self.data_even.eq(dm_even.o_data.data)
         return m
 
 
@@ -228,7 +228,7 @@ class L0CacheBuffer(Elaboratable):
     by this class.  That task is taken care of by LDSTCompUnit.
     """
 
-    def __init__(self, n_units, pimem, regwid=64, addrwid=48):
+    def __init__(self, n_units, pimem, regwid=64, addrwid=64):
         self.n_units = n_units
         self.pimem = pimem
         self.regwid = regwid
@@ -384,20 +384,20 @@ def l0_cache_ldst(arg, dut):
 def data_merger_merge(dut):
     # starting with all inputs zero
     yield Settle()
-    en = yield dut.data_o.en
-    data = yield dut.data_o.data
+    en = yield dut.o_data.en
+    data = yield dut.o_data.data
     assert en == 0, "en must be zero"
     assert data == 0, "data must be zero"
     yield
 
     yield dut.addr_array_i[0].eq(0xFF)
     for j in range(dut.array_size):
-        yield dut.data_i[j].en.eq(1 << j)
-        yield dut.data_i[j].data.eq(0xFF << (16*j))
+        yield dut.i_data[j].en.eq(1 << j)
+        yield dut.i_data[j].data.eq(0xFF << (16*j))
     yield Settle()
 
-    en = yield dut.data_o.en
-    data = yield dut.data_o.data
+    en = yield dut.o_data.en
+    data = yield dut.o_data.data
     assert data == 0xff00ff00ff00ff00ff00ff00ff00ff
     assert en == 0xff
     yield
@@ -414,7 +414,7 @@ class TestL0Cache(unittest.TestCase):
     def test_l0_cache_test_bare_wb(self):
 
         pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
-                             addr_wid=48,
+                             addr_wid=64,
                              mask_wid=8,
                              reg_wid=64)
         dut = TstL0CacheBuffer(pspec)
@@ -428,7 +428,7 @@ class TestL0Cache(unittest.TestCase):
     def test_l0_cache_testpi(self):
 
         pspec = TestMemPspec(ldst_ifacetype='testpi',
-                             addr_wid=48,
+                             addr_wid=64,
                              mask_wid=8,
                              reg_wid=64)
         dut = TstL0CacheBuffer(pspec)
index 08764232b6c4bc34cc092e2158a38d46ab355541..11a1ba81a14c020d15f9b3268604a7102b54b3e4 100644 (file)
@@ -19,8 +19,8 @@ class TestMemLoadStoreUnit(LoadStoreUnitInterface, Elaboratable):
         do_store = Signal() # set when store while valid and not stalled
 
         m.d.comb += [
-            do_load.eq(self.x_ld_i & (self.x_valid_i & ~self.x_stall_i)),
-            do_store.eq(self.x_st_i & (self.x_valid_i & ~self.x_stall_i)),
+            do_load.eq(self.x_ld_i & (self.x_i_valid & ~self.x_stall_i)),
+            do_store.eq(self.x_st_i & (self.x_i_valid & ~self.x_stall_i)),
             ]
         # bit of a messy FSM that progresses from idle to in progress
         # to done.
@@ -34,7 +34,7 @@ class TestMemLoadStoreUnit(LoadStoreUnitInterface, Elaboratable):
         with m.If(~(do_load | do_store)):               # done
             m.d.sync += op_in_progress.eq(0)
 
-        m.d.comb += self.x_busy_o.eq(op_actioned & self.x_valid_i)
+        m.d.comb += self.x_busy_o.eq(op_actioned & self.x_i_valid)
 
         m.d.comb += [
             # load
index 8e63bdec4a580971c4dd8a6272b17b687f6e1b0c..2176855d0efa2b4cf21beb0a709e34559158e893 100644 (file)
@@ -32,6 +32,45 @@ from soc.experiment.mem_types import (LoadStore1ToMMUType,
                                  DCacheToMMUType,
                                  MMUToICacheType)
 
+# Radix Tree Page Directory Entry Record, TODO put this somewhere sensible
+# v3.0C Book III p1015-1016 section 6.7.10.1
+class RTPDE(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.nls   = Signal(5)  # Nextded Access Auth bits 59:63 LSB0 0:4
+        self.rs1   = Signal(3)  # Reserved            bits 56:58 LSB0 5:7
+        self.nlb   = Signal(52) # Next Level Base     bit  4:55  LSB0 8:59
+        self.rs2   = Signal(2)  # Reserved            bit  2:3   LSB0 60:61
+        self.leaf  = Signal(1)  # leaf                bit  1     LSB0 62
+        self.valid = Signal(1)  # valid               bit  0     LSB0 63
+
+
+# Radix Tree Page Table Entry Record, TODO put this somewhere sensible
+# v3.0C Book III p1016 section 6.7.10.2
+class RTPTE(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.eaa   = Signal(4)  # Encoded Access Auth bits 60:63 LSB0 0:3
+        self.att   = Signal(2)  # Attributes          bits 58:59 LSB0 4:5
+        self.rs1   = Signal(1)  # Reserved            bit  57    LSB0 6
+        self.c     = Signal(1)  # Change              bit  56    LSB0 7
+        self.r     = Signal(1)  # Reference           bit  55    LSB0 8
+        self.sw    = Signal(3)  # SW bits 1:3         bits 52:54 LSB0 9:11
+        self.rpn   = Signal(45) # Real Page Number    bits 7:51  LSB0 12:56
+        self.rs2   = Signal(4)  # Reserved            bit  3:6   LSB0 57-60
+        self.sw0   = Signal(1)  # SW bit 0            bit  2     LSB0 61
+        self.leaf  = Signal(1)  # leaf                bit  1     LSB0 62
+        self.valid = Signal(1)  # valid               bit  0     LSB0 63
+
+# and these... which of course are turned round to LSB0 order.
+# TODO: sigh. use botchify and put them in openpower.consts
+EAA_PRIV = 3 # bit 0 (in MSB0) set ==> problem-state banned (priv=1 only)
+EAA_RD   = 2 # bit 1 (in MSB0) set ==> loads are permitted
+EAA_WR   = 1 # bit 2 (in MSB0) set ==> load and stores permitted
+EAA_EXE  = 0 # bit 3 (in MSB0) set ==> execute permitted
+
+# for debugging
+display_invalid = True
 
 @unique
 class State(Enum):
@@ -47,6 +86,19 @@ class State(Enum):
     RADIX_FINISH = 9
 
 
+# Process Table Record - near-identical to Page Table Record (same format)
+# v3.0C Book III Section 6.7.6.2 p1004
+class PRTBL(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.rpds  = Signal(5)  # Root Page Directory Size  59:63 LSB0 0:4
+        self.rts2  = Signal(3)  # Radix Tree Size part 2    56:58 LSB0 5:7
+        self.rpdb  = Signal(52) # Root Page Directory Base  4:55  LSB0 8:59
+        self.rsv2  = Signal(1)  # reserved                  3     LSB0 60
+        self.rts1  = Signal(2)  # Radix Tree Size part 1    1:2   LSB0 61:62
+        self.rsv1  = Signal(1)  # reserved                  0     LSB0 63
+
+
 class RegStage(RecordObject):
     def __init__(self, name=None):
         super().__init__(name=name)
@@ -57,17 +109,26 @@ class RegStage(RecordObject):
         self.priv = Signal()
         self.addr = Signal(64)
         self.inval_all = Signal()
+
         # config SPRs
         self.prtbl = Signal(64)
         self.pid = Signal(32)
+
         # internal state
         self.state = Signal(State) # resets to IDLE
         self.done = Signal()
         self.err = Signal()
+
+        # there are 4 quadrants (0-3): here we only support 2 (pt0 and pt3)
+        # these are bits 62-63 of any given address.
+        # except in segment_check, bit 62 is ignored
+        # Quadrant Select can be seen in v3.0C 6.7.10 p1015 book III figure 36
+        # and is further described in 6.7.11.3 p1019
         self.pgtbl0 = Signal(64)
         self.pt0_valid = Signal()
         self.pgtbl3 = Signal(64)
         self.pt3_valid = Signal()
+
         self.shift = Signal(6)
         self.mask_size = Signal(5)
         self.pgbase = Signal(56)
@@ -79,6 +140,20 @@ class RegStage(RecordObject):
         self.rc_error = Signal()
 
 
+# Page Table Record - note that HR bit is treated as part of rts below
+# (near-identical to Process Table Record - same format)
+# v3.0C Book III Section 6.7.6.1 p1003
+class PGTBL(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.rpds  = Signal(5)  # Root Page Directory Size  59:63 LSB0 0:4
+        self.rts2  = Signal(3)  # Radix Tree Size part 2    56:58 LSB0 5:7
+        self.rpdb  = Signal(52) # Root Page Directory Base  4:55  LSB0 8:59
+        self.s     = Signal(1)  # Host Secure               3     LSB0 60
+        self.rts1  = Signal(2)  # Radix Tree Size part 1    1:2   LSB0 61:62
+        self.hr    = Signal(1)  # Host Radix                0     LSB0 63
+
+
 class MMU(Elaboratable):
     """Radix MMU
 
@@ -87,41 +162,52 @@ class MMU(Elaboratable):
     (i.e. there is no gRA -> hRA translation).
     """
     def __init__(self):
-        self.l_in  = LoadStore1ToMMUType()
-        self.l_out = MMUToLoadStore1Type()
-        self.d_out = MMUToDCacheType()
-        self.d_in  = DCacheToMMUType()
-        self.i_out = MMUToICacheType()
+        self.l_in  = LoadStore1ToMMUType("l_in")
+        self.l_out = MMUToLoadStore1Type("l_out")
+        self.d_out = MMUToDCacheType("d_out")
+        self.d_in  = DCacheToMMUType("d_in")
+        self.i_out = MMUToICacheType("i_out")
 
     def radix_tree_idle(self, m, l_in, r, v):
+        """radix_tree_idle - the main decision-point.  valid actions include:
+        * LDST incoming TLBIE request (invalidate TLB entry)
+        * LDST incoming RADIX walk request
+        * set either PRTBL or PID SPRs (which then fires a TLB invalidate)
+        """
         comb = m.d.comb
         sync = m.d.sync
 
         pt_valid = Signal()
-        pgtbl = Signal(64)
+        pgtbl = PGTBL("pgtbl")
         rts = Signal(6)
-        mbits = Signal(6)
+        mbits = Signal(6, name="mbits_idle")
 
-        with m.If(~l_in.addr[63]):
-            comb += pgtbl.eq(r.pgtbl0)
-            comb += pt_valid.eq(r.pt0_valid)
-        with m.Else():
+        with m.If(l_in.addr[63]): # quadrant 3
             comb += pgtbl.eq(r.pgtbl3)
             comb += pt_valid.eq(r.pt3_valid)
+        with m.Else():
+            comb += pgtbl.eq(r.pgtbl0)
+            comb += pt_valid.eq(r.pt0_valid)
 
         # rts == radix tree size, number of address bits
-        # being translated
-        comb += rts.eq(Cat(pgtbl[5:8], pgtbl[61:63]))
+        # being translated.  takes bits 5:7 and 61:62
+        comb += rts.eq(Cat(pgtbl.rts2, pgtbl.rts1, C(0)))
 
         # mbits == number of address bits to index top
-        # level of tree
-        comb += mbits.eq(pgtbl[0:5])
+        # level of tree.  takes bits 0:4
+        comb += mbits.eq(pgtbl.rpds)
 
         # set v.shift to rts so that we can use finalmask
-        # for the segment check
+        # for the segment check.
+        # note: rpdb (52 bits long) is truncated to 48 bits
         comb += v.shift.eq(rts)
         comb += v.mask_size.eq(mbits[0:5])
-        comb += v.pgbase.eq(Cat(C(0, 8), pgtbl[8:56]))
+
+        # create the page base from root page directory base (48 bits with 8 0s)
+        comb += v.pgbase.eq(Cat(C(0, 8), pgtbl.rpdb[:48])) # bits 8:55
+
+        # request either TLB invalidate
+        # or start a RADIX walk
 
         with m.If(l_in.valid):
             comb += v.addr.eq(l_in.addr)
@@ -129,10 +215,10 @@ class MMU(Elaboratable):
             comb += v.store.eq(~(l_in.load | l_in.iside))
             comb += v.priv.eq(l_in.priv)
 
-            comb += Display("state %d l_in.valid addr %x iside %d store %d "
-                            "rts %x mbits %x pt_valid %d",
+            sync += Display("state %d l_in.valid addr %x iside %d store %d "
+                            "rpdb %x rts %d mbits %d pt_valid %d",
                             v.state, v.addr, v.iside, v.store,
-                            rts, mbits, pt_valid)
+                            pgtbl.rpdb, rts, mbits, pt_valid)
 
             with m.If(l_in.tlbie):
                 # Invalidate all iTLB/dTLB entries for
@@ -159,20 +245,28 @@ class MMU(Elaboratable):
                     # set v.shift so we can use finalmask
                     # for generating the process table
                     # entry address
-                    comb += v.shift.eq(r.prtbl[0:5])
+                    prtbl = PRTBL("prtbl")
+                    comb += prtbl.eq(r.prtbl)
+                    comb += v.shift.eq(prtbl.rpds)
                     comb += v.state.eq(State.PROC_TBL_READ)
 
                 with m.Elif(mbits == 0):
                     # Use RPDS = 0 to disable radix tree walks
                     comb += v.state.eq(State.RADIX_FINISH)
                     comb += v.invalid.eq(1)
+                    if(display_invalid):
+                        sync += Display("MMUBUG: Use RPDS = 0 to disable"
+                                        " radix tree walks")
                 with m.Else():
                     comb += v.state.eq(State.SEGMENT_CHECK)
 
+        # set either PID or PRTBL SPRs
+        # (then invalidate TLBs)
+
         with m.If(l_in.mtspr):
             # Move to PID needs to invalidate L1 TLBs
-            # and cached pgtbl0 value.  Move to PRTBL
-            # does that plus invalidating the cached
+            # and cached pgtbl0 value.
+            # Move to PRTBL does that plus invalidating the cached
             # pgtbl3 value as well.
             with m.If(~l_in.sprn[9]):
                 comb += v.pid.eq(l_in.rs[0:32])
@@ -186,82 +280,105 @@ class MMU(Elaboratable):
 
     def proc_tbl_wait(self, m, v, r, data):
         comb = m.d.comb
-        with m.If(r.addr[63]):
-            comb += v.pgtbl3.eq(data)
+        sync = m.d.sync
+        rts = Signal(6)
+        mbits = Signal(6, name="mbits_tbl_wait")
+        prtbl = PRTBL("prtblw")
+        comb += prtbl.eq(data)
+
+        with m.If(r.addr[63]): # top bit of quadrant selects pt3
+            comb += v.pgtbl3.eq(prtbl)
             comb += v.pt3_valid.eq(1)
         with m.Else():
-            comb += v.pgtbl0.eq(data)
+            comb += v.pgtbl0.eq(prtbl)
             comb += v.pt0_valid.eq(1)
 
-        rts = Signal(6)
-        mbits = Signal(6)
-
         # rts == radix tree size, # address bits being translated
-        comb += rts.eq(Cat(data[5:8], data[61:63]))
+        comb += rts.eq(Cat(prtbl.rts2, prtbl.rts1, C(0)))
 
         # mbits == # address bits to index top level of tree
-        comb += mbits.eq(data[0:5])
+        comb += mbits.eq(prtbl.rpds[0:5])
 
         # set v.shift to rts so that we can use finalmask for the segment check
         comb += v.shift.eq(rts)
         comb += v.mask_size.eq(mbits[0:5])
-        comb += v.pgbase.eq(Cat(C(0, 8), data[8:56]))
+
+        # create the page base from root page directory base (48 bits with 8 0s)
+        comb += v.pgbase.eq(Cat(C(0, 8), prtbl.rpdb[:48])) # bits 8:55
 
         with m.If(mbits):
             comb += v.state.eq(State.SEGMENT_CHECK)
+            sync += Display("PROC TBL %d data %x rts1 %x rts2 %x rts %d "
+                            "rpdb %x mbits %d pgbase %x "
+                            " pt0_valid %d, pt3_valid %d",
+                            v.state, data, prtbl.rts1, prtbl.rts2, rts,
+                            prtbl.rpdb, mbits, v.pgbase,
+                            v.pt0_valid, v.pt3_valid)
         with m.Else():
             comb += v.state.eq(State.RADIX_FINISH)
             comb += v.invalid.eq(1)
+            if (display_invalid): m.d.sync += Display("MMU: mbits is invalid")
 
     def radix_read_wait(self, m, v, r, d_in, data):
         comb = m.d.comb
         sync = m.d.sync
 
+        rpte = RTPTE(name="radix_rpte") # page-table (leaf) entry
+        rpde = RTPDE(name="radix_rpde") # page-directory (non-leaf) entry
+
         perm_ok = Signal()
         rc_ok = Signal()
-        mbits = Signal(6)
-        valid = Signal()
-        leaf = Signal()
+        mbits = Signal(6, name="mbits_read_wait")
+        valid = rpte.valid
+        eaa = rpte.eaa
+        leaf = rpte.leaf
         badtree = Signal()
 
-        comb += Display("RDW %016x done %d "
+        sync += Display("RDW %016x done %d "
                         "perm %d rc %d mbits %d shf %d "
                         "valid %d leaf %d bad %d",
                         data, d_in.done, perm_ok, rc_ok,
                         mbits, r.shift, valid, leaf, badtree)
 
-        # set pde
+        # set pde and interpret as Radix Tree Page Table Entry (leaf=1 case)
         comb += v.pde.eq(data)
+        comb += rpte.eq(data)
+        comb += rpde.eq(data)
 
-        # test valid bit
-        comb += valid.eq(data[63]) # valid=data[63]
-        comb += leaf.eq(data[62]) # valid=data[63]
-
-        comb += v.pde.eq(data)
-        # valid & leaf
         with m.If(valid):
+            # valid & leaf: RADIX Page-Table Entry
             with m.If(leaf):
                 # check permissions and RC bits
-                with m.If(r.priv | ~data[3]):
-                    with m.If(~r.iside):
-                        comb += perm_ok.eq(data[1] | (data[2] & ~r.store))
-                    with m.Else():
+                with m.If(r.priv | ~eaa[EAA_PRIV]):
+                    with m.If(r.iside): # instruction-side request
                         # no IAMR, so no KUEP support for now
                         # deny execute permission if cache inhibited
-                        comb += perm_ok.eq(data[0] & ~data[5])
+                        comb += perm_ok.eq(eaa[EAA_EXE] & ~rpte.att[1])
+                    with m.Else():
+                        # Load/Store (read/write)
+                        comb += perm_ok.eq(eaa[EAA_WR] |
+                                          (eaa[EAA_RD] & ~r.store))
+                comb += rc_ok.eq(rpte.r & (rpte.c | ~r.store))
 
-                comb += rc_ok.eq(data[8] & (data[7] | ~r.store))
+                # permissions / rc ok, load TLB, otherwise report error
                 with m.If(perm_ok & rc_ok):
                     comb += v.state.eq(State.RADIX_LOAD_TLB)
+                    sync += Display("RADIX LEAF data %x att %x eaa %x "
+                                    "R %d C %d "
+                                    "shift %d pgbase %x ",
+                                    data, rpte.att, eaa,
+                                    rpte.r, rpte.c,
+                                    v.shift, v.pgbase
+                                    )
                 with m.Else():
                     comb += v.state.eq(State.RADIX_FINISH)
                     comb += v.perm_err.eq(~perm_ok)
                     # permission error takes precedence over RC error
                     comb += v.rc_error.eq(perm_ok)
 
-            # valid & !leaf
+            # valid & !leaf: RADIX Page-Directory Entry
             with m.Else():
-                comb += mbits.eq(data[0:5])
+                comb += mbits.eq(rpde.nls) # 5 bits NLS into 6-bit-long mbits
                 comb += badtree.eq((mbits < 5) |
                                    (mbits > 16) |
                                    (mbits > r.shift))
@@ -270,24 +387,31 @@ class MMU(Elaboratable):
                     comb += v.badtree.eq(1)
                 with m.Else():
                     comb += v.shift.eq(r.shift - mbits)
-                    comb += v.mask_size.eq(mbits[0:5])
-                    comb += v.pgbase.eq(Cat(C(0, 8), data[8:56]))
+                    comb += v.mask_size.eq(mbits)
+                    # pagebase is first 48 bits of NLB, shifted up 1 byte
+                    comb += v.pgbase.eq(Cat(C(0, 8), rpde.nlb[:48]))
                     comb += v.state.eq(State.RADIX_LOOKUP)
 
         with m.Else():
             # non-present PTE, generate a DSI
             comb += v.state.eq(State.RADIX_FINISH)
             comb += v.invalid.eq(1)
+            if (display_invalid):
+                sync += Display("MMU: non-present PTE, generate a DSI")
 
     def segment_check(self, m, v, r, data, finalmask):
+        """segment_check: checks validity of the request before doing a
+        RADIX lookup. reports either segment error or bad tree if not ok
+        """
         comb = m.d.comb
 
-        mbits = Signal(6)
+        mbits = Signal(6, name="mbits_check")
         nonzero = Signal()
         comb += mbits.eq(r.mask_size)
         comb += v.shift.eq(r.shift + (31 - 12) - mbits)
         comb += nonzero.eq((r.addr[31:62] & ~finalmask[0:31]).bool())
-        with m.If((r.addr[63] ^ r.addr[62]) | nonzero):
+        with m.If((r.addr[63] != r.addr[62]) # pt3 == 0b11 and pt1 == 0b00
+                  | nonzero):
             comb += v.state.eq(State.RADIX_FINISH)
             comb += v.segerror.eq(1)
         with m.Elif((mbits < 5) | (mbits > 16) |
@@ -315,16 +439,18 @@ class MMU(Elaboratable):
             sync += Display("MMU completing op without error")
 
         with m.If(l_out.err):
-            sync += Display("MMU completing op with err invalid"
+            sync += Display("MMU completing op with err invalid="
                             "%d badtree=%d", l_out.invalid, l_out.badtree)
 
         with m.If(rin.state == State.RADIX_LOOKUP):
-            sync += Display ("radix lookup shift=%d msize=%d",
-                            rin.shift, rin.mask_size)
+            sync += Display ("radix lookup shift=%x msize=%x",
+                            rin.shift, mask)
 
         with m.If(r.state == State.RADIX_LOOKUP):
-            sync += Display(f"send load addr=%x addrsh=%d mask=%x",
+            sync += Display(f"send load addr=%x addrsh=%x mask=%x",
                             d_out.addr, addrsh, mask)
+
+        # update the internal register
         sync += r.eq(rin)
 
     def elaborate(self, platform):
@@ -340,6 +466,11 @@ class MMU(Elaboratable):
         self.rin = rin = RegStage("r_in")
         r = RegStage("r")
 
+        # get access to prtbl and pid for debug / testing purposes ONLY
+        # (actually, not needed, because setup_regs() triggers mmu direct)
+        # self._prtbl = r.prtbl
+        # self._pid = r.pid
+
         l_in  = self.l_in
         l_out = self.l_out
         d_out = self.d_out
@@ -348,7 +479,7 @@ class MMU(Elaboratable):
 
         self.mmu_0(m, r, rin, l_in, l_out, d_out, addrsh, mask)
 
-        v = RegStage()
+        v = RegStage("v")
         dcreq = Signal()
         tlb_load = Signal()
         itlb_load = Signal()
@@ -363,7 +494,6 @@ class MMU(Elaboratable):
 
         comb += v.eq(r)
         comb += v.valid.eq(0)
-        comb += dcreq.eq(0)
         comb += v.done.eq(0)
         comb += v.err.eq(0)
         comb += v.invalid.eq(0)
@@ -371,11 +501,7 @@ class MMU(Elaboratable):
         comb += v.segerror.eq(0)
         comb += v.perm_err.eq(0)
         comb += v.rc_error.eq(0)
-        comb += tlb_load.eq(0)
-        comb += itlb_load.eq(0)
-        comb += tlbie_req.eq(0)
         comb += v.inval_all.eq(0)
-        comb += prtbl_rd.eq(0)
 
         # Radix tree data structures in memory are
         # big-endian, so we need to byte-swap them
@@ -383,17 +509,29 @@ class MMU(Elaboratable):
 
         # generate mask for extracting address fields for PTE addr generation
         m.submodules.pte_mask = pte_mask = Mask(16-5)
+        pte_mask.mask.name = "pte_mask"
         comb += pte_mask.shift.eq(r.mask_size - 5)
         comb += mask.eq(Cat(C(0x1f, 5), pte_mask.mask))
 
         # generate mask for extracting address bits to go in
         # TLB entry in order to support pages > 4kB
         m.submodules.tlb_mask = tlb_mask = Mask(44)
+        tlb_mask.mask.name = "tlb_mask"
         comb += tlb_mask.shift.eq(r.shift)
         comb += finalmask.eq(tlb_mask.mask)
 
+        # Shift address bits 61--12 right by 0--47 bits and
+        # supply the least significant 16 bits of the result.
+        comb += addrsh.eq(r.addr[12:62] >> r.shift)
+
         with m.If(r.state != State.IDLE):
             sync += Display("MMU state %d %016x", r.state, data)
+            sync += Display("addrsh %x r.shift %d r.addr[12:62] %x",
+                        addrsh, r.shift, r.addr[12:62])
+
+        ##########
+        # Main FSM
+        ##########
 
         with m.Switch(r.state):
             with m.Case(State.IDLE):
@@ -451,25 +589,35 @@ class MMU(Elaboratable):
                 sync += Display("   RADIX_FINISH")
                 comb += v.state.eq(State.IDLE)
 
+        # check and report either error or done.
         with m.If((v.state == State.RADIX_FINISH) |
                  ((v.state == State.RADIX_LOAD_TLB) & r.iside)):
             comb += v.err.eq(v.invalid | v.badtree | v.segerror
                              | v.perm_err | v.rc_error)
             comb += v.done.eq(~v.err)
 
-        with m.If(~r.addr[63]):
+        # PID is only valid if MSB of address is zero, top 2 bits are Quadrant
+        with m.If(~r.addr[63]): # quadrant 0 (pt0)
             comb += effpid.eq(r.pid)
 
+        # calculate Process Table Address
         pr24 = Signal(24, reset_less=True)
-        comb += pr24.eq(masked(r.prtbl[12:36], effpid[8:32], finalmask))
-        comb += prtb_adr.eq(Cat(C(0, 4), effpid[0:8], pr24, r.prtbl[36:56]))
+        prtbla = PRTBL("prtbla")
+        comb += prtbla.eq(r.prtbl)
+        rpdb = prtbla.rpdb
+        comb += pr24.eq(masked(rpdb[4:28], effpid[8:32], finalmask))
+        comb += prtb_adr.eq(Cat(C(0, 4), effpid[0:8], pr24, rpdb[28:48]))
 
+        # calculate Page Table Address
         pg16 = Signal(16, reset_less=True)
         comb += pg16.eq(masked(r.pgbase[3:19], addrsh, mask))
         comb += pgtb_adr.eq(Cat(C(0, 3), pg16, r.pgbase[19:56]))
 
+        # calculate Page Table Entry from Real Page Number (leaf=1, RTPTE)
+        rpte = RTPTE(name="rpte")
+        comb += rpte.eq(r.pde)
         pd44 = Signal(44, reset_less=True)
-        comb += pd44.eq(masked(r.pde[12:56], r.addr[12:56], finalmask))
+        comb += pd44.eq(masked(rpte.rpn, r.addr[12:56], finalmask))
         comb += pte.eq(Cat(r.pde[0:12], pd44))
 
         # update registers
@@ -485,7 +633,11 @@ class MMU(Elaboratable):
             comb += addr.eq(prtb_adr)
         with m.Else():
             comb += addr.eq(pgtb_adr)
+            sync += Display(f"pagetable pg16=%x addrsh %x mask %x pgbase=%x "
+                            "pgbase[19:56]=%x",
+                            pg16, addrsh, mask, r.pgbase, r.pgbase[19:56])
 
+        # connect to other interfaces: LDST, D-Cache, I-Cache
         comb += l_out.done.eq(r.done)
         comb += l_out.err.eq(r.err)
         comb += l_out.invalid.eq(r.invalid)
@@ -524,8 +676,8 @@ def dcache_get(dut):
     mem = {0x0: 0x000000, # to get mtspr prtbl working
 
            0x10000:    # PARTITION_TABLE_2
-                       # PATB_GR=1 PRTB=0x1000 PRTS=0xb
-           b(0x800000000100000b),
+                       # HR=1 RTS1=0x2 PRTB=0x300 RTS2=0x5 PRTS=0xb
+           b(0xc0000000000030ad),
 
            0x30000:     # RADIX_ROOT_PTE
                         # V = 1 L = 0 NLB = 0x400 NLS = 9
@@ -536,20 +688,77 @@ def dcache_get(dut):
                            # R = 1 C = 1 ATT = 0 EAA 0x7
            b(0xc000000000000187),
 
-          0x1000000:   # PROCESS_TABLE_3
+#
+#   slightly different from radix_walk_example.txt: address in microwatt
+#   has the top bit set to indicate hypervisor.  here, Quadrant 3's
+#   process table entry is put instead into Quadrant 0.  the entry
+#   PROCESS_TABLE_3 should, strictly speaking, be at 0x1000010
+
+#          0x1000000:   # PROCESS_TABLE_3 (pt0_valid)
+#                       # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 12
+#           b(0x40000000000300ac),
+
+          0x1000000:   # PROCESS_TABLE_3 (pt3_valid)
                        # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
            b(0x40000000000300ad),
           }
 
+    # microwatt mmu.bin first part of test 2.
+    # PRTBL must be set to 0x12000, PID to 1
+    mem = {
+             0x0: 0x000000, # to get mtspr prtbl working
+             0x13920: 0x86810000000000c0, # leaf, supposed to be at 0x13920
+             0x10000: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+             0x124000: 0x0000000badc0ffee,  # memory to be looked up
+            }
+
+    # microwatt mmu.bin first part of test 4.
+    # PRTBL must be set to 0x12000, PID to 1
+    mem = {
+             0x0: 0x000000, # to get mtspr prtbl working
+             0x13858: 0x86a10000000000c0, # leaf node
+             0x10000: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+    }
+
+    # microwatt mmu.bin test 5.
+    # PRTBL must be set to 0x12000, PID to 1
+    mem = {
+             0x0: 0x000000, # to get mtspr prtbl working
+             0x13cf8: 0x86b10000000000c0, # leaf node
+             0x13d00: 0x0000000000000000, # invalid leaf node
+             0x10008: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+    }
+
+    # microwatt mmu.bin test 12, instruction-side
+    # PRTBL must be set to 0x12000, PID to 1, iside to 1
+    mem = {
+             0x0: 0x000000, # to get mtspr prtbl working
+             0x13920: 0x01110000000000c0, # leaf node
+             0x10008: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+    }
+
     while not stop:
         while True: # wait for dc_valid
             if stop:
                 return
             dc_valid = yield (dut.d_out.valid)
+            tlbld = yield (dut.d_out.tlbld)
             if dc_valid:
                 break
             yield
         addr = yield dut.d_out.addr
+        if tlbld:
+            pte = yield dut.d_out.pte
+            print ("    DCACHE PTE %x -> %x" % (pte, addr))
+            yield dut.d_in.done.eq(1)
+            yield
+            yield dut.d_in.done.eq(0)
+            continue
+
         if addr not in mem:
             print ("    DCACHE LOOKUP FAIL %x" % (addr))
             stop = True
@@ -563,9 +772,15 @@ def dcache_get(dut):
         yield
         yield dut.d_in.done.eq(0)
 
+
 def mmu_wait(dut):
     global stop
     while not stop: # wait for dc_valid / err
+        d_valid = yield (dut.d_out.valid)
+        if d_valid:
+            tlbld = yield (dut.d_out.tlbld)
+            addr = yield (dut.d_out.addr)
+            print ("addr %x tlbld %d" % (addr, tlbld))
         l_done = yield (dut.l_out.done)
         l_err = yield (dut.l_out.err)
         l_badtree = yield (dut.l_out.badtree)
@@ -581,13 +796,20 @@ def mmu_wait(dut):
         yield dut.l_in.mtspr.eq(0) # captured by RegStage(s)
         yield dut.l_in.load.eq(0)  # can reset everything safely
 
+
 def mmu_sim(dut):
     global stop
 
+    # microwatt PRTBL = 0x12000, other test is 0x1000000
+    #prtbl = 0x100000
+    #pidr = 0x0
+    prtbl = 0x12000
+    pidr = 0x1
+
     # MMU MTSPR set prtbl
     yield dut.l_in.mtspr.eq(1)
     yield dut.l_in.sprn[9].eq(1) # totally fake way to set SPR=prtbl
-    yield dut.l_in.rs.eq(0x1000000) # set process table
+    yield dut.l_in.rs.eq(prtbl) # set process table
     yield dut.l_in.valid.eq(1)
     yield from mmu_wait(dut)
     yield
@@ -597,26 +819,55 @@ def mmu_sim(dut):
 
     prtbl = yield (dut.rin.prtbl)
     print ("prtbl after MTSPR %x" % prtbl)
-    assert prtbl == 0x1000000
+    assert prtbl == prtbl
+
+    if True: # microwatt test set PIDR
+        # MMU MTSPR set PIDR = 1
+        yield dut.l_in.mtspr.eq(1)
+        yield dut.l_in.sprn[9].eq(0) # totally fake way to set SPR=pidr
+        yield dut.l_in.rs.eq(pidr) # set process table
+        yield dut.l_in.valid.eq(1)
+        yield from mmu_wait(dut)
+        yield
+        yield dut.l_in.sprn.eq(0)
+        yield dut.l_in.rs.eq(0)
+        yield
 
     #yield dut.rin.prtbl.eq(0x1000000) # manually set process table
     #yield
 
+    #addr = 0x10000  # original test
+    #addr = 0x124108  # microwatt mmu.bin test 2
+    #addr = 0x10b0d8  # microwatt mmu.bin test 4
+    # these are a misalignment test. one load results in two actual
+    # lookups, one of which has a valid page table entry, the other
+    # does not.  we currently do not support misaligned in Loadstore1
+    # therefore these tests fail with an align_intr (0x600) at 0x39fffd
+    addr = 0x39fffd # microwatt mmu.bin test 5
+    addr = 0x3a0000 # microwatt mmu.bin test 5
+
+    # microwatt mmu.bin test 12 is instruction-side
+    addr = 0x324000 # microwatt mmu.bin test 12
+    iside = 1
 
     # MMU PTE request
-    yield dut.l_in.load.eq(1)
+    yield dut.l_in.iside.eq(iside)
+    yield dut.l_in.load.eq(0)
     yield dut.l_in.priv.eq(1)
-    yield dut.l_in.addr.eq(0x10000)
+    yield dut.l_in.addr.eq(addr)
     yield dut.l_in.valid.eq(1)
     yield from mmu_wait(dut)
 
     addr = yield dut.d_out.addr
     pte = yield dut.d_out.pte
+    tlb_ld = yield dut.d_out.tlbld
     l_done = yield (dut.l_out.done)
     l_err = yield (dut.l_out.err)
     l_badtree = yield (dut.l_out.badtree)
-    print ("translated done %d err %d badtree %d addr %x pte %x" % \
-               (l_done, l_err, l_badtree, addr, pte))
+    print ("translated done %d err %d badtree %d "
+           "addr %x pte %x tlb_ld %d" % \
+               (l_done, l_err, l_badtree, addr, pte, tlb_ld))
+
     yield
     yield dut.l_in.priv.eq(0)
     yield dut.l_in.addr.eq(0)
index 2392620bc5c67bc7dea7fb17ad249bc7afbb60cf..023f47589eaf983e5731cfd7c6970b6072db47f2 100644 (file)
@@ -10,8 +10,8 @@
 
     busy_o/1        most likely to be x_busy_o
     go_die_i/1      rst?
-    addr.data/48    x_addr_i (x_addr_i[:4] goes into LenExpand)
-    addr.ok/1       probably x_valid_i & ~x_stall_i
+    addr.data/64    x_addr_i (x_addr_i[:4] goes into LenExpand)
+    addr.ok/1       probably x_i_valid & ~x_stall_i
 
     addr_ok_o/1     no equivalent.  *might* work using x_stall_i
     exc_o/6(?)      m_load_err_o and m_store_err_o
@@ -37,7 +37,7 @@ from nmutil.util import rising_edge
 class Pi2LSUI(PortInterfaceBase):
 
     def __init__(self, name, lsui=None,
-                 data_wid=64, mask_wid=8, addr_wid=48):
+                 data_wid=64, mask_wid=8, addr_wid=64):
         print("pi2lsui reg mask addr", data_wid, mask_wid, addr_wid)
         super().__init__(data_wid, addr_wid)
         if lsui is None:
@@ -46,12 +46,13 @@ class Pi2LSUI(PortInterfaceBase):
         self.lsui_busy = Signal()
         self.valid_l = SRLatch(False, name="valid")
 
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
+        print("pi2lsui TODO, implement is_dcbz")
         m.d.comb += self.valid_l.s.eq(1)
         m.d.comb += self.lsui.x_mask_i.eq(mask)
         m.d.comb += self.lsui.x_addr_i.eq(addr)
 
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
         m.d.comb += self.valid_l.s.eq(1)
         m.d.comb += self.lsui.x_mask_i.eq(mask)
         m.d.comb += self.lsui.x_addr_i.eq(addr)
@@ -100,8 +101,8 @@ class Pi2LSUI(PortInterfaceBase):
                     m.next = "IDLE"
 
         # indicate valid at both ends. OR with lsui_busy (stops comb loop)
-        m.d.comb += self.lsui.m_valid_i.eq(self.valid_l.q )
-        m.d.comb += self.lsui.x_valid_i.eq(self.valid_l.q )
+        m.d.comb += self.lsui.m_i_valid.eq(self.valid_l.q )
+        m.d.comb += self.lsui.x_i_valid.eq(self.valid_l.q )
 
         # reset the valid latch when not busy.  sync to stop loop
         lsui_active = Signal()
@@ -114,7 +115,7 @@ class Pi2LSUI(PortInterfaceBase):
 class Pi2LSUI1(Elaboratable):
 
     def __init__(self, name, pi=None, lsui=None,
-                 data_wid=64, mask_wid=8, addr_wid=48):
+                 data_wid=64, mask_wid=8, addr_wid=64):
         print("pi2lsui reg mask addr", data_wid, mask_wid, addr_wid)
         self.addrbits = mask_wid
         if pi is None:
@@ -154,7 +155,7 @@ class Pi2LSUI1(Elaboratable):
             # expand the LSBs of address plus LD/ST len into 16-bit mask
             m.d.comb += lsui.x_mask_i.eq(lenexp.lexp_o)
             # pass through the address, indicate "valid"
-            m.d.comb += lsui.x_valid_i.eq(1)
+            m.d.comb += lsui.x_i_valid.eq(1)
             # indicate "OK" - XXX should be checking address valid
             m.d.comb += pi.addr_ok_o.eq(1)
 
index 3d6dc8c79bcb69fbc4427767ba1a250a9a10a18f..93db9d6e9bdda5eceae23a680328ded88034e7ef 100644 (file)
@@ -25,12 +25,14 @@ from nmigen.utils import log2_int
 from nmutil.latch import SRLatch, latchregister
 from nmutil.util import rising_edge
 from openpower.decoder.power_decoder2 import Data
+from openpower.decoder.power_enums import MSRSpec
 from soc.scoreboard.addr_match import LenExpand
 from soc.experiment.mem_types import LDSTException
 
 # for testing purposes
 from soc.experiment.testmem import TestMemory
 #from soc.scoreboard.addr_split import LDSTSplitter
+from nmutil.util import Display
 
 import unittest
 
@@ -88,20 +90,25 @@ class PortInterface(RecordObject):
       busy_o is deasserted on the cycle AFTER st.ok is asserted.
     """
 
-    def __init__(self, name=None, regwid=64, addrwid=48):
+    def __init__(self, name=None, regwid=64, addrwid=64):
 
         self._regwid = regwid
         self._addrwid = addrwid
 
         RecordObject.__init__(self, name=name)
 
-        # distinguish op type (ld/st)
-        self.is_ld_i = Signal(reset_less=True)
-        self.is_st_i = Signal(reset_less=True)
+        # distinguish op type (ld/st/dcbz/nc)
+        self.is_ld_i    = Signal(reset_less=True)
+        self.is_st_i    = Signal(reset_less=True)
+        self.is_dcbz_i     = Signal(reset_less=True) # cache-line zeroing
+        self.is_nc         = Signal()  # no cacheing
 
         # LD/ST data length (TODO: other things may be needed)
         self.data_len = Signal(4, reset_less=True)
 
+        # atomic reservation (LR/SC - ldarx / stdcx etc.)
+        self.reserve = Signal(reset_less=True)
+
         # common signals
         self.busy_o = Signal(reset_less=True)     # do not use if busy
         self.go_die_i = Signal(reset_less=True)   # back to reset
@@ -113,15 +120,14 @@ class PortInterface(RecordObject):
         # LD/ST
         self.ld = Data(regwid, "ld_data_o")  # ok to be set by L0 Cache/Buf
         self.st = Data(regwid, "st_data_i")  # ok to be set by CompUnit
+        self.store_done = Data(1, "store_done_o") # store has been actioned
 
-        # additional "modes"
-        self.is_dcbz        = Signal()  # data cache block zero request
-        self.is_nc         = Signal()  # no cacheing
-        self.msr_pr        = Signal()  # 1==virtual, 0==privileged
+        #only priv_mode = not msr_pr is used currently
+        # TODO: connect signals
+        self.virt_mode  = Signal() # ctrl.msr(MSR_DR);
+        self.priv_mode  = Signal() # not ctrl.msr(MSR_PR);
+        self.mode_32bit = Signal() # not ctrl.msr(MSR_SF);
 
-        # mmu
-        self.mmu_done          = Signal() # keep for now
-       
         # dcache
         self.ldst_error        = Signal()
         ## Signalling ld/st error - NC cache hit, TLB miss, prot/RC failure
@@ -132,18 +138,21 @@ class PortInterface(RecordObject):
         return [self.is_ld_i.eq(inport.is_ld_i),
                 self.is_st_i.eq(inport.is_st_i),
                 self.is_nc.eq(inport.is_nc),
-                self.is_dcbz.eq(inport.is_dcbz),
+                self.is_dcbz_i.eq(inport.is_dcbz_i),
                 self.data_len.eq(inport.data_len),
+                self.reserve.eq(inport.reserve),
                 self.go_die_i.eq(inport.go_die_i),
                 self.addr.data.eq(inport.addr.data),
                 self.addr.ok.eq(inport.addr.ok),
                 self.st.eq(inport.st),
-                self.msr_pr.eq(inport.msr_pr),
+                self.virt_mode.eq(inport.virt_mode),
+                self.priv_mode.eq(inport.priv_mode),
+                self.mode_32bit.eq(inport.mode_32bit),
                 inport.ld.eq(self.ld),
                 inport.busy_o.eq(self.busy_o),
                 inport.addr_ok_o.eq(self.addr_ok_o),
                 inport.exc_o.eq(self.exc_o),
-                inport.mmu_done.eq(self.mmu_done),
+                inport.store_done.eq(self.store_done),
                 inport.ldst_error.eq(self.ldst_error),
                 inport.cache_paradox.eq(self.cache_paradox)
                 ]
@@ -172,8 +181,8 @@ class PortInterfaceBase(Elaboratable):
     def connect_port(self, inport):
         return self.pi.connect_port(inport)
 
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr): pass
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr): pass
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc): pass
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc): pass
     def set_wr_data(self, m, data, wen): pass
     def get_rd_data(self, m): pass
 
@@ -211,7 +220,13 @@ class PortInterfaceBase(Elaboratable):
         pi = self.pi
         comb += lds.eq(pi.is_ld_i)  # ld-req signals
         comb += sts.eq(pi.is_st_i)  # st-req signals
-        pr = pi.msr_pr # MSR problem state: PR=1 ==> virt, PR==0 ==> priv
+
+        # TODO: construct an MSRspec here and pass it over in
+        # self.set_rd_addr and set_wr_addr below rather than just pr
+        pr = ~pi.priv_mode
+        dr = pi.virt_mode
+        sf = ~pi.mode_32bit
+        msr = MSRSpec(pr=pr, dr=dr, sf=sf)
 
         # detect busy "edge"
         busy_delay = Signal()
@@ -225,7 +240,6 @@ class PortInterfaceBase(Elaboratable):
         misalign = Signal()
         comb += misalign.eq(lenexp.lexp_o[8:].bool())
 
-
         # activate mode: only on "edge"
         comb += ld_active.s.eq(rising_edge(m, lds))  # activate LD mode
         comb += st_active.s.eq(rising_edge(m, sts))  # activate ST mode
@@ -233,6 +247,8 @@ class PortInterfaceBase(Elaboratable):
         # LD/ST requested activates "busy" (only if not already busy)
         with m.If(self.pi.is_ld_i | self.pi.is_st_i):
             comb += busy_l.s.eq(~busy_delay)
+            with m.If(self.pi.exc_o.happened):
+                sync += Display("fast exception")
 
         # if now in "LD" mode: wait for addr_ok, then send the address out
         # to memory, acknowledge address, and send out LD data
@@ -242,7 +258,8 @@ class PortInterfaceBase(Elaboratable):
             comb += lenexp.len_i.eq(pi.data_len)
             comb += lenexp.addr_i.eq(lsbaddr)
             with m.If(pi.addr.ok & adrok_l.qn):
-                self.set_rd_addr(m, pi.addr.data, lenexp.lexp_o, misalign, pr)
+                self.set_rd_addr(m, pi.addr.data, lenexp.lexp_o, misalign,
+                                    msr, pi.is_nc)
                 comb += pi.addr_ok_o.eq(1)  # acknowledge addr ok
                 sync += adrok_l.s.eq(1)       # and pull "ack" latch
 
@@ -254,8 +271,9 @@ class PortInterfaceBase(Elaboratable):
             comb += lenexp.len_i.eq(pi.data_len)
             comb += lenexp.addr_i.eq(lsbaddr)
             with m.If(pi.addr.ok):
-                self.set_wr_addr(m, pi.addr.data, lenexp.lexp_o, misalign, pr)
-                with m.If(adrok_l.qn):
+                self.set_wr_addr(m, pi.addr.data, lenexp.lexp_o, misalign, msr,
+                                 pi.is_dcbz_i, pi.is_nc)
+                with m.If(adrok_l.qn & self.pi.exc_o.happened==0):
                     comb += pi.addr_ok_o.eq(1)  # acknowledge addr ok
                     sync += adrok_l.s.eq(1)       # and pull "ack" latch
 
@@ -275,15 +293,16 @@ class PortInterfaceBase(Elaboratable):
             comb += reset_l.s.eq(ldok)     # reset mode after 1 cycle
 
         # for ST mode, when addr has been "ok'd", wait for incoming "ST ok"
+        sync += st_done.s.eq(0)     # store done trigger
         with m.If(st_active.q & pi.st.ok):
             # shift data up before storing.  lenexp *bit* version of mask is
             # passed straight through as byte-level "write-enable" lines.
-            stdata = Signal(self.regwid, reset_less=True)
+            stdata = Signal(self.regwid*2, reset_less=True)
             comb += stdata.eq(pi.st.data << (lenexp.addr_i*8))
             # TODO: replace with link to LoadStoreUnitInterface.x_store_data
             # and also handle the ready/stall/busy protocol
             stok = self.set_wr_data(m, stdata, lenexp.lexp_o)
-            sync += st_done.s.eq(1)     # store done trigger
+            sync += st_done.s.eq(~self.pi.exc_o.happened) # store done trigger
         with m.If(st_done.q):
             comb += reset_l.s.eq(stok)   # reset mode after 1 cycle
 
@@ -295,7 +314,7 @@ class PortInterfaceBase(Elaboratable):
 
         # after waiting one cycle (reset_l is "sync" mode), reset the port
         with m.If(reset_l.q):
-            comb += ld_active.r.eq(1)   # leave the ST active for 1 cycle
+            comb += ld_active.r.eq(1)   # leave the LD active for 1 cycle
             comb += st_active.r.eq(1)   # leave the ST active for 1 cycle
             comb += reset_l.r.eq(1)     # clear reset
             comb += adrok_l.r.eq(1)     # address reset
@@ -304,6 +323,7 @@ class PortInterfaceBase(Elaboratable):
         # monitor for an exception, clear busy immediately
         with m.If(self.pi.exc_o.happened):
             comb += busy_l.r.eq(1)
+            comb += reset_l.s.eq(1) # also reset whole unit
 
         # however ST needs one cycle before busy is reset
         #with m.If(self.pi.st.ok | self.pi.ld.ok):
@@ -315,7 +335,14 @@ class PortInterfaceBase(Elaboratable):
             comb += busy_l.r.eq(1)
 
         # busy latch outputs to interface
-        comb += pi.busy_o.eq(busy_l.q)
+        if hasattr(self, "external_busy"):
+            # when there is an extra (external) busy, include that here.
+            # this is used e.g. in LoadStore1 when an instruction fault
+            # is being processed (instr_fault) and stops Load/Store requests
+            # from being made until it's done
+            comb += pi.busy_o.eq(busy_l.q | self.external_busy(m))
+        else:
+            comb += pi.busy_o.eq(busy_l.q)
 
         return m
 
@@ -341,11 +368,11 @@ class TestMemoryPortInterface(PortInterfaceBase):
         # hard-code memory addressing width to 6 bits
         self.mem = TestMemory(regwid, 5, granularity=regwid//8, init=False)
 
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
         lsbaddr, msbaddr = self.splitaddr(addr)
         m.d.comb += self.mem.wrport.addr.eq(msbaddr)
 
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
         lsbaddr, msbaddr = self.splitaddr(addr)
         m.d.comb += self.mem.rdport.addr.eq(msbaddr)
 
index 31f84c2033153ff710ca13aafa73445e181eb46f..661b784d71f6a091757d21e8de7ebebc50b4e4d8 100644 (file)
@@ -1,7 +1,8 @@
 # based on microwatt plru.vhdl
 
-from nmigen import Elaboratable, Signal, Array, Module, Mux, Const
+from nmigen import Elaboratable, Signal, Array, Module, Mux, Const, Cat
 from nmigen.cli import rtlil
+from nmigen.lib.coding import Decoder
 
 
 class PLRU(Elaboratable):
@@ -52,6 +53,53 @@ class PLRU(Elaboratable):
     def ports(self):
         return [self.acc_en, self.lru_o, self.acc_i]
 
+
+class PLRUs(Elaboratable):
+    def __init__(self, cachetype, n_plrus, n_bits):
+        self.cachetype = cachetype
+        self.n_plrus = n_plrus
+        self.n_bits = n_bits
+        self.valid = Signal()
+        self.way = Signal(n_bits)
+        self.index = Signal(n_plrus.bit_length())
+        self.isel = Signal(n_plrus.bit_length())
+        self.o_index = Signal(n_bits)
+
+    def elaborate(self, platform):
+        """Generate TLB PLRUs
+        """
+        m = Module()
+        comb = m.d.comb
+
+        if self.n_plrus == 0:
+            return m
+
+        # Binary-to-Unary one-hot, enabled by valid
+        m.submodules.te = te = Decoder(self.n_plrus)
+        comb += te.n.eq(~self.valid)
+        comb += te.i.eq(self.index)
+
+        out = Array(Signal(self.n_bits, name="plru_out%d" % x) \
+                             for x in range(self.n_plrus))
+
+        for i in range(self.n_plrus):
+            # PLRU interface
+            name = "%s_plru_%d" % (self.cachetype, i)
+            m.submodules[name] = plru = PLRU(self.n_bits)
+
+            comb += plru.acc_en.eq(te.o[i])
+            comb += plru.acc_i.eq(self.way)
+            comb += out[i].eq(plru.lru_o)
+
+        # select output based on index
+        comb += self.o_index.eq(out[self.isel])
+
+        return m
+
+    def ports(self):
+        return [self.valid, self.way, self.index, self.isel, self.o_index]
+
+
 if __name__ == '__main__':
     dut = PLRU(2)
     vl = rtlil.convert(dut, ports=dut.ports())
@@ -59,3 +107,9 @@ if __name__ == '__main__':
         f.write(vl)
 
 
+    dut = PLRUs("testing", 4, 2)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_plrus.il", "w") as f:
+        f.write(vl)
+
+
index 2e6c734f0ebbc6137e46b001a5474ce54de7d210..d30a99dc65a28278c9c04eff1dcfb935da0e199d 100644 (file)
@@ -53,7 +53,7 @@ PROCESS_TABLE:
            RTS2 = 0x5
            RPDS = 12
 
-           PROCESS_TABLE_3       |     PROCESS_TABLE_3 //Hypervisor Userspace 
+0x1000010 :    PROCESS_TABLE_3       |     PROCESS_TABLE_3 //Hypervisor Userspace 
            0x40000000000300ad    |     0x0
             RTS1 = 0x2
            RPDB = 0x300
index f53669610f8e9f995c6e2ed8783155f305ef1640..0104290e4f84b07e02e24e40e3ec43f4a1f77480 100644 (file)
@@ -107,7 +107,7 @@ class CompUnitsBase(Elaboratable):
             self.addr_o = Signal(rwid, reset_less=True)
 
         # in/out register data (note: not register#, actual data)
-        self.data_o = Signal(rwid, reset_less=True)
+        self.o_data = Signal(rwid, reset_less=True)
         self.src1_i = Signal(rwid, reset_less=True)
         self.src2_i = Signal(rwid, reset_less=True)
         # input operand
@@ -152,8 +152,8 @@ class CompUnitsBase(Elaboratable):
 
         # merge (OR) all integer FU / ALU outputs to a single value
         if self.units:
-            data_o = treereduce(self.units, "data_o")
-            comb += self.data_o.eq(data_o)
+            o_data = treereduce(self.units, "o_data")
+            comb += self.o_data.eq(o_data)
             if self.ldstmode:
                 addr_o = treereduce(self.units, "addr_o")
                 comb += self.addr_o.eq(addr_o)
@@ -679,11 +679,11 @@ class Scoreboard(Elaboratable):
         # branch is active (TODO: a better signal: this is over-using the
         # go_write signal - actually the branch should not be "writing")
         with m.If(br1.go_wr_i):
-            sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+            sync += self.branch_direction_o.eq(br1.o_data+Const(1, 2))
             sync += bspec.active_i.eq(0)
             comb += bspec.br_i.eq(1)
             # branch occurs if data == 1, failed if data == 0
-            comb += bspec.br_ok_i.eq(br1.data_o == 1)
+            comb += bspec.br_ok_i.eq(br1.o_data == 1)
             for i in range(n_intfus):
                 # *expected* direction of the branch matched against *actual*
                 comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
@@ -698,9 +698,9 @@ class Scoreboard(Elaboratable):
         comb += int_src2.ren.eq(intfus.src2_rsel_o)
 
         # connect ALUs to regfile
-        comb += int_dest.data_i.eq(cu.data_o)
-        comb += cu.src1_i.eq(int_src1.data_o)
-        comb += cu.src2_i.eq(int_src2.data_o)
+        comb += int_dest.i_data.eq(cu.o_data)
+        comb += cu.src1_i.eq(int_src1.o_data)
+        comb += cu.src2_i.eq(int_src2.o_data)
 
         # connect ALU Computation Units
         comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
@@ -735,9 +735,9 @@ class IssueToScoreboard(Elaboratable):
         self.n_regs = n_regs
 
         mqbits = unsigned(int(log(qlen) / log(2))+2)
-        self.p_add_i = Signal(mqbits)  # instructions to add (from data_i)
-        self.p_ready_o = Signal()  # instructions were added
-        self.data_i = Instruction._nq(n_in, "data_i")
+        self.p_add_i = Signal(mqbits)  # instructions to add (from i_data)
+        self.p_o_ready = Signal()  # instructions were added
+        self.i_data = Instruction._nq(n_in, "i_data")
 
         self.busy_o = Signal(reset_less=True)  # at least one CU is busy
         self.qlen_o = Signal(mqbits, reset_less=True)
@@ -762,9 +762,9 @@ class IssueToScoreboard(Elaboratable):
 
         # link up instruction queue
         comb += iq.p_add_i.eq(self.p_add_i)
-        comb += self.p_ready_o.eq(iq.p_ready_o)
+        comb += self.p_o_ready.eq(iq.p_o_ready)
         for i in range(self.n_in):
-            comb += eq(iq.data_i[i], self.data_i[i])
+            comb += eq(iq.i_data[i], self.i_data[i])
 
         # take instruction and process it.  note that it's possible to
         # "inspect" the queue contents *without* actually removing the
@@ -793,7 +793,7 @@ class IssueToScoreboard(Elaboratable):
         # "resetting" done above (insn_i=0) could be re-ASSERTed.
         with m.If(iq.qlen_o != 0):
             # get the operands and operation
-            instr = iq.data_o[0]
+            instr = iq.o_data[0]
             imm = instr.imm_data.data
             dest = instr.write_reg.data
             src1 = instr.read_reg1.data
@@ -839,8 +839,8 @@ class IssueToScoreboard(Elaboratable):
         return m
 
     def __iter__(self):
-        yield self.p_ready_o
-        for o in self.data_i:
+        yield self.p_o_ready
+        for o in self.i_data:
             yield from list(o)
         yield self.p_add_i
 
@@ -853,16 +853,16 @@ def power_instr_q(dut, pdecode2, ins, code):
 
     sendlen = 1
     for idx, instr in enumerate(instrs):
-        yield dut.data_i[idx].eq(instr)
+        yield dut.i_data[idx].eq(instr)
         insn_type = yield instr.insn_type
         fn_unit = yield instr.fn_unit
         print("senddata ", idx, insn_type, fn_unit, instr)
     yield dut.p_add_i.eq(sendlen)
     yield
-    o_p_ready = yield dut.p_ready_o
+    o_p_ready = yield dut.p_o_ready
     while not o_p_ready:
         yield
-        o_p_ready = yield dut.p_ready_o
+        o_p_ready = yield dut.p_o_ready
 
     yield dut.p_add_i.eq(0)
 
@@ -881,24 +881,24 @@ def instr_q(dut, op, funit, op_imm, imm, src1, src2, dest,
         dest = instr['write_reg']
         insn_type = instr['insn_type']
         fn_unit = instr['fn_unit']
-        yield dut.data_i[idx].insn_type.eq(insn_type)
-        yield dut.data_i[idx].fn_unit.eq(fn_unit)
-        yield dut.data_i[idx].read_reg1.data.eq(reg1)
-        yield dut.data_i[idx].read_reg1.ok.eq(1)  # XXX TODO
-        yield dut.data_i[idx].read_reg2.data.eq(reg2)
-        yield dut.data_i[idx].read_reg2.ok.eq(1)  # XXX TODO
-        yield dut.data_i[idx].write_reg.data.eq(dest)
-        yield dut.data_i[idx].write_reg.ok.eq(1)  # XXX TODO
-        yield dut.data_i[idx].imm_data.data.eq(imm)
-        yield dut.data_i[idx].imm_data.ok.eq(op_imm)
-        di = yield dut.data_i[idx]
+        yield dut.i_data[idx].insn_type.eq(insn_type)
+        yield dut.i_data[idx].fn_unit.eq(fn_unit)
+        yield dut.i_data[idx].read_reg1.data.eq(reg1)
+        yield dut.i_data[idx].read_reg1.ok.eq(1)  # XXX TODO
+        yield dut.i_data[idx].read_reg2.data.eq(reg2)
+        yield dut.i_data[idx].read_reg2.ok.eq(1)  # XXX TODO
+        yield dut.i_data[idx].write_reg.data.eq(dest)
+        yield dut.i_data[idx].write_reg.ok.eq(1)  # XXX TODO
+        yield dut.i_data[idx].imm_data.data.eq(imm)
+        yield dut.i_data[idx].imm_data.ok.eq(op_imm)
+        di = yield dut.i_data[idx]
         print("senddata %d %x" % (idx, di))
     yield dut.p_add_i.eq(sendlen)
     yield
-    o_p_ready = yield dut.p_ready_o
+    o_p_ready = yield dut.p_o_ready
     while not o_p_ready:
         yield
-        o_p_ready = yield dut.p_ready_o
+        o_p_ready = yield dut.p_o_ready
 
     yield dut.p_add_i.eq(0)
 
index 85b8b45ccdef48feec95ea96707deaa1c2e71979..633de571101d0e2455f77f880929cc4ab84f5ec6 100644 (file)
@@ -20,7 +20,9 @@ from soc.experiment.compldst_multi import LDSTCompUnit
 from soc.experiment.compldst_multi import CompLDSTOpSubset
 from soc.experiment.l0_cache import TstL0CacheBuffer
 
-from soc.experiment.alu_hier import ALU, BranchALU
+# for testing purposes
+from soc.config.test.test_loadstore import TestMemPspec
+from soc.experiment.alu_hier import ALUFunctionUnit, BranchALU
 from soc.fu.alu.alu_input_record import CompALUOpSubset
 
 from openpower.decoder.power_enums import MicrOp, Function
@@ -91,9 +93,9 @@ class CompUnitsBase(Elaboratable):
         self.issue_i = Signal(n_units, reset_less=True)
         self.rd0 = go_record(n_units, "rd0")
         self.rd1 = go_record(n_units, "rd1")
-        self.go_rd_i = [self.rd0.go, self.rd1.go]  # XXX HACK!
+        self.go_rd_i = [self.rd0.go_i, self.rd1.go_i]  # XXX HACK!
         self.wr0 = go_record(n_units, "wr0")
-        self.go_wr_i = [self.wr0.go]
+        self.go_wr_i = [self.wr0.go_i]
         self.shadown_i = Signal(n_units, reset_less=True)
         self.go_die_i = Signal(n_units, reset_less=True)
         if ldstmode:
@@ -102,8 +104,8 @@ class CompUnitsBase(Elaboratable):
 
         # outputs
         self.busy_o = Signal(n_units, reset_less=True)
-        self.rd_rel_o = [self.rd0.rel, self.rd1.rel]  # HACK!
-        self.req_rel_o = self.wr0.rel
+        self.rd_rel_o = [self.rd0.rel_o, self.rd1.rel_o]  # HACK!
+        self.req_rel_o = self.wr0.rel_o
         self.done_o = Signal(n_units, reset_less=True)
         if ldstmode:
             self.ld_o = Signal(n_units, reset_less=True)  # op is LD
@@ -115,7 +117,7 @@ class CompUnitsBase(Elaboratable):
             self.addr_o = Signal(rwid, reset_less=True)
 
         # in/out register data (note: not register#, actual data)
-        self.data_o = Signal(rwid, reset_less=True)
+        self.o_data = Signal(rwid, reset_less=True)
         self.src1_i = Signal(rwid, reset_less=True)
         self.src2_i = Signal(rwid, reset_less=True)
         # input operand
@@ -151,16 +153,16 @@ class CompUnitsBase(Elaboratable):
             go_rd_l1.append(alu.go_rd_i[1])
             issue_l.append(alu.issue_i)
             busy_l.append(alu.busy_o)
-        comb += self.rd0.rel.eq(Cat(*rd_rel0_l))
-        comb += self.rd1.rel.eq(Cat(*rd_rel1_l))
+        comb += self.rd0.rel_o.eq(Cat(*rd_rel0_l))
+        comb += self.rd1.rel_o.eq(Cat(*rd_rel1_l))
         comb += self.req_rel_o.eq(Cat(*req_rel_l))
         comb += self.done_o.eq(Cat(*done_l))
         comb += self.busy_o.eq(Cat(*busy_l))
         comb += Cat(*godie_l).eq(self.go_die_i)
         comb += Cat(*shadow_l).eq(self.shadown_i)
-        comb += Cat(*go_wr_l).eq(self.wr0.go)  # XXX TODO
-        comb += Cat(*go_rd_l0).eq(self.rd0.go)
-        comb += Cat(*go_rd_l1).eq(self.rd1.go)
+        comb += Cat(*go_wr_l).eq(self.wr0.go_i)  # XXX TODO
+        comb += Cat(*go_rd_l0).eq(self.rd0.go_i)
+        comb += Cat(*go_rd_l1).eq(self.rd1.go_i)
         comb += Cat(*issue_l).eq(self.issue_i)
 
         # connect data register input/output
@@ -170,8 +172,8 @@ class CompUnitsBase(Elaboratable):
         # protected by a single go_wr.  multi-issue requires a bus
         # to be inserted here.
         if self.units:
-            data_o = ortreereduce(self.units, "data_o")
-            comb += self.data_o.eq(data_o)
+            o_data = ortreereduce(self.units, "o_data")
+            comb += self.o_data.eq(o_data)
             if self.ldstmode:
                 addr_o = ortreereduce(self.units, "addr_o")
                 comb += self.addr_o.eq(addr_o)
@@ -179,6 +181,10 @@ class CompUnitsBase(Elaboratable):
         for i, alu in enumerate(self.units):
             comb += alu.src1_i.eq(self.src1_i)
             comb += alu.src2_i.eq(self.src2_i)
+            # temporary: set read mask to 0b111111111
+            if hasattr(alu, "rdmaskn"):
+                with m.If(alu.busy_o):
+                    comb += alu.rdmaskn.eq(-1)
 
         if not self.ldstmode:
             return m
@@ -228,7 +234,7 @@ class CompUnitLDSTs(CompUnitsBase):
         # LD/ST Units
         units = []
         for i in range(n_ldsts):
-            pi = l0.l0.dports[i].pi
+            pi = l0.l0.dports[i]
             units.append(LDSTCompUnit(pi, rwid, awid=48))
 
         CompUnitsBase.__init__(self, rwid, units, ldstmode=True)
@@ -259,13 +265,12 @@ class CompUnitALUs(CompUnitsBase):
 
         # Int ALUs
         alus = []
-        for i in range(n_alus):
-            alus.append(ALU(rwid))
 
         units = []
-        for alu in alus:
-            aluopwid = 3  # extra bit for immediate mode
-            units.append(MultiCompUnit(rwid, alu, CompALUOpSubset))
+        for i in range(n_alus):
+            fu = ALUFunctionUnit(i)
+            units.append(fu)
+            alus.append(fu.alu)
 
         CompUnitsBase.__init__(self, rwid, units)
 
@@ -358,15 +363,15 @@ class FunctionUnits(Elaboratable):
             wpnd.append(Signal(nf, name="wr_dst%d_pend_o" %
                                j, reset_less=True))
 
-        self.dest_i = Array(dst)     # Dest in (top)
-        self.src_i = Array(src)      # oper in (top)
+        self.dest_i = dst     # Dest in (top)
+        self.src_i = src      # oper in (top)
 
         # for Register File Select Lines (horizontal), per-reg
-        self.dst_rsel_o = Array(dsel)  # dest reg (bot)
-        self.src_rsel_o = Array(rsel)  # src reg (bot)
+        self.dst_rsel_o = dsel  # dest reg (bot)
+        self.src_rsel_o = rsel  # src reg (bot)
 
-        self.go_rd_i = Array(rd)
-        self.go_wr_i = Array(wr)
+        self.go_rd_i = rd
+        self.go_wr_i = wr
 
         self.go_die_i = Signal(n_int_alus, reset_less=True)
         self.fn_issue_i = Signal(n_int_alus, reset_less=True)
@@ -436,7 +441,12 @@ class Scoreboard(Elaboratable):
         self.fpregs = RegFileArray(rwid, n_regs)
 
         # Memory (test for now)
-        self.l0 = TstL0CacheBuffer()
+        pspec = TestMemPspec(ldst_ifacetype='testpi',
+                             addr_wid=48,
+                             mask_wid=8,
+                             reg_wid=64)
+        dut = TstL0CacheBuffer(pspec)
+        self.l0 = TstL0CacheBuffer(pspec)
 
         # issue q needs to get at these
         self.aluissue = IssueUnitGroup(2)
@@ -558,10 +568,10 @@ class Scoreboard(Elaboratable):
                  ]
 
         # take these to outside (issue needs them)
-        comb += cua.op.eq_from_execute1(self.instr)
+        comb += cua.op.eq_from_execute1(self.instr.do)
         comb += cub.oper_i.eq(self.br_oper_i)
         comb += cub.imm_i.eq(self.br_imm_i)
-        comb += cul.op.eq_from_execute1(self.instr)
+        comb += cul.op.eq_from_execute1(self.instr.do)
 
         # TODO: issueunit.f (FP)
 
@@ -642,6 +652,7 @@ class Scoreboard(Elaboratable):
 
         # Group Picker... done manually for now.
         go_rd_o = ipick1.go_rd_o
+        delay_pick_l = []
         go_wr_o = ipick1.go_wr_o
         go_rd_i = intfus.go_rd_i
         go_wr_i = intfus.go_wr_i
@@ -659,8 +670,16 @@ class Scoreboard(Elaboratable):
         rrel_o = cu.rd_rel_o
         rqrl_o = cu.req_rel_o
         for i in range(fu_n_src):
-            comb += ipick1.rd_rel_i[i][0:n_intfus].eq(rrel_o[i][0:n_intfus])
+            # connect with a delay so that src data arrives at the right time
+            pick = Signal(n_intfus, name="pick_%d" % i)
+            delay_pick = Signal(n_intfus, name="dp_%d" % i)
+            rp = Signal(n_intfus, name="rp_%d" % i)
+            comb += pick[0:n_intfus].eq(rrel_o[i][0:n_intfus] & ~delay_pick)
+            comb += ipick1.rd_rel_i[i][0:n_intfus].eq(pick[0:n_intfus])
             comb += ipick1.readable_i[i][0:n_intfus].eq(int_rd_o[0:n_intfus])
+            sync += delay_pick.eq(rp)
+            comb += rp.eq(go_rd_o[i])
+            delay_pick_l.append(delay_pick)
         int_wr_o = intfus.writable_o
         for i in range(fu_n_dst):
             # XXX FIXME: rqrl_o[i] here
@@ -730,11 +749,11 @@ class Scoreboard(Elaboratable):
         # branch is active (TODO: a better signal: this is over-using the
         # go_write signal - actually the branch should not be "writing")
         with m.If(br1.go_wr_i):
-            sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+            sync += self.branch_direction_o.eq(br1.o_data+Const(1, 2))
             sync += bspec.active_i.eq(0)
             comb += bspec.br_i.eq(1)
             # branch occurs if data == 1, failed if data == 0
-            comb += bspec.br_ok_i.eq(br1.data_o == 1)
+            comb += bspec.br_ok_i.eq(br1.o_data == 1)
             for i in range(n_intfus):
                 # *expected* direction of the branch matched against *actual*
                 comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
@@ -749,13 +768,13 @@ class Scoreboard(Elaboratable):
         comb += int_src2.ren.eq(intfus.src_rsel_o[1])
 
         # connect ALUs to regfile
-        comb += int_dest.data_i.eq(cu.data_o)
-        comb += cu.src1_i.eq(int_src1.data_o)
-        comb += cu.src2_i.eq(int_src2.data_o)
+        comb += int_dest.i_data.eq(cu.o_data)
+        comb += cu.src1_i.eq(int_src1.o_data)
+        comb += cu.src2_i.eq(int_src2.o_data)
 
         # connect ALU Computation Units
         for i in range(fu_n_src):
-            comb += cu.go_rd_i[i][0:n_intfus].eq(go_rd_o[i][0:n_intfus])
+            comb += cu.go_rd_i[i][0:n_intfus].eq(delay_pick_l[i][0:n_intfus])
         for i in range(fu_n_dst):
             comb += cu.go_wr_i[i][0:n_intfus].eq(go_wr_o[i][0:n_intfus])
         comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
@@ -788,9 +807,9 @@ class IssueToScoreboard(Elaboratable):
         self.n_regs = n_regs
 
         mqbits = unsigned(int(log(qlen) / log(2))+2)
-        self.p_add_i = Signal(mqbits)  # instructions to add (from data_i)
-        self.p_ready_o = Signal()  # instructions were added
-        self.data_i = Instruction._nq(n_in, "data_i")
+        self.p_add_i = Signal(mqbits)  # instructions to add (from i_data)
+        self.p_o_ready = Signal()  # instructions were added
+        self.i_data = Instruction._nq(n_in, "i_data")
 
         self.busy_o = Signal(reset_less=True)  # at least one CU is busy
         self.qlen_o = Signal(mqbits, reset_less=True)
@@ -815,9 +834,9 @@ class IssueToScoreboard(Elaboratable):
 
         # link up instruction queue
         comb += iq.p_add_i.eq(self.p_add_i)
-        comb += self.p_ready_o.eq(iq.p_ready_o)
+        comb += self.p_o_ready.eq(iq.p_o_ready)
         for i in range(self.n_in):
-            comb += eq(iq.data_i[i], self.data_i[i])
+            comb += eq(iq.i_data[i], self.i_data[i])
 
         # take instruction and process it.  note that it's possible to
         # "inspect" the queue contents *without* actually removing the
@@ -846,14 +865,14 @@ class IssueToScoreboard(Elaboratable):
         # "resetting" done above (insn_i=0) could be re-ASSERTed.
         with m.If(iq.qlen_o != 0):
             # get the operands and operation
-            instr = iq.data_o[0]
-            imm = instr.imm_data.data
+            instr = iq.o_data[0]
+            imm = instr.do.imm_data.data
             dest = instr.write_reg.data
             src1 = instr.read_reg1.data
             src2 = instr.read_reg2.data
-            op = instr.insn_type
-            fu = instr.fn_unit
-            opi = instr.imm_data.ok  # immediate set
+            op = instr.do.insn_type
+            fu = instr.do.fn_unit
+            opi = instr.do.imm_data.ok  # immediate set
 
             # set the src/dest regs
             comb += sc.int_dest_i.eq(dest)
@@ -885,8 +904,8 @@ class IssueToScoreboard(Elaboratable):
         return m
 
     def __iter__(self):
-        yield self.p_ready_o
-        for o in self.data_i:
+        yield self.p_o_ready
+        for o in self.i_data:
             yield from list(o)
         yield self.p_add_i
 
@@ -899,16 +918,16 @@ def power_instr_q(dut, pdecode2, ins, code):
 
     sendlen = 1
     for idx, instr in enumerate(instrs):
-        yield dut.data_i[idx].eq(instr)
-        insn_type = yield instr.insn_type
-        fn_unit = yield instr.fn_unit
+        yield dut.i_data[idx].eq(instr)
+        insn_type = yield instr.do.insn_type
+        fn_unit = yield instr.do.fn_unit
         print("senddata ", idx, insn_type, fn_unit, instr)
     yield dut.p_add_i.eq(sendlen)
     yield
-    o_p_ready = yield dut.p_ready_o
+    o_p_ready = yield dut.p_o_ready
     while not o_p_ready:
         yield
-        o_p_ready = yield dut.p_ready_o
+        o_p_ready = yield dut.p_o_ready
 
     yield dut.p_add_i.eq(0)
 
@@ -927,24 +946,24 @@ def instr_q(dut, op, funit, op_imm, imm, src1, src2, dest,
         dest = instr['write_reg']
         insn_type = instr['insn_type']
         fn_unit = instr['fn_unit']
-        yield dut.data_i[idx].insn_type.eq(insn_type)
-        yield dut.data_i[idx].fn_unit.eq(fn_unit)
-        yield dut.data_i[idx].read_reg1.data.eq(reg1)
-        yield dut.data_i[idx].read_reg1.ok.eq(1)  # XXX TODO
-        yield dut.data_i[idx].read_reg2.data.eq(reg2)
-        yield dut.data_i[idx].read_reg2.ok.eq(1)  # XXX TODO
-        yield dut.data_i[idx].write_reg.data.eq(dest)
-        yield dut.data_i[idx].write_reg.ok.eq(1)  # XXX TODO
-        yield dut.data_i[idx].imm_data.data.eq(imm)
-        yield dut.data_i[idx].imm_data.ok.eq(op_imm)
-        di = yield dut.data_i[idx]
-        print("senddata %d %x" % (idx, di))
+        yield dut.i_data[idx].do.insn_type.eq(insn_type)
+        yield dut.i_data[idx].do.fn_unit.eq(fn_unit)
+        yield dut.i_data[idx].read_reg1.data.eq(reg1)
+        yield dut.i_data[idx].read_reg1.ok.eq(1)  # XXX TODO
+        yield dut.i_data[idx].read_reg2.data.eq(reg2)
+        yield dut.i_data[idx].read_reg2.ok.eq(1)  # XXX TODO
+        yield dut.i_data[idx].write_reg.data.eq(dest)
+        yield dut.i_data[idx].write_reg.ok.eq(1)  # XXX TODO
+        yield dut.i_data[idx].do.imm_data.data.eq(imm)
+        yield dut.i_data[idx].do.imm_data.ok.eq(op_imm)
+        #di = yield dut.i_data[idx]
+        #print("senddata %d %x" % (idx, di))
     yield dut.p_add_i.eq(sendlen)
     yield
-    o_p_ready = yield dut.p_ready_o
+    o_p_ready = yield dut.p_o_ready
     while not o_p_ready:
         yield
-        o_p_ready = yield dut.p_ready_o
+        o_p_ready = yield dut.p_o_ready
 
     yield dut.p_add_i.eq(0)
 
@@ -1170,7 +1189,7 @@ def power_sim(m, dut, pdecode2, instruction, alusim):
 
                     ]
 
-        with Program(lst) as program:
+        with Program(lst, bigendian=False) as program:
             gen = program.generate_instructions()
 
             # issue instruction(s), wait for issue to be free before proceeding
@@ -1237,7 +1256,7 @@ def scoreboard_sim(dut, alusim):
                            0, 0, (0, 0)))
             instrs.append((5, 3, 3, MicrOp.OP_ADD, Function.ALU,
                            0, 0, (0, 0)))
-        if False:
+        if True:
             instrs.append((3, 5, 5, MicrOp.OP_MUL_L64, Function.ALU,
                            1, 7, (0, 0)))
         if False:
@@ -1335,8 +1354,9 @@ def scoreboard_sim(dut, alusim):
             instrs.append((6, 7, 7, 0, 0, (0, 0)))
 
         # issue instruction(s), wait for issue to be free before proceeding
+        print("instructions", instrs)
         for i, instr in enumerate(instrs):
-            print(i, instr)
+            print("issue instruction", i, instr)
             src1, src2, dest, op, fn_unit, opi, imm, (br_ok, br_fail) = instr
 
             print("instr %d: (%d, %d, %d, %s, %s, %d, %d)" %
@@ -1386,11 +1406,11 @@ def test_scoreboard():
     with open("test_scoreboard6600.il", "w") as f:
         f.write(vl)
 
-    run_simulation(m, power_sim(m, dut, pdecode2, instruction, alusim),
-                   vcd_name='test_powerboard6600.vcd')
+    #run_simulation(m, power_sim(m, dut, pdecode2, instruction, alusim),
+    #               vcd_name='test_powerboard6600.vcd')
 
-    run_simulation(dut, scoreboard_sim(dut, alusim),
-                  vcd_name='test_scoreboard6600.vcd')
+    run_simulation(dut, scoreboard_sim(dut, alusim),
+                  vcd_name='test_scoreboard6600.vcd')
 
     # run_simulation(dut, scoreboard_branch_sim(dut, alusim),
     #                    vcd_name='test_scoreboard6600.vcd')
index 0547bda6e0bce9dda11d9bace38b0e52d3999cc3..d96cb54dff34f417755c215e9466f81ef078e5bd 100644 (file)
@@ -44,11 +44,13 @@ class RegSim:
             src2 = self.regs[src2] & maxbits
         if op == MicrOp.OP_ADD:
             val = src1 + src2
+            print("    add src1, src2", src1, src2, val)
         elif op == MicrOp.OP_MUL_L64:
             val = src1 * src2
-            print("mul src1, src2", src1, src2, val)
+            print("    mul src1, src2", src1, src2, val)
         elif op == ISUB:
             val = src1 - src2
+            print("    sub src1, src2", src1, src2, val)
         elif op == ISHF:
             val = src1 >> (src2 & maxbits)
         elif op == IBGT:
diff --git a/src/soc/experiment/test/pagetables.py b/src/soc/experiment/test/pagetables.py
new file mode 100644 (file)
index 0000000..e481dd4
--- /dev/null
@@ -0,0 +1,166 @@
+def b(x): # byte-reverse function
+    return int.from_bytes(x.to_bytes(8, byteorder='little'),
+                          byteorder='big', signed=False)
+
+test1 = {
+           0x10000:    # PARTITION_TABLE_2
+                       # PATB_GR=1 PRTB=0x1000 PRTS=0xb
+           b(0x800000000100000b),
+
+           0x30000:     # RADIX_ROOT_PTE
+                        # V = 1 L = 0 NLB = 0x400 NLS = 9
+           b(0x8000000000040009),
+
+           0x40000:     # RADIX_SECOND_LEVEL
+                        # V = 1 L = 1 SW = 0 RPN = 0
+                        # R = 1 C = 1 ATT = 0 EAA 0x3
+           b(0xc000000000000183),
+
+           0x1000000:   # PROCESS_TABLE_3
+                        # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
+           b(0x40000000000300ad),
+
+           #0x10004: 0
+
+}
+
+
+# executable permission is barred here (EAA=0x2)
+test2 = {
+           0x10000:    # PARTITION_TABLE_2
+                       # PATB_GR=1 PRTB=0x1000 PRTS=0xb
+           b(0x800000000100000b),
+
+           0x30000:     # RADIX_ROOT_PTE
+                        # V = 1 L = 0 NLB = 0x400 NLS = 9
+           b(0x8000000000040009),
+
+           0x40000:     # RADIX_SECOND_LEVEL
+                        # V = 1 L = 1 SW = 0 RPN = 0
+                        # R = 1 C = 1 ATT = 0 EAA 0x2
+           b(0xc000000000000182),
+
+           0x1000000:   # PROCESS_TABLE_3
+                        # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
+           b(0x40000000000300ad),
+
+           #0x10004: 0
+
+}
+
+
+# microwatt mmu.bin first part of test 2. PRTBL must be set to 0x12000, PID to 1
+microwatt_test2 = {
+             0x13920: 0x86810000000000c0, # leaf node
+             0x10000: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+             0x8108: 0x0000000badc0ffee,  # memory to be looked up
+            }
+
+microwatt_test4 = {
+             0x13858: 0x86a10000000000c0, # leaf node
+             0x10000: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+}
+
+# microwatt mmu.bin test 5: a misaligned read which crosses over to a TLB that
+# is not valid.  must attempt a 64-bit read at address 0x39fffd to trigger
+
+microwatt_test5 = {
+             0x13cf8: 0x86b10000000000c0, # leaf, covers up to 0x39ffff
+             0x10008: 0x0930010000000080, # directory node
+             0x12010: 0x0a00010000000000, # page table
+             0x39fff8: 0x0123456badc0ffee,  # to be looked up (should fail)
+             0x400000: 0x0123456badc0ffee,  # not page-mapped
+}
+
+# linux kernel 5.7 first MMU enable
+"""
+                          rd @ 000bf803 di b000000000001033 sel ff 3.......
+                          rd @ 000bf804 di                0 sel ff ........
+                          rd @ 000bf805 di                0 sel ff ........
+                          rd @ 000bf806 di            10000 sel ff ........
+                          rd @ 000bf807 di c0000000005fc380 sel ff ........
+                          rd @ 000bf800 di         80000000 sel ff ........
+                          rd @ 000bf801 di c00000000059d400 sel ff ..Y.....
+                          rd @ 000bf802 di c000000000000000 sel ff ........
+pc     a588 insn 7c7a03a6 msr a000000000000003
+pc     a58c insn 7c9b03a6 msr a000000000000003
+pc     a590 insn 4c000024 msr a000000000000003
+pc     a598 insn f82d0190 msr b000000000000033
+                          rd @ 01c00000 di ad005c0000000040 sel ff ........
+                          rd @ 01c00001 di                0 sel ff ........
+                          rd @ 01c00002 di                0 sel ff ........
+                          rd @ 01c00003 di                0 sel ff ........
+                          rd @ 01c00004 di                0 sel ff ........
+                          rd @ 01c00005 di                0 sel ff ........
+                          rd @ 01c00006 di                0 sel ff ........
+                          rd @ 01c00007 di                0 sel ff ........
+                          rd @ 000b8000 di  9e0ff0f00000080 sel ff ........
+                          rd @ 000b8001 di                0 sel ff ........
+                          rd @ 000b8002 di                0 sel ff ........
+                          rd @ 000b8003 di                0 sel ff ........
+                          rd @ 000b8004 di                0 sel ff ........
+                          rd @ 000b8005 di                0 sel ff ........
+                          rd @ 000b8006 di                0 sel ff ........
+                          rd @ 000b8007 di                0 sel ff ........
+                          rd @ 01fffc00 di  9d0ff0f00000080 sel ff ........
+                          rd @ 01fffc01 di                0 sel ff ........
+                          rd @ 01fffc02 di                0 sel ff ........
+                          rd @ 01fffc03 di                0 sel ff ........
+                          rd @ 01fffc04 di                0 sel ff ........
+                          rd @ 01fffc05 di                0 sel ff ........
+                          rd @ 01fffc06 di                0 sel ff ........
+                          rd @ 01fffc07 di                0 sel ff ........
+                          rd @ 01fffa00 di 8f010000000000c0 sel ff ........
+                          rd @ 01fffa01 di 8f012000000000c0 sel ff ........
+                          rd @ 01fffa02 di 8f014000000000c0 sel ff ........
+                          rd @ 01fffa03 di 8e016000000000c0 sel ff ........
+                          rd @ 01fffa04 di 8e018000000000c0 sel ff ........
+                          rd @ 01fffa05 di 8e01a000000000c0 sel ff ........
+                          rd @ 01fffa06 di 8e01c000000000c0 sel ff ........
+                          rd @ 01fffa07 di 8e01e000000000c0 sel ff ........
+"""
+
+microwatt_linux_5_7_boot = {
+                  0x000bf803<<3: 0xb000000000001033,
+                  0x000bf804<<3: 0x0,
+                  0x000bf805<<3: 0x0,
+                  0x000bf806<<3: 0x10000,
+                  0x000bf807<<3: 0xc0000000005fc380,
+                  0x000bf800<<3: 0x80000000,
+                  0x000bf801<<3: 0xc00000000059d400,
+                  0x000bf802<<3: 0xc000000000000000,
+                  0x01c00000<<3: 0xad005c0000000040,
+                  0x01c00001<<3: 0x0,
+                  0x01c00002<<3: 0x0,
+                  0x01c00003<<3: 0x0,
+                  0x01c00004<<3: 0x0,
+                  0x01c00005<<3: 0x0,
+                  0x01c00006<<3: 0x0,
+                  0x01c00007<<3: 0x0,
+                  0x000b8000<<3: 0x09e0ff0f00000080,
+                  0x000b8001<<3: 0x0,
+                  0x000b8002<<3: 0x0,
+                  0x000b8003<<3: 0x0,
+                  0x000b8004<<3: 0x0,
+                  0x000b8005<<3: 0x0,
+                  0x000b8006<<3: 0x0,
+                  0x000b8007<<3: 0x0,
+                  0x01fffc00<<3: 0x09d0ff0f00000080,
+                  0x01fffc01<<3: 0x0,
+                  0x01fffc02<<3: 0x0,
+                  0x01fffc03<<3: 0x0,
+                  0x01fffc04<<3: 0x0,
+                  0x01fffc05<<3: 0x0,
+                  0x01fffc06<<3: 0x0,
+                  0x01fffc07<<3: 0x0,
+                  0x01fffa00<<3: 0x8f010000000000c0,
+                  0x01fffa01<<3: 0x8f012000000000c0,
+                  0x01fffa02<<3: 0x8f014000000000c0,
+                  0x01fffa03<<3: 0x8e016000000000c0,
+                  0x01fffa04<<3: 0x8e018000000000c0,
+                  0x01fffa05<<3: 0x8e01a000000000c0,
+                  0x01fffa06<<3: 0x8e01c000000000c0,
+                  0x01fffa07<<3: 0x8e01e000000000c0,
+}
index 2f858b6cfa2f628821177bf8bceeb4b57bcb8e38..2f2c51d1c18888187c4d540e54fb5604d9b8e236 100644 (file)
@@ -58,6 +58,7 @@ class OperandProducer:
         # transaction parameters, passed via signals
         self.delay = Signal(8)
         self.data = Signal.like(self.port)
+        self.data_valid = False
         # add ourselves to the simulation process list
         sim.add_sync_process(self._process)
 
@@ -72,6 +73,7 @@ class OperandProducer:
                 yield
                 yield Settle()
             # read the transaction parameters
+            assert self.data_valid, "an unexpected operand was consumed"
             delay = (yield self.delay)
             data = (yield self.data)
             # wait for `delay` cycles
@@ -82,6 +84,7 @@ class OperandProducer:
             yield self.port.eq(data)
             yield self.count.eq(self.count + 1)
             yield
+            self.data_valid = False
             yield self.go_i.eq(0)
             yield self.port.eq(0)
 
@@ -99,6 +102,7 @@ class OperandProducer:
         """
         yield self.data.eq(data)
         yield self.delay.eq(delay)
+        self.data_valid = True
 
 
 class ResultConsumer:
@@ -127,6 +131,7 @@ class ResultConsumer:
         # transaction parameters, passed via signals
         self.delay = Signal(8)
         self.expected = Signal.like(self.port)
+        self.expecting = False
         # add ourselves to the simulation process list
         sim.add_sync_process(self._process)
 
@@ -141,6 +146,7 @@ class ResultConsumer:
                 yield
                 yield Settle()
             # read the transaction parameters
+            assert self.expecting, "an unexpected result was produced"
             delay = (yield self.delay)
             expected = (yield self.expected)
             # wait for `delay` cycles
@@ -171,6 +177,7 @@ class ResultConsumer:
         """
         yield self.expected.eq(expected)
         yield self.delay.eq(delay)
+        self.expecting = True
 
 
 def op_sim(dut, a, b, op, inv_a=0, imm=0, imm_ok=0, zero_a=0):
@@ -457,13 +464,6 @@ def scoreboard_sim(op):
                         wrmask=[0, 1],
                         src_delays=[2, 0], dest_delays=[1, 0])
 
-    # test combinatorial zero-delay operation
-    # In the test ALU, any operation other than ADD, MUL, EXTS or SHR
-    # is zero-delay, and do a subtraction.
-    # 5 - 2 = 3
-    yield from op.issue([5, 2], MicrOp.OP_CMP, [3, 0],
-                        wrmask=[0, 1],
-                        src_delays=[0, 1], dest_delays=[2, 0])
     # test all combinations of masked input ports
     # NOP does not make any request nor response
     yield from op.issue([5, 2], MicrOp.OP_NOP, [0, 0],
@@ -477,6 +477,15 @@ def scoreboard_sim(op):
     yield from op.issue([2, 0x80], MicrOp.OP_EXTSWSLI, [0xFF80, 0],
                         rdmaskn=[1, 0], wrmask=[0, 1],
                         src_delays=[1, 2], dest_delays=[1, 0])
+
+    # test combinatorial zero-delay operation
+    # In the test ALU, any operation other than ADD, MUL, EXTS or SHR
+    # is zero-delay, and do a subtraction.
+    # 5 - 2 = 3
+    yield from op.issue([5, 2], MicrOp.OP_CMP, [3, 0],
+                        wrmask=[0, 1],
+                        src_delays=[0, 1], dest_delays=[2, 0])
+
     # test with rc=1, so expect results on the CR output port
     # 5 + 2 = 7
     # 7 > 0 => CR = 0b100
@@ -520,19 +529,19 @@ def test_compunit_fsm():
             ('prev port', 'in', [
                 'op__sdir', 'p_data_i[7:0]', 'p_shift_i[7:0]',
                 ({'submodule': 'p'},
-                    ['p_valid_i', 'p_ready_o'])]),
+                    ['p_i_valid', 'p_o_ready'])]),
             ('next port', 'out', [
                 'n_data_o[7:0]',
                 ({'submodule': 'n'},
-                    ['n_valid_o', 'n_ready_i'])])]),
-        ('debug', {'module': 'top'},
+                    ['n_o_valid', 'n_i_ready'])])]),
+        ('debug', {'module': 'bench'},
             ['src1_count[7:0]', 'src2_count[7:0]', 'dest1_count[7:0]'])]
 
     write_gtkw(
         "test_compunit_fsm1.gtkw",
         "test_compunit_fsm1.vcd",
         traces, style,
-        module='top.cu'
+        module='bench.top.cu'
     )
     m = Module()
     alu = Shifter(8)
@@ -650,15 +659,15 @@ def test_compunit_regspec3():
         ('alu', {'submodule': 'alu'}, [
             ('prev port', 'in', [
                 'oper_i_None__insn_type', 'i1[15:0]',
-                'valid_i', 'ready_o']),
+                'i_valid', 'o_ready']),
             ('next port', 'out', [
-                'alu_o[15:0]', 'valid_o', 'ready_i'])])]
+                'alu_o[15:0]', 'o_valid', 'i_ready'])])]
 
     write_gtkw("test_compunit_regspec3.gtkw",
                "test_compunit_regspec3.vcd",
                traces, style,
                clk_period=1e-6,
-               module='top.cu')
+               module='bench.top.cu')
 
     inspec = [('INT', 'a', '0:15'),
               ('INT', 'b', '0:15'),
@@ -725,18 +734,18 @@ def test_compunit_regspec1():
         ('alu', {'submodule': 'alu'}, [
             ('prev port', 'in', [
                 'op__insn_type', 'op__invert_in', 'a[15:0]', 'b[15:0]',
-                'valid_i', 'ready_o']),
+                'i_valid', 'o_ready']),
             ('next port', 'out', [
-                'alu_o[15:0]', 'valid_o', 'ready_i',
+                'alu_o[15:0]', 'o_valid', 'i_ready',
                 'alu_o_ok', 'alu_cr_ok'])]),
-        ('debug', {'module': 'top'},
+        ('debug', {'module': 'bench'},
             ['src1_count[7:0]', 'src2_count[7:0]', 'dest1_count[7:0]'])]
 
     write_gtkw("test_compunit_regspec1.gtkw",
                "test_compunit_regspec1.vcd",
                traces, style,
                clk_period=1e-6,
-               module='top.cu')
+               module='bench.top.cu')
 
     inspec = [('INT', 'a', '0:15'),
               ('INT', 'b', '0:15')]
index f173714f97bf81ee8b0c1df95368be0405424269..a0d2372a30dc5cd81adbc62e48339d0708a8c737 100644 (file)
 import unittest
 from nmigen import Module
 from nmigen.sim import Simulator
+from nmutil.gtkw import write_gtkw
+
+from openpower.consts import MSR
+from openpower.decoder.power_enums import MicrOp, LDSTMode
+
 from soc.experiment.compldst_multi import LDSTCompUnit
 from soc.experiment.pimem import PortInterface
+from soc.experiment.test.test_compalu_multi import OperandProducer
 from soc.fu.ldst.pipe_data import LDSTPipeSpec
 
 
+class OpSim:
+    def __init__(self, dut, sim):
+        self.dut = dut
+        # create one operand producer for each input port
+        self.producers = list()
+        for i in range(len(dut.src_i)):
+            self.producers.append(OperandProducer(sim, dut, i))
+
+    def issue(self, op, ra=None, rb=None, rc=None,
+              zero_a=False, imm=None, update=False,
+              byterev=True, signext=False,
+              data_len=2, msr_pr=0,
+              delays=None):
+        assert zero_a == (ra is None), \
+            "ra and zero_a are mutually exclusive"
+        assert (rb is None) != (imm is None), \
+            "rb and imm are mutually exclusive"
+        if op == MicrOp.OP_STORE:
+            assert rc, "need source operand for store"
+        dut = self.dut
+        pi = dut.pi
+        producers = self.producers
+        if ra:
+            yield from producers[0].send(ra, delays['ra'])
+        if rb:
+            yield from producers[1].send(rb, delays['rb'])
+        if rc:
+            yield from producers[2].send(rc, delays['rc'])
+        yield dut.oper_i.insn_type.eq(op)
+        yield dut.oper_i.data_len.eq(data_len)
+        yield dut.oper_i.zero_a.eq(zero_a)
+        yield dut.oper_i.byte_reverse.eq(byterev)
+        yield dut.oper_i.sign_extend.eq(signext)
+        if imm is not None:
+            yield dut.oper_i.imm_data.data.eq(imm)
+            yield dut.oper_i.imm_data.ok.eq(1)
+        if update:
+            yield dut.oper_i.ldst_mode.eq(LDSTMode.update)
+        yield dut.oper_i.msr[MSR.PR].eq(msr_pr)
+        yield dut.issue_i.eq(1)
+        yield
+        yield dut.issue_i.eq(0)
+        # deactivate decoder inputs along with issue_i, so we can be sure they
+        # were latched at the correct cycle
+        yield dut.oper_i.insn_type.eq(0)
+        yield dut.oper_i.data_len.eq(0)
+        yield dut.oper_i.zero_a.eq(0)
+        yield dut.oper_i.byte_reverse.eq(0)
+        yield dut.oper_i.sign_extend.eq(0)
+        yield dut.oper_i.imm_data.data.eq(0)
+        yield dut.oper_i.imm_data.ok.eq(0)
+        yield dut.oper_i.ldst_mode.eq(LDSTMode.NONE)
+        yield dut.oper_i.msr[MSR.PR].eq(0)
+        while not (yield pi.addr.ok):
+            yield
+
+
+# FIXME: AttributeError: type object 'LDSTPipeSpec' has no attribute 'regspec'
+@unittest.skip('broken')
 class TestLDSTCompUnit(unittest.TestCase):
 
     def test_ldst_compunit(self):
         m = Module()
         pi = PortInterface(name="pi")
         regspec = LDSTPipeSpec.regspec
-        dut = LDSTCompUnit(pi, regspec)
+        dut = LDSTCompUnit(pi, regspec, name="ldst")
         m.submodules.dut = dut
         sim = Simulator(m)
         sim.add_clock(1e-6)
+        op = OpSim(dut, sim)
+        self.write_gtkw()
 
         def process():
-            yield
+            yield from op.issue(MicrOp.OP_STORE, ra=1, rb=2, rc=3,
+                                delays={'ra': 1, 'rb': 2, 'rc': 5})
 
         sim.add_sync_process(process)
         sim_writer = sim.write_vcd("test_ldst_compunit.vcd")
         with sim_writer:
             sim.run()
 
+    @classmethod
+    def write_gtkw(cls):
+        style = {'dec': {'base': 'dec'}}
+        traces = [
+            'clk',
+            ('state latches', [
+                'q_opc',
+                ('q_src[2:0]', {'bit': 2}),
+                ('q_src[2:0]', {'bit': 1}),
+                ('q_src[2:0]', {'bit': 0}),
+                'q_alu', 'q_adr', 'qn_lod', 'q_sto',
+                'q_wri', 'q_upd', 'q_rst',  'q_lsd'
+            ]),
+            ('operation', [
+                ('oper_i_ldst__insn_type', {'display': 'insn_type'}),
+                ('oper_i_ldst__ldst_mode', {'display': 'ldst_mode'}),
+                ('oper_i_ldst__zero_a', {'display': 'zero_a'}),
+                ('oper_i_ldst__imm_data__ok', {'display': 'imm_data_ok'}),
+                ('oper_i_ldst__imm_data__data[63:0]', 'dec',
+                 {'display': 'imm_data_data'})
+            ]),
+            'cu_issue_i', 'cu_busy_o',
+            ('address ALU', [
+                ('cu_rd__rel_o[2:0]', {'bit': 2}),
+                ('cu_rd__go_i[2:0]', {'bit': 2}),
+                ('src1_i[63:0]', 'dec'),
+                ('cu_rd__rel_o[2:0]', {'bit': 1}),
+                ('cu_rd__go_i[2:0]', {'bit': 1}),
+                ('src2_i[63:0]', 'dec'),
+                'alu_valid', 'alu_ok', ('alu_o[63:0]', 'dec'),
+                'cu_ad__rel_o', 'cu_ad__go_i',
+                'pi_addr_i_ok', ('pi_addr_i[47:0]', 'dec'),
+            ]),
+            ('store operand', [
+                ('cu_rd__rel_o[2:0]', {'bit': 0}),
+                ('cu_rd__go_i[2:0]', {'bit': 0}),
+                ('src3_i[63:0]', 'dec'),
+                'rd_done',
+            ]),
+            'cu_st__rel_o', 'cu_st__go_i'
+        ]
+        write_gtkw("test_ldst_compunit.gtkw",
+                   "test_ldst_compunit.vcd",
+                   traces, style, module="top.dut")
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/src/soc/experiment/test/test_compldst_multi_mmu.py b/src/soc/experiment/test/test_compldst_multi_mmu.py
new file mode 100644 (file)
index 0000000..f3a3421
--- /dev/null
@@ -0,0 +1,266 @@
+# test case for LOAD / STORE Computation Unit using MMU
+
+from nmigen.sim import Simulator, Delay, Settle, Tick
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl
+from nmigen.hdl.rec import Record, Layout
+
+from nmutil.latch import SRLatch, latchregister
+from nmutil.byterev import byte_reverse
+from nmutil.extend import exts
+from nmutil.util import wrap
+from soc.fu.regspec import RegSpecAPI
+
+from openpower.decoder.power_enums import MicrOp, Function, LDSTMode
+from soc.fu.ldst.ldst_input_record import CompLDSTOpSubset
+from openpower.decoder.power_decoder2 import Data
+from openpower.consts import MSR
+
+from soc.experiment.compalu_multi import go_record, CompUnitRecord
+from soc.experiment.l0_cache import PortInterface
+from soc.experiment.pimem import LDSTException
+from soc.experiment.compldst_multi import LDSTCompUnit, load, store
+from soc.config.test.test_loadstore import TestMemPspec
+
+from soc.experiment.mmu import MMU
+from nmutil.util import Display
+
+from soc.config.loadstore import ConfigMemoryPortInterface
+from soc.experiment.test import pagetables
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+
+
+########################################
+
+def wait_for_debug(sig, reason, wait=True, test1st=False):
+    v = (yield sig)
+    cnt = 0
+    print("wait for", reason, sig, v, wait, test1st)
+    if test1st and bool(v) == wait:
+        return
+    while True:
+        cnt = cnt + 1
+        if cnt > 15:
+            raise(Exception(reason))
+            break
+        yield
+        v = (yield sig)
+        #print("...wait for", sig, v)
+        if bool(v) == wait:
+            break
+
+def store_debug(dut, src1, src2, src3, imm, imm_ok=True, update=False,
+          byterev=True,dcbz=False):
+    print("cut here ======================================")
+    print("ST", src1, src2, src3, imm, imm_ok, update)
+    if dcbz:
+        yield dut.oper_i.insn_type.eq(MicrOp.OP_DCBZ)
+    else:
+        yield dut.oper_i.insn_type.eq(MicrOp.OP_STORE)
+    yield dut.oper_i.data_len.eq(2)  # half-word
+    yield dut.oper_i.byte_reverse.eq(byterev)
+    yield dut.src1_i.eq(src1)
+    yield dut.src2_i.eq(src2)
+    yield dut.src3_i.eq(src3)
+    yield dut.oper_i.imm_data.data.eq(imm)
+    yield dut.oper_i.imm_data.ok.eq(imm_ok)
+    #guess: this one was removed -- yield dut.oper_i.update.eq(update)
+    yield dut.issue_i.eq(1)
+    yield
+    yield dut.issue_i.eq(0)
+
+    if imm_ok:
+        active_rel = 0b101
+    else:
+        active_rel = 0b111
+    if dcbz:
+        active_rel = 0b001 # may be wrong, verify
+
+    # wait for all active rel signals to come up
+    cnt = 0
+    while True:
+        rel = yield dut.rd.rel_o # guess: wrong in dcbz case
+        cnt = cnt + 1
+        print("waitActiveRel",cnt)
+        if cnt > 10:
+            raise(Exception("Error1"))
+        print("rel EQ active_rel ?",rel,active_rel)
+        if rel == active_rel:
+            break
+        yield
+    yield dut.rd.go_i.eq(active_rel)
+    yield
+    yield dut.rd.go_i.eq(0)
+
+    yield from wait_for_debug(dut.adr_rel_o, "addr valid",False, test1st=True)
+    # yield from wait_for(dut.adr_rel_o)
+    # yield dut.ad.go.eq(1)
+    # yield
+    # yield dut.ad.go.eq(0)
+
+    if update:
+        yield from wait_for_debug(dut.wr.rel_o[1],"update")
+        yield dut.wr.go.eq(0b10)
+        yield
+        addr = yield dut.addr_o
+        print("addr", addr)
+        yield dut.wr.go.eq(0)
+    else:
+        addr = None
+        print("not update ===============")
+
+    yield from wait_for_debug(dut.sto_rel_o,"sto_rel_o")
+    yield dut.go_st_i.eq(1)
+    yield
+    yield dut.go_st_i.eq(0)
+    yield from wait_for_debug(dut.busy_o,"not_busy" ,False)
+    ###wait_for(dut.stwd_mem_o)
+    yield
+    return addr
+
+# same thing as soc/src/soc/experiment/test/test_dcbz_pi.py
+def ldst_sim(dut):
+    yield dut.mmu.rin.prtbl.eq(0x1000000) # set process table
+    addr = 0x100e0
+    data = 0xFF #just a single byte for this test
+    #data = 0xf553b658ba7e1f51
+
+    yield from store(dut, addr, 0, data, 0)
+    yield
+    ld_data, data_ok, ld_addr = yield from load(dut, addr, 0, 0)
+    print(data,data_ok,ld_addr)
+    assert(ld_data==data)
+    yield
+
+    data = 0
+
+    print("doing dcbz/store with data 0 .....")
+    yield from store_debug(dut, addr, 0, data, 0, dcbz=True) #hangs
+
+    ld_data, data_ok, ld_addr = yield from load(dut, addr, 0, 0)
+    print(data,data_ok,ld_addr)
+    print("ld_data is")
+    print(ld_data)
+    assert(ld_data==data)
+    print("dzbz test passed")
+
+    wbget.stop = True # stop simulation
+
+########################################
+class TestLDSTCompUnitMMU(LDSTCompUnit):
+
+    def __init__(self, rwid, pspec):
+        # use a LoadStore1 here
+        cmpi = ConfigMemoryPortInterface(pspec)
+        self.cmpi = cmpi
+        ldst = cmpi.pi
+        self.l0 = ldst
+
+        self.mmu = MMU()
+        LDSTCompUnit.__init__(self, ldst.pi, rwid, 4)
+
+    def elaborate(self, platform):
+        m = LDSTCompUnit.elaborate(self, platform)
+        m.submodules.l0 = self.l0
+        m.submodules.mmu = self.mmu
+        # link addr-go direct to rel
+        m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
+
+        # link mmu and dcache together
+        dcache = self.l0.dcache
+        mmu = self.mmu
+        m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+        m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+
+        return m
+
+
+def test_scoreboard_mmu():
+
+    m = Module()
+
+    units = {}
+    pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+                         imem_ifacetype='bare_wb',
+                         addr_wid=48,
+                         mask_wid=8,
+                         reg_wid=64,
+                         units=units)
+
+    dut = TestLDSTCompUnitMMU(16,pspec)
+
+    m.submodules.dut = dut
+
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    dut.mem = pagetables.test1
+    wbget.stop = False
+
+    sim.add_sync_process(wrap(ldst_sim(dut)))
+    sim.add_sync_process(wrap(wb_get(dut.cmpi.wb_bus(), dut.mem)))
+    with sim.write_vcd('test_scoreboard_mmu.vcd'):
+        sim.run()
+
+########################################
+class TestLDSTCompUnitRegSpecMMU(LDSTCompUnit):
+
+    def __init__(self, pspec):
+        from soc.fu.ldst.pipe_data import LDSTPipeSpec
+        regspec = LDSTPipeSpec.regspec
+
+        # use a LoadStore1 here
+        cmpi = ConfigMemoryPortInterface(pspec)
+        self.cmpi = cmpi
+        ldst = cmpi.pi
+        self.l0 = ldst
+
+        self.mmu = MMU()
+        LDSTCompUnit.__init__(self, ldst.pi, regspec, 4)
+
+    def elaborate(self, platform):
+        m = LDSTCompUnit.elaborate(self, platform)
+        m.submodules.l0 = self.l0
+        m.submodules.mmu = self.mmu
+        # link addr-go direct to rel
+        m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
+
+        # link mmu and dcache together
+        dcache = self.l0.dcache
+        mmu = self.mmu
+        m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+        m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+
+        return m
+
+def test_scoreboard_regspec_mmu():
+
+    m = Module()
+
+    units = {}
+    pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+                         imem_ifacetype='bare_wb',
+                         addr_wid=48,
+                         mask_wid=8,
+                         reg_wid=64,
+                         units=units)
+
+    dut = TestLDSTCompUnitRegSpecMMU(pspec)
+
+    m.submodules.dut = dut
+
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    dut.mem = pagetables.test1
+    wbget.stop = False
+
+    sim.add_sync_process(wrap(ldst_sim(dut)))
+    sim.add_sync_process(wrap(wb_get(dut.cmpi.wb_bus(), dut.mem)))
+    with sim.write_vcd('test_scoreboard_regspec_mmu.vcd'):
+        sim.run()
+
+if __name__ == '__main__':
+    test_scoreboard_regspec_mmu()
+    test_scoreboard_mmu()
diff --git a/src/soc/experiment/test/test_compldst_multi_mmu_fsm.py b/src/soc/experiment/test/test_compldst_multi_mmu_fsm.py
new file mode 100644 (file)
index 0000000..81d21c1
--- /dev/null
@@ -0,0 +1,196 @@
+# test case for LOAD / STORE Computation Unit using MMU
+
+from nmigen.back.pysim import Simulator, Delay, Settle, Tick
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl
+from nmigen.hdl.rec import Record, Layout
+
+from nmutil.latch import SRLatch, latchregister
+from nmutil.byterev import byte_reverse
+from nmutil.extend import exts
+from nmutil.util import wrap
+from soc.fu.regspec import RegSpecAPI
+
+from openpower.decoder.power_enums import MicrOp, Function, LDSTMode
+from soc.fu.ldst.ldst_input_record import CompLDSTOpSubset
+from openpower.decoder.power_decoder2 import Data
+from openpower.consts import MSR
+
+from soc.experiment.compalu_multi import go_record, CompUnitRecord
+from soc.experiment.l0_cache import PortInterface
+from soc.experiment.pimem import LDSTException
+from soc.experiment.compldst_multi import LDSTCompUnit, load, store
+from soc.config.test.test_loadstore import TestMemPspec
+
+from soc.experiment.mmu import MMU
+from nmutil.util import Display
+
+from soc.config.loadstore import ConfigMemoryPortInterface
+from soc.experiment.test import pagetables
+from soc.experiment.test.test_wishbone import wb_get
+
+# new unit added to this test case
+from soc.fu.mmu.pipe_data import MMUPipeSpec
+from soc.fu.mmu.fsm import FSMMMUStage
+
+# for sending instructions to the FSM
+from openpower.consts import MSR
+from openpower.decoder.power_fields import DecodeFields
+from openpower.decoder.power_fieldsn import SignalBitRange
+from openpower.decoder.power_decoder2 import decode_spr_num
+from openpower.decoder.power_enums import MicrOp
+
+
+def test_TLBIE(dut):
+    yield dut.fsm.p.i_data.ctx.op.eq(MicrOp.OP_TLBIE)
+    yield dut.fsm.p.valid_i.eq(1)
+    yield
+    yield dut.fsm.p.valid_i.eq(0)
+    yield
+    yield
+    yield
+    yield
+    yield Display("OP_TLBIE test done")
+
+
+def ldst_sim(dut):
+    yield dut.mmu.rin.prtbl.eq(0x1000000)  # set process table
+    addr = 0x100e0
+    data = 0xFF  # just a single byte for this test
+    #data = 0xf553b658ba7e1f51
+
+    yield from store(dut, addr, 0, data, 0)
+    yield
+    ld_data, data_ok, ld_addr = yield from load(dut, addr, 0, 0)
+    print(data, data_ok, ld_addr)
+    assert(ld_data == data)
+    yield
+    yield from test_TLBIE(dut)
+
+    """
+    -- not testing dzbz here --
+    data = 0
+
+    print("doing dcbz/store with data 0 .....")
+    yield from store_debug(dut, addr, 0, data, 0, dcbz=True) #hangs
+
+    ld_data, data_ok, ld_addr = yield from load(dut, addr, 0, 0)
+    print(data,data_ok,ld_addr)
+    print("ld_data is")
+    print(ld_data)
+    assert(ld_data==data)
+    print("dzbz test passed")
+    """
+
+    dut.stop = True  # stop simulation
+
+########################################
+
+
+class TestLDSTCompUnitMMUFSM(LDSTCompUnit):
+
+    def __init__(self, rwid, pspec):
+        from soc.experiment.l0_cache import TstL0CacheBuffer
+        self.l0 = l0 = TstL0CacheBuffer(pspec)
+        pi = l0.l0.dports[0]
+        LDSTCompUnit.__init__(self, pi, rwid, 4)
+
+    def elaborate(self, platform):
+        m = LDSTCompUnit.elaborate(self, platform)
+        m.submodules.l0 = self.l0
+        # link addr-go direct to rel
+        m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
+        return m
+
+
+def test_scoreboard_mmu():
+
+    units = {}
+    pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+                         imem_ifacetype='bare_wb',
+                         addr_wid=48,
+                         mask_wid=8,
+                         reg_wid=64,
+                         units=units)
+
+    dut = TestLDSTCompUnit(16, pspec)
+    vl = rtlil.convertMMUFSM(dut, ports=dut.ports())
+    with open("test_ldst_comp_mmu1.il", "w") as f:
+        f.write(vl)
+
+    run_simulation(dut, ldst_sim(dut), vcd_name='test_ldst_comp.vcd')
+
+########################################
+
+
+class TestLDSTCompUnitRegSpecMMUFSM(LDSTCompUnit):
+
+    def __init__(self, pspec):
+        from soc.experiment.l0_cache import TstL0CacheBuffer
+        from soc.fu.ldst.pipe_data import LDSTPipeSpec
+        regspec = LDSTPipeSpec.regspec
+
+        # use a LoadStore1 here
+
+        cmpi = ConfigMemoryPortInterface(pspec)
+        self.cmpi = cmpi
+        ldst = cmpi.pi
+        self.l0 = ldst
+
+        self.mmu = MMU()
+
+        pipe_spec = MMUPipeSpec(id_wid=2, parent_pspec=None)
+        self.fsm = FSMMMUStage(pipe_spec)
+
+        self.fsm.set_ldst_interface(ldst)
+
+        LDSTCompUnit.__init__(self, ldst.pi, regspec, 4)
+
+    def elaborate(self, platform):
+        m = LDSTCompUnit.elaborate(self, platform)
+        m.submodules.l0 = self.l0
+        m.submodules.mmu = self.mmu
+        m.submodules.fsm = self.fsm
+        # link addr-go direct to rel
+        m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
+
+        # link mmu and dcache together
+        dcache = self.l0.dcache
+        mmu = self.mmu
+        m.d.comb += dcache.m_in.eq(mmu.d_out)  # MMUToDCacheType
+        m.d.comb += mmu.d_in.eq(dcache.m_out)  # DCacheToMMUType
+
+        return m
+
+
+def test_scoreboard_regspec_mmufsm():
+
+    m = Module()
+
+    units = {}
+    pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+                         imem_ifacetype='bare_wb',
+                         addr_wid=48,
+                         mask_wid=8,
+                         reg_wid=64,
+                         units=units)
+
+    dut = TestLDSTCompUnitRegSpecMMUFSM(pspec)
+
+    m.submodules.dut = dut
+
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    dut.mem = pagetables.test1
+    dut.stop = False
+
+    sim.add_sync_process(wrap(ldst_sim(dut)))  # rename ?
+    sim.add_sync_process(wrap(wb_get(dut)))
+    with sim.write_vcd('test_scoreboard_regspec_mmufsm.vcd'):
+        sim.run()
+
+
+if __name__ == '__main__':
+    test_scoreboard_regspec_mmufsm()
+    # only one test for now -- test_scoreboard_mmu()
index fa4fa41720f0bae808b9d0230e45ac235e5795bf..3b795ef7c463e96b4f4ad85d51e97ded47fa344c 100644 (file)
@@ -245,7 +245,7 @@ def dcache_sim(dut, mem):
     yield
 
 
-def test_dcache(mem, test_fn, test_name):
+def tst_dcache(mem, test_fn, test_name):
     dut = DCache()
 
     memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
@@ -255,15 +255,15 @@ def test_dcache(mem, test_fn, test_name):
     m.submodules.dcache = dut
     m.submodules.sram = sram
 
-    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
-    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
-    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
-    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
-    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
-    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
+    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+    m.d.comb += sram.bus.we.eq(dut.bus.we)
+    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
 
-    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
-    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
 
     dcache_write_gtkw(test_name)
 
@@ -286,6 +286,7 @@ def dcache_write_gtkw(test_name):
         ('d_out', [
             'd_out_valid', 'd_out_data[63:0]'
         ]),
+        # XXX TODO, update to standard wishbone Signals (single "bus" Interface)
         ('wb_out', [
             'wb_out_cyc', 'wb_out_stb', 'wb_out_we',
             'wb_out_adr[31:0]', 'wb_out_sel[7:0]', 'wb_out_dat[63:0]'
@@ -311,18 +312,18 @@ if __name__ == '__main__':
     for i in range(memsize):
         mem.append(i)
 
-    test_dcache(mem, dcache_regression_sim, "simpleregression")
+    tst_dcache(mem, dcache_regression_sim, "simpleregression")
 
     mem = []
     memsize = 256
     for i in range(memsize):
         mem.append(i)
 
-    test_dcache(mem, dcache_random_sim, "random")
+    tst_dcache(mem, dcache_random_sim, "random")
 
     mem = []
     for i in range(1024):
         mem.append((i*2)| ((i*2+1)<<32))
 
-    test_dcache(mem, dcache_sim, "")
+    tst_dcache(mem, dcache_sim, "")
 
index 3f32a28143aa819e2332d46e7473bb1e0fd87c40..5fa10c0ffc4c56783c9ef996ef830274e3105311 100644 (file)
@@ -276,7 +276,7 @@ def dcache_sim(dut, mem):
     yield
 
 
-def test_dcache(mem, test_fn, test_name):
+def tst_dcache(mem, test_fn, test_name):
     dut = DCache()
 
     memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
@@ -286,15 +286,15 @@ def test_dcache(mem, test_fn, test_name):
     m.submodules.dcache = dut
     m.submodules.sram = sram
 
-    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
-    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
-    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
-    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
-    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
-    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
+    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+    m.d.comb += sram.bus.we.eq(dut.bus.we)
+    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
 
-    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
-    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
 
     dcache_write_gtkw(test_name)
 
@@ -350,18 +350,18 @@ if __name__ == '__main__':
         for i in range(memsize):
             mem.append(i)
 
-        test_dcache(mem, dcache_regression_sim, "simpleregression")
+        tst_dcache(mem, dcache_regression_sim, "simpleregression")
 
         mem = []
         memsize = 256
         for i in range(memsize):
             mem.append(i)
 
-        test_dcache(mem, dcache_random_sim, "random")
+        tst_dcache(mem, dcache_random_sim, "random")
 
     mem = []
     for i in range(1024):
         mem.append((i*2)| ((i*2+1)<<32))
 
-    test_dcache(mem, dcache_sim, "")
+    tst_dcache(mem, dcache_sim, "")
 
diff --git a/src/soc/experiment/test/test_dcbz_pi.py b/src/soc/experiment/test/test_dcbz_pi.py
new file mode 100644 (file)
index 0000000..f4717fd
--- /dev/null
@@ -0,0 +1,113 @@
+"""DCache PortInterface Test
+    starting as a copy to test_ldst_pi.py
+"""
+
+from nmigen import (C, Module, Signal, Elaboratable, Mux, Cat, Repl, Signal)
+from nmigen.cli import main
+from nmigen.cli import rtlil
+from nmutil.mask import Mask, masked
+from nmutil.util import Display
+from random import randint, seed
+from nmigen.sim import Simulator, Delay, Settle
+from nmutil.util import wrap
+
+from soc.config.test.test_pi2ls import pi_ld, pi_st, pi_ldst
+from soc.config.test.test_loadstore import TestMemPspec
+from soc.config.loadstore import ConfigMemoryPortInterface
+
+from soc.fu.ldst.loadstore import LoadStore1
+from soc.experiment.mmu import MMU
+from soc.experiment.test import pagetables
+
+from nmigen.compat.sim import run_simulation
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+from openpower.decoder.power_enums import MSRSpec
+
+wbget.stop = False
+
+
+def setup_mmu():
+
+    wbget.stop = False
+
+    pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+                         imem_ifacetype='',
+                         addr_wid=48,
+                         #disable_cache=True, # hmmm...
+                         mask_wid=8,
+                         reg_wid=64)
+
+    m = Module()
+    comb = m.d.comb
+    cmpi = ConfigMemoryPortInterface(pspec)
+    m.submodules.ldst = ldst = cmpi.pi
+    m.submodules.mmu = mmu = MMU()
+    dcache = ldst.dcache
+
+    l_in, l_out = mmu.l_in, mmu.l_out
+    d_in, d_out = dcache.d_in, dcache.d_out
+
+    # link mmu and dcache together
+    m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+    m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+
+    # link ldst and MMU together
+    comb += l_in.eq(ldst.m_out)
+    comb += ldst.m_in.eq(l_out)
+
+    return m, cmpi
+
+### test case for dcbz
+
+def _test_dcbz_addr_100e0(dut, mem):
+    mmu = dut.submodules.mmu
+    pi = dut.submodules.ldst.pi
+    wbget.stop = False
+
+    yield mmu.rin.prtbl.eq(0x1000000) # set process table
+    yield
+
+    addr = 0x100e0
+    data = 0xf553b658ba7e1f51
+
+    msr = MSRSpec(pr=1, dr=0, sf=1) # 64 bit by default
+
+    yield from pi_st(pi, addr, data, 8, msr)
+    yield
+
+    ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr)
+    assert ld_data == 0xf553b658ba7e1f51
+    ld_data, _, _  = yield from pi_ld(pi, addr, 8, msr)
+    assert ld_data == 0xf553b658ba7e1f51
+
+    print("do_dcbz ===============")
+    yield from pi_st(pi, addr, data, 8, msr, is_dcbz=1)
+    print("done_dcbz ===============")
+    yield
+
+    ld_data, _, _  = yield from pi_ld(pi, addr, 8, msr)
+    print("ld_data after dcbz")
+    print(ld_data)
+    assert ld_data == 0
+
+    yield
+    wbget.stop = True
+
+def test_dcbz_addr_100e0():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.test1
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(_test_dcbz_addr_100e0(m, mem)))
+    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
+    with sim.write_vcd('test_dcbz_addr_zero.vcd'):
+        sim.run()
+
+if __name__ == '__main__':
+    test_dcbz_addr_100e0()
index ec435e65296dc6ddfa32a6b10706ebb3d7c4acb2..c331a7b5e5958238a78d5c28f5fdcab873aa351d 100644 (file)
@@ -25,10 +25,10 @@ class TestCachedMemoryPortInterface(PortInterfaceBase):
         super().__init__(regwid, addrwid)
         self.ldst = LDSTSplitter(32, 48, 4)
 
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
         m.d.comb += self.ldst.addr_i.eq(addr)
 
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
         m.d.comb += self.ldst.addr_i.eq(addr)
 
     def set_wr_data(self, m, data, wen):
@@ -55,7 +55,7 @@ class TestCachedMemoryPortInterface(PortInterfaceBase):
         # TODO: memory ports
 
 
-def test_cache_single_run(dut):
+def tst_cache_single_run(dut):
     #test single byte
     addr = 0
     data = 0xfeedface
@@ -65,7 +65,7 @@ def test_cache_single():
     dut = TestCachedMemoryPortInterface()
     #LDSTSplitter(8, 48, 4) #data leng in bytes, address bits, select bits
 
-    run_simulation(dut, test_cache_single_run(dut),
+    run_simulation(dut, tst_cache_single_run(dut),
                    vcd_name='test_cache_single.vcd')
 
 
index b84481682afc42b5a160a4131c7a10b2c20bd603..003edf1264566ac27528a55df97b88030f976d38 100644 (file)
@@ -10,6 +10,8 @@ from nmigen.cli import rtlil
 from nmutil.mask import Mask, masked
 from nmutil.util import Display
 from random import randint, seed
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
 
 if True:
     from nmigen.back.pysim import Simulator, Delay, Settle
@@ -25,71 +27,33 @@ from soc.fu.ldst.loadstore import LoadStore1
 from soc.experiment.mmu import MMU
 
 from nmigen.compat.sim import run_simulation
+from openpower.decoder.power_enums import MSRSpec
 
 
-stop = False
+msr_default = MSRSpec(pr=1, dr=0, sf=1) # 64 bit by default
+
+
+wbget.stop = False
 
 def b(x): # byte-reverse function
     return int.from_bytes(x.to_bytes(8, byteorder='little'),
                           byteorder='big', signed=False)
 
-def wb_get(wb, mem):
-    """simulator process for getting memory load requests
-    """
-
-    global stop
-    assert(stop==False)
-
-    while not stop:
-        while True: # wait for dc_valid
-            if stop:
-                return
-            cyc = yield (wb.cyc)
-            stb = yield (wb.stb)
-            if cyc and stb:
-                break
-            yield
-        addr = (yield wb.adr) << 3
-        if addr not in mem:
-            print ("    WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
-        # read or write?
-        we = (yield wb.we)
-        if we:
-            store = (yield wb.dat_w)
-            sel = (yield wb.sel)
-            data = mem.get(addr, 0)
-            # note we assume 8-bit sel, here
-            res = 0
-            for i in range(8):
-                mask = 0xff << (i*8)
-                if sel & (1<<i):
-                    res |= store & mask
-                else:
-                    res |= data & mask
-            mem[addr] = res
-            print ("    DCACHE set %x mask %x data %x" % (addr, sel, res))
-        else:
-            data = mem.get(addr, 0)
-            yield wb.dat_r.eq(data)
-            print ("    DCACHE get %x data %x" % (addr, data))
-
-        yield wb.ack.eq(1)
-        yield
-        yield wb.ack.eq(0)
-        yield
+#def dumpmem(mem,fn):
+#    f = open(fn,"w")
+#    for cell in mem:
+#        f.write(str(hex(cell))+"="+str(hex(mem[cell]))+"\n")
 
 
 def mmu_lookup(dut, addr):
     mmu = dut.submodules.mmu
-    global stop
 
     print("pi_ld", hex(addr))
-    data = yield from pi_ld(dut.submodules.ldst.pi, addr, 4, msr_pr=1)
+    data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, addr, 4, msr=msr_default)
     print("pi_ld done, data", hex(data))
     """
     # original test code kept for reference
-    while not stop: # wait for dc_valid / err
+    while not wbget.stop: # wait for dc_valid / err
         print("waiting for mmu")
         l_done = yield (mmu.l_out.done)
         l_err = yield (mmu.l_out.err)
@@ -118,7 +82,6 @@ def mmu_lookup(dut, addr):
 
 def ldst_sim(dut):
     mmu = dut.submodules.mmu
-    global stop
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
@@ -144,7 +107,7 @@ def ldst_sim(dut):
     data = yield from mmu_lookup(dut, addr+8)
     assert data == 0xf001a5a5
 
-    yield from pi_st(dut.submodules.ldst.pi, addr+4, 0x10015a5a, 4, msr_pr=1)
+    yield from pi_st(dut.submodules.ldst.pi, addr+4, 0x10015a5a, 4, msr=msr_default)
 
     data = yield from mmu_lookup(dut, addr+4)
     assert data == 0x10015a5a
@@ -152,12 +115,11 @@ def ldst_sim(dut):
     yield
     yield
 
-    stop = True
+    wbget.stop = True
 
 def setup_mmu():
 
-    global stop
-    stop = False
+    wbget.stop = False
 
     pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
                          imem_ifacetype='',
@@ -175,7 +137,6 @@ def setup_mmu():
 
     l_in, l_out = mmu.l_in, mmu.l_out
     d_in, d_out = dcache.d_in, dcache.d_out
-    wb_out, wb_in = dcache.wb_out, dcache.wb_in
 
     # link mmu and dcache together
     m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
@@ -229,17 +190,16 @@ def test_mmu():
 
 def ldst_sim_misalign(dut):
     mmu = dut.submodules.mmu
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
-    data = yield from pi_ld(dut.submodules.ldst.pi, 0x1007, 8, msr_pr=1)
-    print ("misalign ld data", hex(data))
+    data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, 0x1007, 8, msr_default)
+    print ("misalign ld data", data)
 
     yield
-    stop = True
+    wbget.stop = True
 
 
 def test_misalign_mmu():
@@ -283,77 +243,37 @@ def test_misalign_mmu():
 
 def ldst_sim_radixmiss(dut):
     mmu = dut.submodules.mmu
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(1<<40) # set process table
     yield
 
-    data = yield from pi_ld(dut.submodules.ldst.pi, 0x10000000, 8, msr_pr=1)
-    print ("radixmiss ld data", hex(data))
+    data, _, _ = yield from pi_ld(dut.submodules.ldst.pi,
+                                  0x10000000, 8, msr=msr_default)
+    print ("radixmiss ld data", data)
 
     yield
-    stop = True
+    wbget.stop = True
 
 def ldst_sim_dcache_regression(dut):
     mmu = dut.submodules.mmu
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
     addr = 0x10000
-    data = yield from pi_ld(dut.submodules.ldst.pi, addr, 8, msr_pr=1)
-    print ("=== dcache_regression ld data", hex(data))
+    data, _, _ = yield from pi_ld(dut.submodules.ldst.pi, addr, 8, msr=msr_default)
+    print ("=== dcache_regression ld data", data)
     assert(data == 0xdeadbeef01234567)
 
     yield
-    stop = True
-
-def ldst_sim_dcache_random2(dut):
-    mmu = dut.submodules.mmu
-    pi = dut.submodules.ldst.pi
-    global stop
-    stop = False
-
-    yield mmu.rin.prtbl.eq(0x1000000) # set process table
-    yield
-
-    memsize = 64
-
-    refs = [
-         ## random values from a failed test
-         [0x100e0,0xf553b658ba7e1f51],
-         [0x10150,0x12c95a730df1cee7],
-         [0x10080,0x5a921ae06674cd81],
-         [0x100f8,0x4fea5eab80090fa5],
-         [0x10080,0xd481432d17a340be],
-         [0x10060,0x8553fcf29526fb32],
-        # [0x101d0,0x327c967c8be30ded],
-         [0x101e0,0x8f15d8d05d25b151]
-    ]
-
-    for i in refs:
-        addr = i[0]
-        data = i[1]
-
-        yield from pi_st(pi, addr, data, 8, msr_pr=1)
-        yield
-
-        ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
-
-        print ("dcache_random values", hex(addr), hex(data), hex(ld_data), data==ld_data)
-        assert(data==ld_data)   ## investigate why this fails -- really seldom
-
-    yield
-    stop = True
+    wbget.stop = True
 
 def ldst_sim_dcache_random(dut):
     mmu = dut.submodules.mmu
     pi = dut.submodules.ldst.pi
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
@@ -366,22 +286,22 @@ def ldst_sim_dcache_random(dut):
         addr *= 8
         addr += 0x10000
 
-        yield from pi_st(pi, addr, data, 8, msr_pr=1)
+        yield from pi_st(pi, addr, data, 8, msr=msr_default)
         yield
 
-        ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
+        ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
 
-        print ("dcache_random values", hex(addr), hex(data), hex(ld_data), data==ld_data)
+        eq = (data==ld_data)
+        print ("dcache_random values", hex(addr), hex(data), hex(ld_data), eq)
         assert(data==ld_data)   ## investigate why this fails -- really seldom
 
     yield
-    stop = True
+    wbget.stop = True
 
 def ldst_sim_dcache_first(dut): # this test is likely to fail
     mmu = dut.submodules.mmu
     pi = dut.submodules.ldst.pi
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
@@ -391,10 +311,10 @@ def ldst_sim_dcache_first(dut): # this test is likely to fail
     data = 0x8c5a3e460d71f0b4
 
     # known to fail without bugfix in src/soc/fu/ldst/loadstore.py
-    yield from pi_st(pi, addr, data, 8, msr_pr=1)
+    yield from pi_st(pi, addr, data, 8, msr=msr_default)
     yield
 
-    ld_data = yield from pi_ld(pi, addr, 8, msr_pr=1)
+    ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
 
     print ("addr",addr)
     print ("dcache_first ld data", hex(data), hex(ld_data))
@@ -402,7 +322,7 @@ def ldst_sim_dcache_first(dut): # this test is likely to fail
     assert(data==ld_data)
 
     yield
-    stop = True
+    wbget.stop = True
 
 def test_radixmiss_mmu():
 
@@ -512,6 +432,63 @@ def test_dcache_random():
     with sim.write_vcd('test_ldst_pi_random.vcd'):
         sim.run()
 
+def ldst_sim_dcache_random2(dut, mem):
+    mmu = dut.submodules.mmu
+    pi = dut.submodules.ldst.pi
+    wbget.stop = False
+
+    yield mmu.rin.prtbl.eq(0x1000000) # set process table
+    yield
+
+    memsize = 64
+
+    refs = [
+         ## random values from a failed test
+         #[0x100e0,0xf553b658ba7e1f51,0,0], ## 1
+         #[0x10150,0x12c95a730df1cee7,0,0], ## 2
+         #[0x10080,0x5a921ae06674cd81,0,0], ## 3
+         #[0x100f8,0x4fea5eab80090fa5,0,0], ## 4
+         #[0x10080,0xd481432d17a340be,0,0], ## 5
+         #[0x10060,0x8553fcf29526fb32,0,0], ## 6
+         [0x101d0,0x327c967c8be30ded,0,0], ## 7
+         [0x101e0,0x8f15d8d05d25b151,1,0]  ## 8
+         #uncommenting line 7 will cause the original test not to fail
+
+    ]
+
+    c = 0
+    for i in refs:
+        addr = i[0]
+        data = i[1]
+        c1 = i[2]
+        c2 = i[3]
+
+        print("== write: wb_get")
+
+        for i in range(0,c1):
+            print("before_pi_st")
+            yield
+
+        yield from pi_st(pi, addr, data, 8, msr=msr_default)
+        yield
+
+        for i in range(0,c2):
+            print("before_pi_ld")
+            yield
+
+        print("== read: wb_get")
+        ld_data, _, _ = yield from pi_ld(pi, addr, 8, msr=msr_default)
+
+        #dumpmem(mem,"/tmp/dumpmem"+str(c)+".txt")
+        #c += 1
+
+        eq = (data==ld_data)
+        print ("dcache_random values", hex(addr), hex(data), hex(ld_data), eq)
+        assert(data==ld_data)   ## investigate why this fails -- really seldom
+
+    yield
+    wbget.stop = True
+
 def test_dcache_random2():
 
     m, cmpi = setup_mmu()
@@ -534,13 +511,15 @@ def test_dcache_random2():
            0x1000000:   # PROCESS_TABLE_3
                         # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
            b(0x40000000000300ad),
+
+           ###0x101e0:0x8f15d8d05d25b152      ## flush cache -- then check again
     }
 
     # nmigen Simulation
     sim = Simulator(m)
     sim.add_clock(1e-6)
 
-    sim.add_sync_process(wrap(ldst_sim_dcache_random2(m)))
+    sim.add_sync_process(wrap(ldst_sim_dcache_random2(m, mem)))
     sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
     with sim.write_vcd('test_ldst_pi_random2.vcd'):
         sim.run()
index df679977dd54728fbc68e7e26840e707535b4318..6090710da470a60893f2075d37540f56fc18cc86 100644 (file)
@@ -24,59 +24,19 @@ from soc.fu.ldst.loadstore import LoadStore1
 from soc.experiment.mmu import MMU
 
 from nmigen.compat.sim import run_simulation
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
+from openpower.decoder.power_enums import MSRSpec
 
+msr_default = MSRSpec(pr=0, dr=0, sf=1) # 64 bit by default
 
-stop = False
+
+wbget.stop = False
 
 def b(x): # byte-reverse function
     return int.from_bytes(x.to_bytes(8, byteorder='little'),
                           byteorder='big', signed=False)
 
-def wb_get(wb, mem):
-    """simulator process for getting memory load requests
-    """
-
-    global stop
-
-    while not stop:
-        while True: # wait for dc_valid
-            if stop:
-                return
-            cyc = yield (wb.cyc)
-            stb = yield (wb.stb)
-            if cyc and stb:
-                break
-            yield
-        addr = (yield wb.adr) << 3
-        if addr not in mem:
-            print ("    WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
-        # read or write?
-        we = (yield wb.we)
-        if we:
-            store = (yield wb.dat_w)
-            sel = (yield wb.sel)
-            data = mem.get(addr, 0)
-            # note we assume 8-bit sel, here
-            res = 0
-            for i in range(8):
-                mask = 0xff << (i*8)
-                if sel & (1<<i):
-                    res |= store & mask
-                else:
-                    res |= data & mask
-            mem[addr] = res
-            print ("    DCACHE set %x mask %x data %x" % (addr, sel, res))
-        else:
-            data = mem.get(addr, 0)
-            yield wb.dat_r.eq(data)
-            print ("    DCACHE get %x data %x" % (addr, data))
-
-        yield wb.ack.eq(1)
-        yield
-        yield wb.ack.eq(0)
-        yield
-
 
 def setup_mmu():
 
@@ -96,7 +56,6 @@ def setup_mmu():
 
     l_in, l_out = mmu.l_in, mmu.l_out
     d_in, d_out = dcache.d_in, dcache.d_out
-    wb_out, wb_in = dcache.wb_out, dcache.wb_in
 
     # link mmu and dcache together
     m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
@@ -112,13 +71,66 @@ def setup_mmu():
 
 def ldst_sim_misalign(dut):
     mmu = dut.submodules.mmu
-    global stop
-    stop = False
+    wbget.stop = False
 
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
-    data = yield from pi_ld(dut.submodules.ldst.pi, 0x1000, 4, msr_pr=1)
+    # load 8 bytes at aligned address
+    align_addr = 0x1000
+    data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+                                          align_addr, 8, msr=msr_default)
+    print ("ldst_sim_misalign (aligned)", hex(data), exctype, exc)
+    assert data == 0xdeadbeef01234567
+
+    # load 4 bytes at aligned address
+    align_addr = 0x1004
+    data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+                                          align_addr, 4, msr=msr_default)
+    print ("ldst_sim_misalign (aligned)", hex(data), exctype, exc)
+    assert data == 0xdeadbeef
+
+    # load 8 bytes at *mis*-aligned address which is still within
+    # the page
+    misalign_addr = 0x1004
+    data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+                                          misalign_addr, 8, msr=msr_default)
+
+    print ("ldst_sim_misalign", hex(data), exctype, exc)
+    assert data == 0xf001a5a5deadbeef
+
+    # load 8 bytes at *mis*-aligned address which is still within
+    # the page
+    misalign_addr = 0x1006
+    data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+                                          misalign_addr, 8, msr=msr_default)
+
+    print ("ldst_sim_misalign", hex(data), exctype, exc)
+    assert data == 0xf00ff001a5a5dead
+    wbget.stop = True
+    return
+
+    # load 8 bytes at *mis*-aligned address which is NOT within
+    # the page - TODO - work this out
+    misalign_addr = 0x10000004
+    data, exctype, exc = yield from pi_ld(dut.submodules.ldst.pi,
+                                          misalign_addr, 8, msr=msr_default)
+    print ("ldst_sim_misalign", data, exctype, exc)
+    yield
+    dar = yield dut.submodules.ldst.dar
+    print ("DAR", hex(dar))
+    assert dar == misalign_addr
+    # check exception bits
+    assert exc.happened
+    assert exc.alignment
+    assert not exc.segment_fault
+    assert not exc.instr_fault
+    assert not exc.invalid
+    assert not exc.perm_error
+    assert not exc.rc_error
+    assert not exc.badtree
+
+    wbget.stop = True
 
 
 def test_misalign_mmu():
diff --git a/src/soc/experiment/test/test_loadstore1.py b/src/soc/experiment/test/test_loadstore1.py
new file mode 100644 (file)
index 0000000..e79e0c1
--- /dev/null
@@ -0,0 +1,1043 @@
+from nmigen import (C, Module, Signal, Elaboratable, Mux, Cat, Repl, Signal,
+                    Const)
+from nmigen.cli import main
+from nmigen.cli import rtlil
+from nmutil.mask import Mask, masked
+from nmutil.util import Display
+from random import randint, seed
+from nmigen.sim import Simulator, Delay, Settle
+from nmutil.util import wrap
+
+from soc.config.test.test_pi2ls import (pi_ld, pi_st, pi_ldst, wait_busy,
+                                        get_exception_info)
+#from soc.config.test.test_pi2ls import pi_st_debug
+from soc.config.test.test_loadstore import TestMemPspec
+from soc.config.loadstore import ConfigMemoryPortInterface
+
+from soc.fu.ldst.loadstore import LoadStore1
+from soc.experiment.mmu import MMU
+from soc.experiment.test import pagetables
+
+from nmigen.compat.sim import run_simulation
+from random import random
+from openpower.test.wb_get import wb_get_classic
+from openpower.test import wb_get as wbget
+from openpower.exceptions import LDSTExceptionTuple
+
+from soc.config.test.test_fetch import read_from_addr
+from openpower.decoder.power_enums import MSRSpec
+
+
+def setup_mmu():
+
+    wbget.stop = False
+
+    pspec = TestMemPspec(ldst_ifacetype='mmu_cache_wb',
+                         imem_ifacetype='',
+                         addr_wid=48,
+                         #disable_cache=True, # hmmm...
+                         mask_wid=8,
+                         reg_wid=64)
+
+    m = Module()
+    comb = m.d.comb
+    cmpi = ConfigMemoryPortInterface(pspec)
+    m.submodules.ldst = ldst = cmpi.pi
+    m.submodules.mmu = mmu = MMU()
+    dcache = ldst.dcache
+    icache = ldst.icache
+
+    l_in, l_out = mmu.l_in, mmu.l_out
+    d_in, d_out = dcache.d_in, dcache.d_out
+    i_in, i_out = icache.i_in, icache.i_out # FetchToICache, ICacheToDecode
+
+    # link mmu, dcache and icache together
+    m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
+    m.d.comb += icache.m_in.eq(mmu.i_out) # MMUToICacheType
+    m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+
+    # link ldst and MMU together
+    comb += l_in.eq(ldst.m_out)
+    comb += ldst.m_in.eq(l_out)
+
+    # add a debug status Signal: use "msg.str = "blah"
+    # then toggle with yield msg.eq(0); yield msg.eq(1)
+    debug_status = Signal(8, decoder=lambda _ : debug_status.str)
+    m.debug_status = debug_status
+    debug_status.str = ''
+
+    return m, cmpi
+
+
+def icache_read(dut,addr,priv,virt):
+
+    icache = dut.submodules.ldst.icache
+    i_in = icache.i_in
+    i_out  = icache.i_out
+
+    yield i_in.priv_mode.eq(priv)
+    yield i_in.virt_mode.eq(virt)
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(addr)
+    yield i_in.stop_mark.eq(0)
+
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(addr)
+    yield
+    valid = yield i_out.valid
+    failed = yield i_out.fetch_failed
+    while not valid and not failed:
+        yield
+        valid = yield i_out.valid
+        failed = yield i_out.fetch_failed
+    yield i_in.req.eq(0)
+
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
+    yield
+    yield
+
+    return nia, insn, valid, failed
+
+
+test_exceptions = True
+test_dcbz = True
+test_random = True
+
+
+def debug(dut, msg):
+    print ("set debug message", msg)
+    dut.debug_status.str = msg # set the message
+    yield dut.debug_status.eq(0) # trigger an update
+    yield dut.debug_status.eq(1)
+
+
+def _test_loadstore1_ifetch_iface(dut, mem):
+    """test_loadstore1_ifetch_iface
+
+    read in priv mode, non-virtual.  tests the FetchUnitInterface
+
+    """
+
+    mmu = dut.submodules.mmu
+    ldst = dut.submodules.ldst
+    pi = ldst.pi
+    icache = dut.submodules.ldst.icache
+    wbget.stop = False
+
+    print("=== test loadstore instruction (real) ===")
+
+    i_in = icache.i_in
+    i_out  = icache.i_out
+    i_m_in = icache.m_in
+
+    yield from debug(dut, "real mem instruction")
+    # set address to 0x8, update mem[0x8] to 01234 | 0x5678<<32
+    # (have to do 64-bit writes into the dictionary-memory-emulated-thing)
+    addr = 8
+    addr2 = 12
+    expected_insn2 = 0x5678
+    expected_insn = 0x1234
+    mem[addr] = expected_insn | expected_insn2<<32
+
+    yield i_in.priv_mode.eq(1)
+    insn = yield from read_from_addr(icache, addr, stall=False)
+
+    nia   = yield i_out.nia  # NO, must use FetchUnitInterface
+    print ("fetched %x from addr %x" % (insn, nia))
+    assert insn == expected_insn
+
+    print("=== test loadstore instruction (2nd, real) ===")
+    yield from debug(dut, "real mem 2nd (addr 0xc)")
+
+    insn2 = yield from read_from_addr(icache, addr2, stall=False)
+
+    nia   = yield i_out.nia  # NO, must use FetchUnitInterface
+    print ("fetched %x from addr2 %x" % (insn2, nia))
+    assert insn2 == expected_insn2
+
+    print("=== test loadstore instruction (done) ===")
+
+    yield from debug(dut, "test done")
+    yield
+    yield
+
+    print ("fetched %x from addr %x" % (insn, nia))
+    assert insn == expected_insn
+
+    wbget.stop = True
+
+
+def write_mem2(mem, addr, i1, i2):
+    mem[addr] = i1 | i2<<32
+
+
+#TODO: use fetch interface here
+def lookup_virt(dut,addr):
+    icache = dut.submodules.ldst.icache
+    i_in = icache.i_in
+    i_out  = icache.i_out
+    yield i_in.priv_mode.eq(0)
+    yield i_in.virt_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.stop_mark.eq(0)
+
+    yield icache.a_i_valid.eq(1)
+    yield icache.a_pc_i.eq(addr)
+    yield
+    valid = yield i_out.valid
+    failed = yield i_out.fetch_failed
+    while not valid and not failed:
+        yield
+        valid = yield i_out.valid
+        failed = yield i_out.fetch_failed
+    yield icache.a_i_valid.eq(0)
+
+    return valid,failed
+
+
+def mmu_lookup(dut,addr):
+    ldst = dut.submodules.ldst
+    pi = ldst.pi
+    yield from debug(dut, "instr fault "+hex(addr))
+    yield ldst.priv_mode.eq(0)
+    yield ldst.instr_fault.eq(1)
+    yield ldst.maddr.eq(addr)
+    yield
+    yield ldst.instr_fault.eq(0)
+    while True:
+        done = yield (ldst.done)
+        exc_info = yield from get_exception_info(pi.exc_o)
+        if done or exc_info.happened:
+            break
+        yield
+    yield
+    assert exc_info.happened == 0 # assert just before doing the fault set zero
+    yield ldst.instr_fault.eq(0)
+    yield from debug(dut, "instr fault done "+hex(addr))
+    yield
+    yield
+    yield
+
+
+def _test_loadstore1_ifetch_multi(dut, mem):
+    mmu = dut.submodules.mmu
+    ldst = dut.submodules.ldst
+    pi = ldst.pi
+    icache = dut.submodules.ldst.icache
+    assert wbget.stop == False
+
+    print ("set process table")
+    yield from debug(dut, "set prtble")
+    yield mmu.rin.prtbl.eq(0x1000000) # set process table
+    yield
+
+    i_in = icache.i_in
+    i_out  = icache.i_out
+    i_m_in = icache.m_in
+
+    # fetch instructions from multiple addresses
+    # should cope with some addresses being invalid
+    real_addrs = [0,4,8,0,8,4,0,0,12]
+    write_mem2(mem,0,0xF0,0xF4)
+    write_mem2(mem,8,0xF8,0xFC)
+
+    yield i_in.priv_mode.eq(1)
+    for addr in real_addrs:
+        yield from debug(dut, "real_addr "+hex(addr))
+        insn = yield from read_from_addr(icache, addr, stall=False)
+        nia   = yield i_out.nia  # NO, must use FetchUnitInterface
+        print ("TEST_MULTI: fetched %x from addr %x == %x" % (insn, nia,addr))
+        assert insn==0xF0+addr
+
+    # now with virtual memory enabled
+    yield i_in.virt_mode.eq(1)
+
+    virt_addrs = [0x10200,0x10204,0x10208,0x10200,
+                  0x102008,0x10204,0x10200,0x10200,0x10200C]
+
+    write_mem2(mem,0x10200,0xF8,0xFC)
+
+    for addr in virt_addrs:
+        yield from debug(dut, "virt_addr "+hex(addr))
+
+        valid, failed = yield from lookup_virt(dut,addr)
+        yield
+        print("TEST_MULTI: failed=",failed) # this is reported wrong
+        if failed==1: # test one first
+            yield from mmu_lookup(dut,addr)
+            valid, failed = yield from lookup_virt(dut,addr)
+            assert(valid==1)
+
+    wbget.stop = True
+
+
+def _test_loadstore1_ifetch(dut, mem):
+    """test_loadstore1_ifetch
+
+    this is quite a complex multi-step test.
+
+    * first (just because, as a demo) read in priv mode, non-virtual.
+      just like in experiment/icache.py itself.
+
+    * second, using the (usual) PTE for these things (which came originally
+      from gem5-experimental experiment/radix_walk_example.txt) do a
+      virtual-memory read through the *instruction* cache.
+      this is expected to FAIL
+
+    * third: mess about with the MMU, setting "iside" (instruction-side),
+      requesting an MMU RADIX LOOKUP.  this triggers an itlb_load
+      (instruction-cache TLB entry-insertion)
+
+    * fourth and finally: retry the read of the instruction through i-cache.
+      this is now expected to SUCCEED
+
+    a lot going on.
+    """
+
+    mmu = dut.submodules.mmu
+    ldst = dut.submodules.ldst
+    pi = ldst.pi
+    icache = dut.submodules.ldst.icache
+    wbget.stop = False
+
+    print("=== test loadstore instruction (real) ===")
+
+    i_in = icache.i_in
+    i_out  = icache.i_out
+    i_m_in = icache.m_in
+
+    # first virtual memory test
+
+    print ("set process table")
+    yield from debug(dut, "set prtble")
+    yield mmu.rin.prtbl.eq(0x1000000) # set process table
+    yield
+
+    yield from debug(dut, "real mem instruction")
+    # set address to zero, update mem[0] to 01234
+    addr = 8
+    expected_insn = 0x1234
+    mem[addr] = expected_insn
+
+    yield i_in.priv_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(addr)
+    yield i_in.stop_mark.eq(0)
+    yield i_m_in.tlbld.eq(0)
+    yield i_m_in.tlbie.eq(0)
+    yield i_m_in.addr.eq(0)
+    yield i_m_in.pte.eq(0)
+    yield
+    yield
+    yield
+
+    # miss, stalls for a bit -- this one is different here
+    ##nia, insn, valid, failed = yield from icache_read(dut,addr,0,0)
+    ##assert(valid==0)
+    ##assert(failed==1)
+
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(addr)
+    yield
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
+    yield
+    yield
+
+    print ("fetched %x from addr %x" % (insn, nia))
+    assert insn == expected_insn
+
+    print("=== test loadstore instruction (virtual) ===")
+
+    # look up i-cache expecting it to fail
+
+    yield from debug(dut, "virtual instr req")
+    # set address to 0x10200, update mem[] to 5678
+    virt_addr = 0x10200
+    real_addr = virt_addr
+    expected_insn = 0x5678
+    mem[real_addr] = expected_insn
+
+    yield i_in.priv_mode.eq(0)
+    yield i_in.virt_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(virt_addr)
+    yield i_in.stop_mark.eq(0)
+    yield i_m_in.tlbld.eq(0)
+    yield i_m_in.tlbie.eq(0)
+    yield i_m_in.addr.eq(0)
+    yield i_m_in.pte.eq(0)
+    yield
+    yield
+    yield
+
+    # miss, stalls for a bit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(virt_addr)
+    yield
+    valid = yield i_out.valid
+    failed = yield i_out.fetch_failed
+    while not valid and not failed:
+        yield
+        valid = yield i_out.valid
+        failed = yield i_out.fetch_failed
+    yield i_in.req.eq(0)
+
+    print ("failed?", "yes" if failed else "no")
+    assert failed == 1
+    yield
+    yield
+
+    print("=== test loadstore instruction (instruction fault) ===")
+
+    yield from debug(dut, "instr fault")
+
+    virt_addr = 0x10200
+
+    yield ldst.priv_mode.eq(0)
+    yield ldst.instr_fault.eq(1)
+    yield ldst.maddr.eq(virt_addr)
+    # still broken -- investigate
+    # msr = MSRSpec(pr=?, dr=?, sf=0)
+    # ld_data, exctype, exc = yield from pi_ld(pi, virt_addr, 8, msr=msr)
+    yield
+    yield ldst.instr_fault.eq(0)
+    while True:
+        done = yield (ldst.done)
+        exc_info = yield from get_exception_info(pi.exc_o)
+        if done or exc_info.happened:
+            break
+        yield
+    assert exc_info.happened == 0 # assert just before doing the fault set zero
+    yield ldst.instr_fault.eq(0)
+    yield
+    yield
+    yield
+
+    print("=== test loadstore instruction (try instruction again) ===")
+    yield from debug(dut, "instr virt retry")
+    # set address to 0x10200, update mem[] to 5678
+    virt_addr = 0x10200
+    real_addr = virt_addr
+    expected_insn = 0x5678
+
+    yield i_in.priv_mode.eq(0)
+    yield i_in.virt_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(virt_addr)
+    yield i_in.stop_mark.eq(0)
+    yield i_m_in.tlbld.eq(0)
+    yield i_m_in.tlbie.eq(0)
+    yield i_m_in.addr.eq(0)
+    yield i_m_in.pte.eq(0)
+    yield
+    yield
+    yield
+
+    # miss, stalls for a bit
+    """
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(virt_addr)
+    yield
+    valid = yield i_out.valid
+    failed = yield i_out.fetch_failed
+    while not valid and not failed:
+        yield
+        valid = yield i_out.valid
+        failed = yield i_out.fetch_failed
+    yield i_in.req.eq(0)
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
+    """
+
+    ## part 4
+    nia, insn, valid, failed = yield from icache_read(dut,virt_addr,0,1)
+
+    yield from debug(dut, "test done")
+    yield
+    yield
+
+    print ("failed?", "yes" if failed else "no")
+    assert failed == 0
+
+    print ("fetched %x from addr %x" % (insn, nia))
+    assert insn == expected_insn
+
+    wbget.stop = True
+
+
+def _test_loadstore1_invalid(dut, mem):
+    mmu = dut.submodules.mmu
+    pi = dut.submodules.ldst.pi
+    wbget.stop = False
+
+    print("=== test invalid ===")
+
+    addr = 0
+    msr = MSRSpec(pr=1, dr=0, sf=0) # set problem-state
+    ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+    print("ld_data", ld_data, exctype, exc)
+    assert (exctype == "slow")
+    invalid = exc.invalid
+    assert (invalid == 1)
+
+    print("=== test invalid done ===")
+
+    wbget.stop = True
+
+
+def _test_loadstore1_microwatt_mmu_bin_test2(dut, mem):
+    mmu = dut.submodules.mmu
+    pi = dut.submodules.ldst.pi
+    ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+    wbget.stop = False
+
+    yield mmu.rin.prtbl.eq(0x12000) # set process table
+    yield mmu.rin.pid.eq(0x1)       # set PID=1
+    yield
+
+    addr = 0x124108
+    msr = MSRSpec(pr=1, dr=1, sf=1)
+
+    print("=== alignment error (ld) ===")
+
+    ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+    print("ld_data after mmu.bin test2")
+    print(ld_data)
+    assert ld_data == 0x0000000badc0ffee
+    assert exctype is None
+
+    wbget.stop = True
+
+
+def _test_loadstore1_microwatt_mmu_bin_test5(dut, mem):
+    mmu = dut.submodules.mmu
+    pi = dut.submodules.ldst.pi
+    ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+    wbget.stop = False
+
+    yield mmu.rin.prtbl.eq(0x12000) # set process table
+    yield mmu.rin.pid.eq(0x1)       # set PID=1
+    yield
+
+    addr = 0x39fffd
+    msr = MSRSpec(pr=1, dr=1, sf=1)
+
+    print("=== page-fault alignment error (ld) ===")
+
+    ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+    print("ld_data after mmu.bin test5")
+    print(ld_data)
+    print (exctype, exc)
+
+    wbget.stop = True
+
+
+def test_pi_ld_misalign(pi, addr, data_len, msr):
+    for i in range(0,data_len):
+        ld_data, exctype, exc = yield from pi_ld(pi, addr+i, data_len, msr=msr)
+        yield
+        assert exc is None # use "is None" not "== None"
+        print("MISALIGN: test_pi_ld_misalign returned",hex(ld_data))
+
+
+def test_pi_st_ld_misalign(pi, addr, data_len, msr):
+    data = 0x0102030405060708
+    for i in range(0, data_len):
+        exctype, exc = yield from pi_st(pi, addr+i, data, data_len, msr=msr)
+        print (exctype, exc)
+        assert exc is None # use "is None" not "== None"
+        ld_data, exctype, exc = yield from pi_ld(pi, addr+i, data_len, msr=msr)
+        yield
+        assert exc is None # use "is None" not "== None"
+        print("MISALIGN: test_pi_ld_misalign returned",hex(ld_data))
+        assert ld_data == data
+
+
+def _test_loadstore1_misalign(dut, mem):
+    mmu = dut.submodules.mmu
+    pi = dut.submodules.ldst.pi
+    ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+    wbget.stop = False
+
+    yield mmu.rin.prtbl.eq(0x12000) # set process table
+    yield mmu.rin.pid.eq(0x1)       # set PID=1
+    #yield
+
+    addr = 1
+    msr = MSRSpec(pr=0, dr=0, sf=1)
+
+    yield from test_pi_ld_misalign(pi,0,8,msr)
+
+    yield from test_pi_st_ld_misalign(pi,0,8,msr)
+
+    wbget.stop = True
+
+
+def _test_loadstore1(dut, mem):
+    mmu = dut.submodules.mmu
+    pi = dut.submodules.ldst.pi
+    ldst = dut.submodules.ldst # to get at DAR (NOT part of PortInterface)
+    wbget.stop = False
+
+    yield mmu.rin.prtbl.eq(0x1000000) # set process table
+    yield
+
+    addr = 0x100e0
+    data = 0xf553b658ba7e1f51
+    msr = MSRSpec(pr=0, dr=0, sf=0)
+
+    if test_dcbz:
+        yield from pi_st(pi, addr, data, 8, msr=msr)
+        yield
+
+        ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+        assert ld_data == 0xf553b658ba7e1f51
+        assert exctype is None
+
+        ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+        assert ld_data == 0xf553b658ba7e1f51
+        assert exctype is None
+
+        print("do_dcbz ===============")
+        yield from pi_st(pi, addr, data, 8, msr=msr, is_dcbz=1)
+        print("done_dcbz ===============")
+        yield
+
+        ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+        print("ld_data after dcbz")
+        print(ld_data)
+        assert ld_data == 0
+        assert exctype is None
+
+    if test_exceptions:
+        print("=== alignment error (ld) ===")
+        addr = 0xFF100e0FF
+        ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+        if exc:
+            alignment = exc.alignment
+            happened = exc.happened
+            yield # wait for dsr to update
+            dar = yield ldst.dar
+        else:
+            alignment = 0
+            happened = 0
+            dar = 0
+        assert (happened == 1)
+        assert (alignment == 1)
+        assert (dar == addr)
+        assert (exctype == "fast")
+        yield from wait_busy(pi, debug="pi_ld_E_alignment_error")
+        # wait is only needed in case of in exception here
+        print("=== alignment error test passed (ld) ===")
+
+        # take some cycles in between so that gtkwave separates out
+        # signals
+        yield
+        yield
+        yield
+        yield
+
+        print("=== alignment error (st) ===")
+        addr = 0xFF100e0FF
+        exctype, exc = yield from pi_st(pi, addr,0, 8, msr=msr)
+        if exc:
+            alignment = exc.alignment
+            happened = exc.happened
+        else:
+            alignment = 0
+            happened = 0
+        assert (happened == 1)
+        assert (alignment==1)
+        assert (dar==addr)
+        assert (exctype == "fast")
+        #???? yield from wait_busy(pi, debug="pi_st_E_alignment_error")
+        # wait is only needed in case of in exception here
+        print("=== alignment error test passed (st) ===")
+        yield #FIXME hangs
+
+    if True:
+        print("=== no alignment error (ld) ===")
+        addr = 0x100e0
+        ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+        print("ld_data", ld_data, exctype, exc)
+        if exc:
+            alignment = exc.alignment
+            happened = exc.happened
+        else:
+            alignment = 0
+            happened = 0
+        assert (happened == 0)
+        assert (alignment == 0)
+        print("=== no alignment error done (ld) ===")
+
+    if test_random:
+        addrs = [0x456920,0xa7a180,0x299420,0x1d9d60]
+
+        for addr in addrs:
+            print("== RANDOM addr ==",hex(addr))
+            ld_data, exctype, exc  = yield from pi_ld(pi, addr, 8, msr=msr)
+            print("ld_data[RANDOM]",ld_data,exc,addr)
+            assert (exctype == None)
+
+        for addr in addrs:
+            print("== RANDOM addr ==",hex(addr))
+            exc = yield from pi_st(pi, addr,0xFF*addr, 8, msr=msr)
+            assert (exctype == None)
+
+        # readback written data and compare
+        for addr in addrs:
+            print("== RANDOM addr ==",hex(addr))
+            ld_data, exctype, exc = yield from pi_ld(pi, addr, 8, msr=msr)
+            print("ld_data[RANDOM_READBACK]",ld_data,exc,addr)
+            assert (exctype == None)
+            assert (ld_data == 0xFF*addr)
+
+        print("== RANDOM addr done ==")
+
+    wbget.stop = True
+
+
+def _test_loadstore1_ifetch_invalid(dut, mem):
+    mmu = dut.submodules.mmu
+    ldst = dut.submodules.ldst
+    pi = ldst.pi
+    icache = dut.submodules.ldst.icache
+    wbget.stop = False
+
+    print("=== test loadstore instruction (invalid) ===")
+
+    i_in = icache.i_in
+    i_out  = icache.i_out
+    i_m_in = icache.m_in
+
+    # first virtual memory test
+
+    print ("set process table")
+    yield from debug(dut, "set prtbl")
+    yield mmu.rin.prtbl.eq(0x1000000) # set process table
+    yield
+
+    yield from debug(dut, "real mem instruction")
+    # set address to zero, update mem[0] to 01234
+    addr = 8
+    expected_insn = 0x1234
+    mem[addr] = expected_insn
+
+    yield i_in.priv_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(addr)
+    yield i_in.stop_mark.eq(0)
+    yield i_m_in.tlbld.eq(0)
+    yield i_m_in.tlbie.eq(0)
+    yield i_m_in.addr.eq(0)
+    yield i_m_in.pte.eq(0)
+    yield
+    yield
+    yield
+
+    # miss, stalls for a bit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(addr)
+    yield
+    valid = yield i_out.valid
+    nia   = yield i_out.nia
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
+
+    yield
+    yield
+
+    print ("fetched %x from addr %x" % (insn, nia))
+    assert insn == expected_insn
+
+    print("=== test loadstore instruction (virtual) ===")
+    yield from debug(dut, "virtual instr req")
+
+    # look up i-cache expecting it to fail
+
+    # set address to 0x10200, update mem[] to 5678
+    virt_addr = 0x10200
+    real_addr = virt_addr
+    expected_insn = 0x5678
+    mem[real_addr] = expected_insn
+
+    yield i_in.priv_mode.eq(1)
+    yield i_in.virt_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(virt_addr)
+    yield i_in.stop_mark.eq(0)
+    yield i_m_in.tlbld.eq(0)
+    yield i_m_in.tlbie.eq(0)
+    yield i_m_in.addr.eq(0)
+    yield i_m_in.pte.eq(0)
+    yield
+    yield
+    yield
+
+    # miss, stalls for a bit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(virt_addr)
+    yield
+    valid = yield i_out.valid
+    failed = yield i_out.fetch_failed
+    while not valid and not failed:
+        yield
+        valid = yield i_out.valid
+        failed = yield i_out.fetch_failed
+    yield i_in.req.eq(0)
+
+    print ("failed?", "yes" if failed else "no")
+    assert failed == 1
+    yield
+    yield
+
+    print("=== test invalid loadstore instruction (instruction fault) ===")
+
+    yield from debug(dut, "instr fault (perm err expected)")
+    virt_addr = 0x10200
+
+    yield ldst.priv_mode.eq(0)
+    yield ldst.instr_fault.eq(1)
+    yield ldst.maddr.eq(virt_addr)
+    #ld_data, exctype, exc = yield from pi_ld(pi, virt_addr, 8, msr=msr)
+    yield
+    yield ldst.instr_fault.eq(0)
+    while True:
+        done = yield (ldst.done)
+        exc_info = yield from get_exception_info(pi.exc_o)
+        if done or exc_info.happened:
+            break
+        yield
+    assert exc_info.happened == 1 # different here as expected
+
+    # TODO: work out what kind of exception occurred and check it's
+    # the right one.  we *expect* it to be a permissions error because
+    # the RPTE leaf node in pagetables.test2 is marked as "non-executable"
+    # but we also expect instr_fault to be set because it is an instruction
+    # (iside) lookup
+    print ("   MMU lookup exception type?")
+    for fname in LDSTExceptionTuple._fields:
+        print ("   fname %20s %d" % (fname, getattr(exc_info, fname)))
+
+    # ok now printed them out and visually inspected: check them with asserts
+    assert exc_info.instr_fault == 1 # instruction fault (yes!)
+    assert exc_info.perm_error == 1 # permissions (yes!)
+    assert exc_info.rc_error == 0
+    assert exc_info.alignment == 0
+    assert exc_info.invalid == 0
+    assert exc_info.segment_fault == 0
+    assert exc_info.rc_error == 0
+
+    yield from debug(dut, "test done")
+    yield ldst.instr_fault.eq(0)
+    yield
+    yield
+    yield
+
+    wbget.stop = True
+
+
+def test_loadstore1_ifetch_unit_iface():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.test1
+
+    # set this up before passing to Simulator (which calls elaborate)
+    icache = m.submodules.ldst.icache
+    icache.use_fetch_interface() # this is the function which converts
+                                 # to FetchUnitInterface. *including*
+                                 # rewiring the Wishbone Bus to ibus
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(_test_loadstore1_ifetch_iface(m, mem)))
+    # add two wb_get_classic processes onto the *same* memory dictionary.
+    # this shouuuld work.... cross-fingers...
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.ibus, mem))) # ibus not bus
+    with sim.write_vcd('test_loadstore1_ifetch_iface.vcd',
+                      traces=[m.debug_status]): # include extra debug
+        sim.run()
+
+
+def test_loadstore1_ifetch():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.test1
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    icache = m.submodules.ldst.icache
+    sim.add_sync_process(wrap(_test_loadstore1_ifetch(m, mem)))
+    # add two wb_get_classic processes onto the *same* memory dictionary.
+    # this shouuuld work.... cross-fingers...
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.bus, mem)))
+    with sim.write_vcd('test_loadstore1_ifetch.vcd',
+                      traces=[m.debug_status]): # include extra debug
+        sim.run()
+
+
+def test_loadstore1():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.test1
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(_test_loadstore1(m, mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    with sim.write_vcd('test_loadstore1.vcd'):
+        sim.run()
+
+
+def test_loadstore1_microwatt_mmu_bin_test2():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.microwatt_test2
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(_test_loadstore1_microwatt_mmu_bin_test2(m, mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    with sim.write_vcd('test_microwatt_mmu_test2.vcd'):
+        sim.run()
+
+
+def test_loadstore1_microwatt_mmu_bin_test5():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.microwatt_test5
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(_test_loadstore1_microwatt_mmu_bin_test5(m, mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    with sim.write_vcd('test_microwatt_mmu_test5.vcd'):
+        sim.run()
+
+
+def test_loadstore1_misalign():
+
+    m, cmpi = setup_mmu()
+
+    mem = pagetables.microwatt_test2
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    ###########1122334455667788
+    mem[0] = 0x0102030405060708
+    mem[8] = 0xffffffffffffffff
+
+    sim.add_sync_process(wrap(_test_loadstore1_misalign(m, mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    with sim.write_vcd('test_loadstore1_misalign.vcd'):
+        sim.run()
+    print ("mem", mem)
+
+
+def test_loadstore1_invalid():
+
+    m, cmpi = setup_mmu()
+
+    mem = {}
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(_test_loadstore1_invalid(m, mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    with sim.write_vcd('test_loadstore1_invalid.vcd'):
+        sim.run()
+
+
+def test_loadstore1_ifetch_invalid():
+    m, cmpi = setup_mmu()
+
+    # this is a specially-arranged page table which has the permissions
+    # barred for execute on the leaf node (EAA=0x2 instead of EAA=0x3)
+    mem = pagetables.test2
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    icache = m.submodules.ldst.icache
+    sim.add_sync_process(wrap(_test_loadstore1_ifetch_invalid(m, mem)))
+    # add two wb_get_classic processes onto the *same* memory dictionary.
+    # this shouuuld work.... cross-fingers...
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.bus, mem)))
+    with sim.write_vcd('test_loadstore1_ifetch_invalid.vcd',
+                      traces=[m.debug_status]): # include extra debug
+        sim.run()
+
+
+def test_loadstore1_ifetch_multi():
+    m, cmpi = setup_mmu()
+    wbget.stop = False
+
+    # this is a specially-arranged page table which has the permissions
+    # barred for execute on the leaf node (EAA=0x2 instead of EAA=0x3)
+    mem = pagetables.test1
+
+    # set this up before passing to Simulator (which calls elaborate)
+    icache = m.submodules.ldst.icache
+    icache.use_fetch_interface() # this is the function which converts
+                                 # to FetchUnitInterface. *including*
+                                 # rewiring the Wishbone Bus to ibus
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(_test_loadstore1_ifetch_multi(m, mem)))
+    # add two wb_get_classic processes onto the *same* memory dictionary.
+    # this shouuuld work.... cross-fingers...
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.ibus, mem))) # ibus not bus
+    with sim.write_vcd('test_loadstore1_ifetch_multi.vcd',
+                      traces=[m.debug_status]): # include extra debug
+        sim.run()
+
+if __name__ == '__main__':
+    #test_loadstore1()
+    #test_loadstore1_microwatt_mmu_bin_test2()
+    #test_loadstore1_microwatt_mmu_bin_test5()
+    #test_loadstore1_invalid()
+    #test_loadstore1_ifetch() #FIXME
+    #test_loadstore1_ifetch_invalid()
+    #test_loadstore1_ifetch_unit_iface() # guess: should be working
+    #test_loadstore1_ifetch_multi()
+    test_loadstore1_misalign()
index 1528d7d40db31bbaed8f821a7a90663ef087bb26..e31225f6e369cdc48ef8c7fa0cc2c9867438b989 100644 (file)
@@ -21,15 +21,12 @@ from soc.experiment.mem_types import (LoadStore1ToMMUType,
 from soc.experiment.mmu import MMU
 from soc.experiment.dcache import DCache
 from soc.experiment.icache import ICache
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
 
 import random
 
-stop = False
-
-def set_stop(newval):
-    global stop
-    stop = newval
-
+wbget.stop = False
 
 def b(x):
     return int.from_bytes(x.to_bytes(8, byteorder='little'),
@@ -55,48 +52,13 @@ default_mem = { 0x10000:    # PARTITION_TABLE_2
             }
 
 
-def wb_get(c, mem, name):
-    """simulator process for getting memory load requests
-    """
-
-    logfile = open("/tmp/wb_get.log","w")
-
-    def log(msg):
-        logfile.write(msg+"\n")
-        print(msg)
-
-    global stop
-    while not stop:
-        while True: # wait for dc_valid
-            if stop:
-                log("stop")
-                return
-            cyc = yield (c.wb_out.cyc)
-            stb = yield (c.wb_out.stb)
-            if cyc and stb:
-                break
-            yield
-        addr = (yield c.wb_out.adr) << 3
-        if addr not in mem:
-            log("%s LOOKUP FAIL %x" % (name, addr))
-            stop = True
-            return
-
-        yield
-        data = mem[addr]
-        yield c.wb_in.dat.eq(data)
-        log("%s get %x data %x" % (name, addr, data))
-        yield c.wb_in.ack.eq(1)
-        yield
-        yield c.wb_in.ack.eq(0)
-        yield
-
-
 def icache_sim(dut, mem):
     i_out = dut.i_in
     i_in  = dut.i_out
     m_out = dut.m_in
 
+    wbget.stop = False
+
     for k,v in mem.items():
         yield i_in.valid.eq(0)
         yield i_out.priv_mode.eq(1)
@@ -126,6 +88,7 @@ def icache_sim(dut, mem):
         yield i_out.req.eq(0)
         yield
 
+    wbget.stop = True
 
 def test_icache_il():
     dut = ICache()
@@ -155,19 +118,21 @@ def test_icache():
 
     # read from "memory" process and corresponding wishbone "read" process
     sim.add_sync_process(wrap(icache_sim(icache, mem)))
-    sim.add_sync_process(wrap(wb_get(icache, mem, "ICACHE")))
+    sim.add_sync_process(wrap(wb_get(icache.bus, mem, "ICACHE")))
     with sim.write_vcd('test_icache.vcd'):
         sim.run()
 
 
 def mmu_lookup(mmu, addr):
-    global stop
 
     yield mmu.l_in.load.eq(1)
     yield mmu.l_in.priv.eq(1)
     yield mmu.l_in.addr.eq(addr)
     yield mmu.l_in.valid.eq(1)
-    while not stop: # wait for dc_valid / err
+
+    print ("mmu lookup %x stopped" % addr, wbget.stop)
+    while not wbget.stop: # wait for dc_valid / err
+        print ("stopped", wbget.stop)
         l_done = yield (mmu.l_out.done)
         l_err = yield (mmu.l_out.err)
         l_badtree = yield (mmu.l_out.badtree)
@@ -190,7 +155,7 @@ def mmu_lookup(mmu, addr):
 
 
 def mmu_sim(mmu):
-    global stop
+    wbget.stop = False
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
@@ -199,8 +164,9 @@ def mmu_sim(mmu):
 
     phys_addr = yield from mmu_lookup(mmu, 0x10000)
     assert phys_addr == 0x40000
+    yield
 
-    stop = True
+    wbget.stop = True
 
 
 def test_mmu():
@@ -219,7 +185,8 @@ def test_mmu():
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(mmu_sim(mmu)))
-    sim.add_sync_process(wrap(wb_get(dcache, default_mem, "DCACHE")))
+    sim.add_sync_process(wrap(wb_get(dcache.bus,
+                              default_mem, "DCACHE")))
     with sim.write_vcd('test_mmu.vcd'):
         sim.run()
 
index 2f219e63ebbd91043a54f81d8f06f88e9eaf7939..338480d848d0ae5c03c100666361c784046176f2 100644 (file)
@@ -28,6 +28,8 @@ from soc.experiment.mem_types import (LoadStore1ToMMUType,
 
 from soc.experiment.mmu import MMU
 from soc.experiment.dcache import DCache
+from openpower.test.wb_get import wb_get
+from openpower.test import wb_get as wbget
 
 #more imports 
 
@@ -49,6 +51,28 @@ from nmigen.compat.sim import run_simulation, Settle
 # will take at least one week (10.10.2020)
 # many unconnected signals
 
+def b(x):
+    return int.from_bytes(x.to_bytes(8, byteorder='little'),
+                          byteorder='big', signed=False)
+
+mem = {0x10000:    # PARTITION_TABLE_2
+                   # PATB_GR=1 PRTB=0x1000 PRTS=0xb
+       b(0x800000000100000b),
+
+       0x30000:     # RADIX_ROOT_PTE
+                    # V = 1 L = 0 NLB = 0x400 NLS = 9
+       b(0x8000000000040009),
+
+       0x40000:     # RADIX_SECOND_LEVEL
+                    #     V = 1 L = 1 SW = 0 RPN = 0
+                       # R = 1 C = 1 ATT = 0 EAA 0x7
+       b(0xc000000000000187),
+
+      0x1000000:   # PROCESS_TABLE_3
+                   # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
+       b(0x40000000000300ad),
+      }
+
 
 class TestMicrowattMemoryPortInterface(PortInterfaceBase):
     """TestMicrowattMemoryPortInterface
@@ -61,18 +85,18 @@ class TestMicrowattMemoryPortInterface(PortInterfaceBase):
         self.mmu = mmu
         self.dcache = dcache
 
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
         m.d.comb += self.dcache.d_in.addr.eq(addr)
         m.d.comb += self.mmu.l_in.addr.eq(addr)
         m.d.comb += self.mmu.l_in.load.eq(0)
-        m.d.comb += self.mmu.l_in.priv.eq(1) # TODO put msr_pr here
+        m.d.comb += self.mmu.l_in.priv.eq(~msr.pr) # TODO verify
         m.d.comb += self.mmu.l_in.valid.eq(1)
 
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
         m.d.comb += self.dcache.d_in.addr.eq(addr)
         m.d.comb += self.mmu.l_in.addr.eq(addr)
         m.d.comb += self.mmu.l_in.load.eq(1)
-        m.d.comb += self.mmu.l_in.priv.eq(1) # TODO put msr_pr here
+        m.d.comb += self.mmu.l_in.priv.eq(~msr.pr) # TODO verify
         m.d.comb += self.mmu.l_in.valid.eq(1)
 
     def set_wr_data(self, m, data, wen):
@@ -120,62 +144,11 @@ class TestMicrowattMemoryPortInterface(PortInterfaceBase):
         yield from super().ports()
         # TODO: memory ports
 
-stop = False
-
-
-def wb_get(dc):
-    """simulator process for getting memory load requests
-    """
-
-    global stop
-
-    def b(x):
-        return int.from_bytes(x.to_bytes(8, byteorder='little'),
-                              byteorder='big', signed=False)
-
-    mem = {0x10000:    # PARTITION_TABLE_2
-                       # PATB_GR=1 PRTB=0x1000 PRTS=0xb
-           b(0x800000000100000b),
-
-           0x30000:     # RADIX_ROOT_PTE
-                        # V = 1 L = 0 NLB = 0x400 NLS = 9
-           b(0x8000000000040009),
-
-           0x40000:     # RADIX_SECOND_LEVEL
-                        #         V = 1 L = 1 SW = 0 RPN = 0
-                           # R = 1 C = 1 ATT = 0 EAA 0x7
-           b(0xc000000000000187),
-
-          0x1000000:   # PROCESS_TABLE_3
-                       # RTS1 = 0x2 RPDB = 0x300 RTS2 = 0x5 RPDS = 13
-           b(0x40000000000300ad),
-          }
-
-    while not stop:
-        while True: # wait for dc_valid
-            if stop:
-                return
-            cyc = yield (dc.wb_out.cyc)
-            stb = yield (dc.wb_out.stb)
-            if cyc and stb:
-                break
-            yield
-        addr = (yield dc.wb_out.adr) << 3
-        if addr not in mem:
-            print ("    WB LOOKUP NO entry @ %x, returning zero" % (addr))
-
-        data = mem.get(addr, 0)
-        yield dc.wb_in.dat.eq(data)
-        print ("    DCACHE get %x data %x" % (addr, data))
-        yield dc.wb_in.ack.eq(1)
-        yield
-        yield dc.wb_in.ack.eq(0)
-        yield
+wbget.stop = False
 
 
 def mmu_lookup(dut, addr):
     mmu = dut.mmu
-    global stop
 
     print("pi_ld")
     yield from pi_ld(dut.pi, addr, 1)
@@ -210,7 +183,6 @@ def mmu_lookup(dut, addr):
 
 def mmu_sim(dut):
     mmu = dut.mmu
-    global stop
     yield mmu.rin.prtbl.eq(0x1000000) # set process table
     yield
 
@@ -226,7 +198,7 @@ def mmu_sim(dut):
     phys_addr = yield from mmu_lookup(dut, 0x10000)
     assert phys_addr == 0x40000
 
-    stop = True
+    wbget.stop = True
 
 
 def test_mmu():
@@ -242,7 +214,7 @@ def test_mmu():
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(mmu_sim(dut)))
-    sim.add_sync_process(wrap(wb_get(dcache)))
+    sim.add_sync_process(wrap(wb_get(dcache.bus, mem)))
     with sim.write_vcd('test_mmu_pi.vcd'):
         sim.run()
 
diff --git a/src/soc/experiment/test/test_wishbone.py b/src/soc/experiment/test/test_wishbone.py
new file mode 100644 (file)
index 0000000..d1a9938
--- /dev/null
@@ -0,0 +1,2 @@
+from openpower.test.wb_get import wb_get
+
index 107be930091e65d45dc984a5c4f26903e4ce7a98..ba65373b646dcdde5a90e89161aae7bdea65a578 100644 (file)
@@ -32,7 +32,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth)
+        pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth, parent_pspec=None)
         m.submodules.dut = dut = ALUInputStage(pspec)
 
         a = Signal(64)
@@ -66,6 +66,7 @@ class GTCombinerTestCase(FHDLTestCase):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=4)
         self.assertFormal(module, mode="cover", depth=4)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index 529381eaf0c799a455525bb5326ea5307f594fe4..de8dc54f1c82ea18eb768e40ec183fab119e5049 100644 (file)
@@ -37,20 +37,20 @@ class Driver(Elaboratable):
             width = p.width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = ALUMainStage(pspec)
 
         # convenience variables
         a = dut.i.a
         b = dut.i.b
         ca_in = dut.i.xer_ca[0]   # CA carry in
-        ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
+        ca32_in = dut.i.xer_ca[1]  # CA32 carry in 32
         so_in = dut.i.xer_so      # SO sticky overflow
 
         ca_o = dut.o.xer_ca.data[0]   # CA carry out
-        ca32_o = dut.o.xer_ca.data[1] # CA32 carry out32
+        ca32_o = dut.o.xer_ca.data[1]  # CA32 carry out32
         ov_o = dut.o.xer_ov.data[0]   # OV overflow
-        ov32_o = dut.o.xer_ov.data[1] # OV32 overflow32
+        ov32_o = dut.o.xer_ov.data[1]  # OV32 overflow32
         o = dut.o.o.data
 
         # setup random inputs
@@ -143,6 +143,7 @@ class ALUTestCase(FHDLTestCase):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=2)
         self.assertFormal(module, mode="cover", depth=2)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index 5e32fbfde9d84e0c91dbf5a0171edae5a95f9f7d..eb6f45719553b8545111595cb0d7964a7a45872f 100644 (file)
@@ -38,7 +38,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = ALUOutputStage(pspec)
 
         o = Signal(64)
@@ -103,11 +103,13 @@ class Driver(Elaboratable):
 
         return m
 
+
 class GTCombinerTestCase(FHDLTestCase):
     def test_formal(self):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=4)
         self.assertFormal(module, mode="cover", depth=4)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index 4d5fe2313bb9184e63fff9b3f81422650d1d9ba4..1f17943c4b6116270b052a1486e1d6358627c749 100644 (file)
@@ -13,7 +13,7 @@ from nmigen import (Module, Signal, Cat, Repl, Mux, Const)
 from nmutil.pipemodbase import PipeModBase
 from nmutil.extend import exts, extz
 from soc.fu.alu.pipe_data import ALUInputData, ALUOutputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
 from openpower.decoder.power_enums import MicrOp
 
 from openpower.decoder.power_fields import DecodeFields
@@ -38,6 +38,7 @@ class ALUMainStage(PipeModBase):
         return ALUOutputData(self.pspec) # defines pipeline stage output format
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
 
@@ -69,11 +70,11 @@ class ALUMainStage(PipeModBase):
             comb += b_i.eq(b)                     # into trap pipeline
         with m.Elif(is_32bit):
             with m.If(op.is_signed):
-                comb += a_i.eq(exts(a, 32, 64))
-                comb += b_i.eq(exts(b, 32, 64))
+                comb += a_i.eq(exts(a, 32, XLEN))
+                comb += b_i.eq(exts(b, 32, XLEN))
             with m.Else():
-                comb += a_i.eq(extz(a, 32, 64))
-                comb += b_i.eq(extz(b, 32, 64))
+                comb += a_i.eq(extz(a, 32, XLEN))
+                comb += b_i.eq(extz(b, 32, XLEN))
         with m.Else():
             comb += a_i.eq(a)
             comb += b_i.eq(b)
@@ -94,7 +95,7 @@ class ALUMainStage(PipeModBase):
             #### CMP, CMPL v3.0B p85-86
 
             with m.Case(MicrOp.OP_CMP):
-                a_n = Signal(64) # temporary - inverted a
+                a_n = Signal(XLEN) # temporary - inverted a
                 tval = Signal(5)
                 a_lt = Signal()
                 carry_32 = Signal()
@@ -107,18 +108,21 @@ class ALUMainStage(PipeModBase):
 
                 # this is supposed to be inverted (b-a, not a-b)
                 comb += a_n.eq(~a) # sigh a gets inverted
-                comb += carry_32.eq(add_o[33] ^ a[32] ^ b[32])
-                comb += carry_64.eq(add_o[65])
+                if XLEN == 64:
+                    comb += carry_32.eq(add_o[33] ^ a[32] ^ b[32])
+                else:
+                    comb += carry_32.eq(add_o[XLEN+1])
+                comb += carry_64.eq(add_o[XLEN+1])
 
                 comb += zerolo.eq(~((a_n[0:32] ^ b[0:32]).bool()))
-                comb += zerohi.eq(~((a_n[32:64] ^ b[32:64]).bool()))
+                comb += zerohi.eq(~((a_n[32:XLEN] ^ b[32:XLEN]).bool()))
 
                 with m.If(zerolo & (is_32bit | zerohi)):
                     # values are equal
                     comb += tval[2].eq(1)
                 with m.Else():
-                    comb += msb_a.eq(Mux(is_32bit, a_n[31], a_n[63]))
-                    comb += msb_b.eq(Mux(is_32bit, b[31], b[63]))
+                    comb += msb_a.eq(Mux(is_32bit, a_n[31], a_n[XLEN-1]))
+                    comb += msb_b.eq(Mux(is_32bit, b[31], b[XLEN-1]))
                     C0 = Const(0, 1)
                     with m.If(msb_a != msb_b):
                         # Subtraction might overflow, but
@@ -149,13 +153,21 @@ class ALUMainStage(PipeModBase):
                 # https://bugs.libre-soc.org/show_bug.cgi?id=319#c5
                 ca = Signal(2, reset_less=True)
                 comb += ca[0].eq(add_o[-1])                   # XER.CA
-                comb += ca[1].eq(add_o[33] ^ (a_i[32] ^ b_i[32])) # XER.CA32
+                if XLEN == 64:
+                    comb += ca[1].eq(add_o[33] ^ (a_i[32] ^ b_i[32])) # XER.CA32
+                else:
+                    comb += ca[1].eq(add_o[-1])                   # XER.CA32
                 comb += cry_o.data.eq(ca)
                 comb += cry_o.ok.eq(1)
                 # 32-bit (ov[1]) and 64-bit (ov[0]) overflow
                 ov = Signal(2, reset_less=True)
                 comb += ov[0].eq(calc_ov(a_i[-1], b_i[-1], ca[0], add_o[-2]))
-                comb += ov[1].eq(calc_ov(a_i[31], b_i[31], ca[1], add_o[32]))
+                if XLEN == 64:
+                    comb += ov[1].eq(calc_ov(a_i[31], b_i[31], ca[1],
+                                             add_o[32]))
+                else:
+                    comb += ov[1].eq(calc_ov(a_i[-1], b_i[-1], ca[0],
+                                            add_o[-2]))
                 comb += ov_o.data.eq(ov)
                 comb += ov_o.ok.eq(1)
 
@@ -164,11 +176,11 @@ class ALUMainStage(PipeModBase):
 
             with m.Case(MicrOp.OP_EXTS):
                 with m.If(op.data_len == 1):
-                    comb += o.data.eq(exts(a, 8, 64))
+                    comb += o.data.eq(exts(a, 8, XLEN))
                 with m.If(op.data_len == 2):
-                    comb += o.data.eq(exts(a, 16, 64))
+                    comb += o.data.eq(exts(a, 16, XLEN))
                 with m.If(op.data_len == 4):
-                    comb += o.data.eq(exts(a, 32, 64))
+                    comb += o.data.eq(exts(a, 32, XLEN))
                 comb += o.ok.eq(1) # output register
 
             ###################
index 49444e97b1449eeab0e7ea7927aaa51b18c34116..395b268bdc9fe970540bef08d51a78fed57b25ab 100644 (file)
@@ -4,7 +4,7 @@
 from nmigen import (Module, Signal, Cat, Repl)
 from soc.fu.alu.pipe_data import ALUInputData, ALUOutputData
 from soc.fu.common_output_stage import CommonOutputStage
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
 from openpower.decoder.power_enums import MicrOp
 
 
index 7b1334156c9de77b65a64e4319b03a9386f15a46..572ec9a6bcd18c22202f4a100531e6f843975889 100644 (file)
@@ -3,28 +3,36 @@ from soc.fu.pipe_data import FUBaseData, CommonPipeSpec
 
 
 class ALUInputData(FUBaseData):
-    regspec = [('INT', 'ra', '0:63'), # RA
-               ('INT', 'rb', '0:63'), # RB/immediate
-               ('XER', 'xer_so', '32'), # XER bit 32: SO
-               ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
     def __init__(self, pspec):
         super().__init__(pspec, False)
         # convenience
         self.a, self.b = self.ra, self.rb
 
+    @property
+    def regspec(self):
+        return [('INT', 'ra', self.intrange),  # RA
+               ('INT', 'rb', self.intrange),  # RB/immediate
+               ('XER', 'xer_so', '32'),  # XER bit 32: SO
+               ('XER', 'xer_ca', '34,45')]  # XER bit 34/45: CA/CA32
+
+
 
 class ALUOutputData(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),
-               ('CR', 'cr_a', '0:3'),
-               ('XER', 'xer_ca', '34,45'), # bit0: ca, bit1: ca32
-               ('XER', 'xer_ov', '33,44'), # bit0: ov, bit1: ov32
-               ('XER', 'xer_so', '32')]
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ('XER', 'xer_ca', '34,45'),  # bit0: ca, bit1: ca32
+               ('XER', 'xer_ov', '33,44'),  # bit0: ov, bit1: ov32
+               ('XER', 'xer_so', '32')]
+
+
 
 class ALUPipeSpec(CommonPipeSpec):
-    regspec = (ALUInputData.regspec, ALUOutputData.regspec)
     opsubsetkls = CompALUOpSubset
+    regspecklses = (ALUInputData, ALUOutputData)
index 87ca1356a6e4de3e70b1562a5616ea0ba073e5a0..baaf69c26cde0e599840c9258f716b55d3a93c36 100644 (file)
@@ -4,14 +4,40 @@ from soc.fu.alu.input_stage import ALUInputStage
 from soc.fu.alu.main_stage import ALUMainStage
 from soc.fu.alu.output_stage import ALUOutputStage
 
+
 class ALUStages(PipeModBaseChain):
     def get_chain(self):
         inp = ALUInputStage(self.pspec)
         main = ALUMainStage(self.pspec)
-        return [inp, main]
+        out = ALUOutputStage(self.pspec)
+        return [inp, main, out]
+
+
+class ALUBasePipe(ControlBase):
+    def __init__(self, pspec):
+        ControlBase.__init__(self)
+        self.pspec = pspec
+        self.pipe1 = ALUStages(pspec)
+        self._eqs = self.connect([self.pipe1])
+
+    def elaborate(self, platform):
+        m = ControlBase.elaborate(self, platform)
+        m.submodules.pipe1 = self.pipe1
+        m.d.comb += self._eqs
+        return m
+
+class ALUStages1(PipeModBaseChain):
+    def get_chain(self):
+        inp = ALUInputStage(self.pspec)
+        return [inp]
+
+class ALUStages2(PipeModBaseChain):
+    def get_chain(self):
+        main = ALUMainStage(self.pspec)
+        return [main]
 
 
-class ALUStageEnd(PipeModBaseChain):
+class ALUStages3(PipeModBaseChain):
     def get_chain(self):
         out = ALUOutputStage(self.pspec)
         return [out]
@@ -21,13 +47,16 @@ class ALUBasePipe(ControlBase):
     def __init__(self, pspec):
         ControlBase.__init__(self)
         self.pspec = pspec
-        self.pipe1 = ALUStages(pspec)
-        self.pipe2 = ALUStageEnd(pspec)
-        self._eqs = self.connect([self.pipe1, self.pipe2])
+        self.pipe1 = ALUStages1(pspec)
+        self.pipe2 = ALUStages2(pspec)
+        self.pipe3 = ALUStages3(pspec)
+        self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
 
     def elaborate(self, platform):
         m = ControlBase.elaborate(self, platform)
-        m.submodules.pipe1 = self.pipe1
-        m.submodules.pipe2 = self.pipe2
+        m.submodules.logical_pipe1 = self.pipe1
+        m.submodules.logical_pipe2 = self.pipe2
+        m.submodules.logical_pipe3 = self.pipe3
         m.d.comb += self._eqs
         return m
+
index 422a9a61e7aa34250dd0942d2e2a4df64d6f695d..512e379406dd77c26f6682c7f7146fb8e299444b 100644 (file)
@@ -38,7 +38,7 @@ def get_cu_inputs(dec2, sim):
 def set_alu_inputs(alu, dec2, sim):
     # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
     # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
-    # and place it into data_i.b
+    # and place it into i_data.b
 
     inp = yield from get_cu_inputs(dec2, sim)
     yield from ALUHelpers.set_int_ra(alu, dec2, inp)
@@ -51,7 +51,7 @@ def set_alu_inputs(alu, dec2, sim):
 class ALUIAllCases(ALUTestCase):
 
     def case_ilang(self):
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         alu = ALUBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("alu_pipeline.il", "w") as f:
@@ -60,7 +60,7 @@ class ALUIAllCases(ALUTestCase):
 
 class TestRunner(unittest.TestCase):
 
-    def execute(self, alu,instruction, pdecode2, test):
+    def execute(self, alu, instruction, pdecode2, test):
         program = test.program
         sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
                   test.mem, test.msr,
@@ -88,30 +88,30 @@ class TestRunner(unittest.TestCase):
             fn_unit = yield pdecode2.e.do.fn_unit
             asmcode = yield pdecode2.e.asmcode
             dec_asmcode = yield pdecode2.dec.op.asmcode
-            print ("asmcode", asmcode, dec_asmcode)
+            print("asmcode", asmcode, dec_asmcode)
             self.assertEqual(fn_unit, Function.ALU.value)
             yield from set_alu_inputs(alu, pdecode2, sim)
 
             # set valid for one cycle, propagate through pipeline...
-            yield alu.p.valid_i.eq(1)
+            yield alu.p.i_valid.eq(1)
             yield
-            yield alu.p.valid_i.eq(0)
+            yield alu.p.i_valid.eq(0)
 
             opname = code.split(' ')[0]
             yield from sim.call(opname)
             index = sim.pc.CIA.value//4
 
-            vld = yield alu.n.valid_o
+            vld = yield alu.n.o_valid
             while not vld:
                 yield
-                vld = yield alu.n.valid_o
+                vld = yield alu.n.o_valid
             yield
 
             yield from self.check_alu_outputs(alu, pdecode2, sim, code)
             yield Settle()
 
     def test_it(self):
-        test_data = ALUTestCase().test_data
+        test_data = ALUTestCase({'soc'}).test_data
         m = Module()
         comb = m.d.comb
         instruction = Signal(32)
@@ -120,14 +120,18 @@ class TestRunner(unittest.TestCase):
         opkls = ALUPipeSpec.opsubsetkls
 
         pdecode = create_pdecode()
-        m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode, opkls, fn_name)
+        m.submodules.pdecode2 = pdecode2 = PowerDecode2(
+            pdecode, opkls, fn_name)
         pdecode = pdecode2.dec
 
-        pspec = ALUPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=pps)
         m.submodules.alu = alu = ALUBasePipe(pspec)
 
-        comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
-        comb += alu.n.ready_i.eq(1)
+        comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+        comb += alu.n.i_ready.eq(1)
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
 
@@ -158,8 +162,8 @@ class TestRunner(unittest.TestCase):
         oe_ok = yield dec2.e.do.oe.ok
         if not oe or not oe_ok:
             # if OE not enabled, XER SO and OV must correspondingly be false
-            so_ok = yield alu.n.data_o.xer_so.ok
-            ov_ok = yield alu.n.data_o.xer_ov.ok
+            so_ok = yield alu.n.o_data.xer_so.ok
+            ov_ok = yield alu.n.o_data.xer_ov.ok
             self.assertEqual(so_ok, False, code)
             self.assertEqual(ov_ok, False, code)
 
index 780fcbeace7c2271492863588dbaf3a45ef9637a..739d3b20fe8a15806315c546deef5460a0d5653a 100644 (file)
@@ -32,7 +32,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth)
+        pspec = ALUPipeSpec(id_wid=2, op_wid=recwidth, parent_pspec=None)
         m.submodules.dut = dut = ALUInputStage(pspec)
 
         a = Signal(64)
@@ -64,11 +64,13 @@ class Driver(Elaboratable):
 
         return m
 
+
 class GTCombinerTestCase(FHDLTestCase):
     def test_formal(self):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=4)
         self.assertFormal(module, mode="cover", depth=4)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index 94cf0024bb6a9cd6594884f9373ec47251ce4ea1..0f58e1c049d130e3cc1b46ddccd0bbbc1a6f53dc 100644 (file)
@@ -39,7 +39,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = BranchPipeSpec(id_wid=2)
+        pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = BranchMainStage(pspec)
 
         # convenience aliases
@@ -202,6 +202,7 @@ class LogicalTestCase(FHDLTestCase):
     def test_formal(self):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=2)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index a2f5bcf2508fa09d00f4e43aaf1e610db8f20142..8a6c0071ee51e347379d7f6ca79d9349cd4246ac 100644 (file)
@@ -57,5 +57,5 @@ class BranchOutputData(FUBaseData):
 
 
 class BranchPipeSpec(CommonPipeSpec):
-    regspec = (BranchInputData.regspec, BranchOutputData.regspec)
+    regspecklses = (BranchInputData, BranchOutputData)
     opsubsetkls = CompBROpSubset
index 1cdb3e9a1ff0c0c15219d84505e2c5bdce4d1122..f7c9456ec328d3d6e1fc64a57279fdb7656734d6 100644 (file)
@@ -1,6 +1,26 @@
 from nmutil.singlepipe import ControlBase
 from nmutil.pipemodbase import PipeModBaseChain
 from soc.fu.branch.main_stage import BranchMainStage
+from nmutil.pipemodbase import PipeModBase
+from soc.fu.branch.pipe_data import BranchInputData
+from nmigen import Module
+
+# gives a 1-clock delay to stop combinatorial link between in and out
+class DummyBranchStage(PipeModBase):
+    def __init__(self, pspec): super().__init__(pspec, "dummy")
+    def ispec(self): return BranchInputData(self.pspec)
+    def ospec(self): return BranchInputData(self.pspec)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.d.comb += self.o.eq(self.i) # pass-through output
+        return m
+
+class BranchDummyStages(PipeModBaseChain):
+    def get_chain(self):
+        dummy = DummyBranchStage(self.pspec)
+        return [dummy]
+
 
 class BranchStages(PipeModBaseChain):
     def get_chain(self):
@@ -12,11 +32,13 @@ class BranchBasePipe(ControlBase):
     def __init__(self, pspec):
         ControlBase.__init__(self)
         self.pspec = pspec
-        self.pipe1 = BranchStages(pspec)
-        self._eqs = self.connect([self.pipe1])
+        self.pipe1 = BranchDummyStages(pspec)
+        self.pipe2 = BranchStages(pspec)
+        self._eqs = self.connect([self.pipe1, self.pipe2])
 
     def elaborate(self, platform):
         m = ControlBase.elaborate(self, platform)
-        m.submodules.pipe = self.pipe1
+        m.submodules.pipe1 = self.pipe1
+        m.submodules.pipe2 = self.pipe2
         m.d.comb += self._eqs
         return m
index d144f4076927c3e80498c2fa464569ebf0377998..611ca983b6196d3f07aafe6b6bac0590bca4739d 100644 (file)
@@ -50,7 +50,7 @@ def get_cu_inputs(dec2, sim):
 class BranchAllCases(BranchTestCase):
 
     def case_ilang(self):
-        pspec = BranchPipeSpec(id_wid=2)
+        pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
         alu = BranchBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("branch_pipeline.il", "w") as f:
@@ -59,7 +59,8 @@ class BranchAllCases(BranchTestCase):
 
 class TestRunner(unittest.TestCase):
     def test_it(self):
-        test_data = BranchAllCases().test_data
+        test_data = BranchTestCase().test_data
+        print ("test data", test_data)
         m = Module()
         comb = m.d.comb
         instruction = Signal(32)
@@ -70,12 +71,12 @@ class TestRunner(unittest.TestCase):
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
         pdecode = pdecode2.dec
 
-        pspec = BranchPipeSpec(id_wid=2)
+        pspec = BranchPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.branch = branch = BranchBasePipe(pspec)
 
-        comb += branch.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
-        comb += branch.p.valid_i.eq(1)
-        comb += branch.n.ready_i.eq(1)
+        comb += branch.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+        comb += branch.p.i_valid.eq(1)
+        comb += branch.n.i_ready.eq(1)
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
 
@@ -102,7 +103,7 @@ class TestRunner(unittest.TestCase):
                         print(index)
                         ins, code = instructions[index]
 
-                        print("0x{:X}".format(ins & 0xffffffff))
+                        print("insn 0x{:X}".format(ins & 0xffffffff))
                         print(code)
 
                         # ask the decoder to decode this binary data (endian'd)
@@ -141,11 +142,11 @@ class TestRunner(unittest.TestCase):
             sim.run()
 
     def assert_outputs(self, branch, dec2, sim, prev_nia, code):
-        branch_taken = yield branch.n.data_o.nia.ok
+        branch_taken = yield branch.n.o_data.nia.ok
         sim_branch_taken = prev_nia != sim.pc.CIA
         self.assertEqual(branch_taken, sim_branch_taken, code)
         if branch_taken:
-            branch_addr = yield branch.n.data_o.nia.data
+            branch_addr = yield branch.n.o_data.nia.data
             print(f"real: {branch_addr:x}, sim: {sim.pc.CIA.value:x}")
             self.assertEqual(branch_addr, sim.pc.CIA.value, code)
 
@@ -153,10 +154,10 @@ class TestRunner(unittest.TestCase):
 
         # TODO: this should be checking write_fast2
         lk = yield dec2.e.do.lk
-        branch_lk = yield branch.n.data_o.lr.ok
+        branch_lk = yield branch.n.o_data.lr.ok
         self.assertEqual(lk, branch_lk, code)
         if lk:
-            branch_lr = yield branch.n.data_o.lr.data
+            branch_lr = yield branch.n.o_data.lr.data
             self.assertEqual(sim.spr['LR'], branch_lr, code)
 
     def set_inputs(self, branch, dec2, sim):
index dc17410af790998086d38643518860aab0406cd3..a79179bc3503e60585d0b0748ad7719da36e26f6 100644 (file)
@@ -2,7 +2,7 @@
 # and updating the condition register
 from nmigen import (Module, Signal, Cat, Const)
 from nmutil.pipemodbase import PipeModBase
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
 from openpower.decoder.power_enums import MicrOp
 
 
@@ -11,6 +11,7 @@ class CommonOutputStage(PipeModBase):
         super().__init__(pspec, "output")
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
         op = self.i.ctx.op
@@ -49,7 +50,7 @@ class CommonOutputStage(PipeModBase):
         # XXX ah.  right.  this needs to be done only if the *mode* is 32-bit
         # (an MSR bit)
         # see https://bugs.libre-soc.org/show_bug.cgi?id=424
-        target = Signal(64, reset_less=True)
+        target = Signal(XLEN, reset_less=True)
         #with m.If(op.is_32bit):
         #    comb += target.eq(o[:32])
         #with m.Else():
index bdb0847dd832be7d026a6140ef7e113759090a9d..873a09df23e3ee884e2ba6eaad6936994ec58e49 100644 (file)
@@ -48,6 +48,7 @@ from nmigen.cli import rtlil
 from soc.experiment.compalu_multi import MultiCompUnit
 from openpower.decoder.power_enums import Function
 from soc.config.test.test_loadstore import TestMemPspec
+from nmutil.concurrentunit import ReservationStations2
 
 # pipeline / spec imports
 
@@ -107,103 +108,178 @@ class FunctionUnitBaseSingle(MultiCompUnit):
     note that the rdflags function obtains (dynamically, from instruction
     decoding) which read-register ports are to be requested.  this is not
     ideal (it could be a lot neater) but works for now.
+
+    also note: additional members, fu.rd_latches and fu.wr_latches
+    are replaced, here, by core.py.  those contain the latched
+    read/write register information which the FU needs in order
+    to actually read (and write) the correct register number
     """
 
-    def __init__(self, speckls, pipekls, idx):
+    def __init__(self, speckls, pipekls, idx, parent_pspec):
         alu_name = "alu_%s%d" % (self.fnunit.name.lower(), idx)
-        pspec = speckls(id_wid=2)                # spec (NNNPipeSpec instance)
+        # spec (NNNPipeSpec instance)
+        pspec = speckls(id_wid=2, parent_pspec=parent_pspec)
         opsubset = pspec.opsubsetkls             # get the operand subset class
-        regspec = pspec.regspec                  # get the regspec
+        rsk = pspec.regspecklses        # get the regspec classes
+        regspec = []
+        for kls in rsk:
+            regspec.append(kls(pspec).regspec)
+        print ("regspecs", regspec)
         alu = pipekls(pspec)                     # create actual NNNBasePipe
         self.pspec = pspec
         super().__init__(regspec, alu, opsubset, name=alu_name)  # MultiCompUnit
+        # these are set to None for now: core get_byregfiles fills them in
+        # (for now)
+        self.fu_rdlatches = None
+        self.fu_wrlatches = None
 
 
 ##############################################################
 # TODO: ReservationStations-based (FunctionUnitBaseConcurrent)
 
-class FunctionUnitBaseMulti:
-    pass
+class FunctionUnitBaseMulti(ReservationStations2):
+    """FunctionUnitBaseMulti
+
+    similar to FunctionUnitBaseSingle except it creates a list
+    of MultiCompUnit instances all using the same ALU instance.
+
+    * :speckls:  - the specification.  contains regspec and op subset info,
+                   and contains common "stuff" like the pipeline ctx,
+                   what type of nmutil pipeline base is to be used (etc)
+    * :pipekls:  - the type of pipeline.  actually connects things together
+
+    * :num_rows: - number of ReservationStations wrapped around the FU
+
+    note that it is through MultiCompUnit.get_in/out that we *actually*
+    connect up the association between regspec variable names (defined
+    in the pipe_data).
+
+    note that the rdflags function obtains (dynamically, from instruction
+    decoding) which read-register ports are to be requested.  this is not
+    ideal (it could be a lot neater) but works for now.
+    """
+
+    def __init__(self, speckls, pipekls, num_rows, parent_pspec):
+        id_wid = num_rows.bit_length()
 
+        # spec (NNNPipeSpec instance)
+        pspec = speckls(id_wid=id_wid, parent_pspec=parent_pspec)
+        self.pspec = pspec
+        opsubset = pspec.opsubsetkls        # get the operand subset class
+        rsk = pspec.regspecklses        # get the regspec classes
+        regspec = []
+        for kls in rsk:
+            regspec.append(kls(pspec).regspec)
+        print ("regspecs", regspec)
+        alu = pipekls(pspec)                # create actual NNNBasePipe
+        alu_name = self.fnunit.name.lower()
+        super().__init__(alu, num_rows, alu_name)   # initialise fan-in/fan-out
+        self.cu = []
+        for idx in range(num_rows):
+            alu_name = "alu_%s%d" % (alu_name, idx)
+            palu = self.pseudoalus[idx]
+            cu = MultiCompUnit(regspec, palu, opsubset, name=alu_name,
+                               sync_rw=False)
+            cu.fnunit = self.fnunit
+            cu.fu_muxidx = idx
+            self.cu.append(cu)
+
+    def elaborate(self, platform):
+        m = super().elaborate(platform)
+        # set the muxids so that ReservationStations2 can direct data
+        # without this the incoming data gets routed to the wrong place!
+        # NOTE: for Mask Cancellation this has to be done slightly differently
+        for i, p in enumerate(self.p):
+            m.d.comb += p.i_data.muxid.eq(i)
+        return m
 
 ######################################################################
 ###### actual Function Units: these are "single" stage pipelines #####
 
-class ALUFunctionUnit(FunctionUnitBaseSingle):
+# class ALUFunctionUnit(FunctionUnitBaseSingle):
+
+
+class ALUFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.ALU
 
-    def __init__(self, idx):
-        super().__init__(ALUPipeSpec, ALUBasePipe, idx)
+    def __init__(self, num_rses, parent_pspec):
+        super().__init__(ALUPipeSpec, ALUBasePipe, num_rses, parent_pspec)
 
 
-class LogicalFunctionUnit(FunctionUnitBaseSingle):
+# class LogicalFunctionUnit(FunctionUnitBaseSingle):
+class LogicalFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.LOGICAL
 
-    def __init__(self, idx):
-        super().__init__(LogicalPipeSpec, LogicalBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(LogicalPipeSpec, LogicalBasePipe, idx, parent_pspec)
 
 
-class CRFunctionUnit(FunctionUnitBaseSingle):
+# class CRFunctionUnit(FunctionUnitBaseSingle):
+class CRFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.CR
 
-    def __init__(self, idx):
-        super().__init__(CRPipeSpec, CRBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(CRPipeSpec, CRBasePipe, idx, parent_pspec)
 
 
-class BranchFunctionUnit(FunctionUnitBaseSingle):
+# class BranchFunctionUnit(FunctionUnitBaseSingle):
+class BranchFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.BRANCH
 
-    def __init__(self, idx):
-        super().__init__(BranchPipeSpec, BranchBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(BranchPipeSpec, BranchBasePipe, idx, parent_pspec)
 
 
-class ShiftRotFunctionUnit(FunctionUnitBaseSingle):
+# class ShiftRotFunctionUnit(FunctionUnitBaseSingle):
+class ShiftRotFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.SHIFT_ROT
 
-    def __init__(self, idx):
-        super().__init__(ShiftRotPipeSpec, ShiftRotBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(ShiftRotPipeSpec, ShiftRotBasePipe, idx, parent_pspec)
 
 
 class DivFSMFunctionUnit(FunctionUnitBaseSingle):
     fnunit = Function.DIV
 
-    def __init__(self, idx):
-        super().__init__(DivPipeSpecFSMDivCore, DivBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(DivPipeSpecFSMDivCore, DivBasePipe, idx, parent_pspec)
 
 
 class MMUFSMFunctionUnit(FunctionUnitBaseSingle):
     fnunit = Function.MMU
 
-    def __init__(self, idx):
-        super().__init__(MMUPipeSpec, FSMMMUStage, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(MMUPipeSpec, FSMMMUStage, idx, parent_pspec)
+        self.exc_o = self.alu.exc_o # get at MMU exception
 
 
 class DivPipeFunctionUnit(FunctionUnitBaseSingle):
     fnunit = Function.DIV
 
-    def __init__(self, idx):
-        super().__init__(DivPipeSpecDivPipeCore, DivBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(DivPipeSpecDivPipeCore, DivBasePipe, idx, parent_pspec)
 
 
-class MulFunctionUnit(FunctionUnitBaseSingle):
+# class MulFunctionUnit(FunctionUnitBaseSingle):
+class MulFunctionUnit(FunctionUnitBaseMulti):
     fnunit = Function.MUL
 
-    def __init__(self, idx):
-        super().__init__(MulPipeSpec, MulBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(MulPipeSpec, MulBasePipe, idx, parent_pspec)
 
 
 class TrapFunctionUnit(FunctionUnitBaseSingle):
     fnunit = Function.TRAP
 
-    def __init__(self, idx):
-        super().__init__(TrapPipeSpec, TrapBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(TrapPipeSpec, TrapBasePipe, idx, parent_pspec)
 
 
 class SPRFunctionUnit(FunctionUnitBaseSingle):
     fnunit = Function.SPR
 
-    def __init__(self, idx):
-        super().__init__(SPRPipeSpec, SPRBasePipe, idx)
+    def __init__(self, idx, parent_pspec):
+        super().__init__(SPRPipeSpec, SPRBasePipe, idx, parent_pspec)
 
 
 # special-case: LD/ST conforms to the CompUnit API but is not a pipeline
@@ -211,11 +287,16 @@ class SPRFunctionUnit(FunctionUnitBaseSingle):
 class LDSTFunctionUnit(LDSTCompUnit):
     fnunit = Function.LDST
 
-    def __init__(self, pi, awid, idx):
+    def __init__(self, pi, awid, idx, parent_pspec):
         alu_name = "ldst_%s%d" % (self.fnunit.name.lower(), idx)
-        pspec = LDSTPipeSpec(id_wid=2)           # spec (NNNPipeSpec instance)
+        # spec (NNNPipeSpec instance)
+        pspec = LDSTPipeSpec(id_wid=2, parent_pspec=parent_pspec)
         opsubset = pspec.opsubsetkls             # get the operand subset class
-        regspec = pspec.regspec                  # get the regspec
+        rsk = pspec.regspecklses        # get the regspec classes
+        regspec = []
+        for kls in rsk:
+            regspec.append(kls(pspec).regspec)
+        print ("regspecs", regspec)
         self.opsubsetkls = opsubset
         super().__init__(pi, regspec, awid, opsubset, name=alu_name)
 
@@ -268,10 +349,18 @@ class AllFunctionUnits(Elaboratable):
 
         # create dictionary of Function Units
         self.fus = {}
+        self.actual_alus = {}
         for name, qty in units.items():
             kls = alus[name]
-            for i in range(qty):
-                self.fus["%s%d" % (name, i)] = kls(i)
+            if issubclass(kls, FunctionUnitBaseMulti):
+                # create just the one ALU but many "fronts"
+                fu = kls(qty, parent_pspec=pspec)
+                self.actual_alus[name] = fu  # to be made a module of AllFUs
+                for i in range(qty):
+                    self.fus["%s%d" % (name, i)] = fu.cu[i]
+            else:
+                for i in range(qty):
+                    self.fus["%s%d" % (name, i)] = kls(i, parent_pspec=pspec)
 
         # debug print for MMU ALU
         if microwatt_mmu:
@@ -281,27 +370,31 @@ class AllFunctionUnits(Elaboratable):
         # if any PortInterfaces, we want LDST Units.
         if pilist is None:
             return
-        print ("pilist", pilist)
+        print("pilist", pilist)
         for i, pi in enumerate(pilist):
-            self.fus["ldst%d" % (i)] = LDSTFunctionUnit(pi, addrwid, i)
+            self.fus["ldst%d" % (i)] = LDSTFunctionUnit(pi, addrwid, i, pspec)
 
         # extract exceptions from any FunctionUnits for easy access
         self.excs = {}
         for name, alu in self.fus.items():
             if hasattr(alu, "exc_o"):
-                print ("FU exceptions", name, type(alu.exc_o), alu.exc_o)
+                print("FU exceptions", name, type(alu.exc_o), alu.exc_o)
                 self.excs[name] = alu.exc_o
 
     def get_exc(self, name):
-        return self.excs.get(name, default=None)
+        return self.excs.get(name)
 
     def get_fu(self, name):
         return self.fus.get(name)
 
     def elaborate(self, platform):
         m = Module()
+        # add MultiCompUnit modules (Single CompUnits add their own ALU)
         for (name, fu) in self.fus.items():
-            setattr(m.submodules, name, fu)
+            m.submodules[name] = fu
+        # if any ReservationStations, there is only one ALU per RS so add that
+        for (name, alu) in self.actual_alus.items():
+            m.submodules[name] = alu
         return m
 
     def __iter__(self):
@@ -330,7 +423,7 @@ def tst_single_fus_il():
 def tst_all_fus():
     pspec = TestMemPspec(ldst_ifacetype='testpi',
                          imem_ifacetype='',
-                         addr_wid=48,
+                         addr_wid=64,
                          mask_wid=8,
                          reg_wid=64)
     dut = AllFunctionUnits(pspec)
index 0af6ea6437ea501ff4441f0bd2c08361d5110997..9ec497fa7060a41fe21c6429a3235ab1fb035792 100644 (file)
@@ -114,12 +114,12 @@ class Driver(Elaboratable):
             alu_temp = Signal(16)
             write_req_valid = Signal(reset=0)
             with m.If(~Past(go_die) & Past(busy)):
-                with m.If(Rose(dut.alu.n.valid_o)):
+                with m.If(Rose(dut.alu.n.o_valid)):
                     sync += alu_temp.eq(dut.alu.o)
                     sync += write_req_valid.eq(1)
 
             # write_req_valid should only be high once the alu finishes
-            with m.If(~write_req_valid & ~dut.alu.n.valid_o):
+            with m.If(~write_req_valid & ~dut.alu.n.o_valid):
                 comb += Assert(wr_rel == 0)
 
             # Property 6: Write request release is held up if shadow_n
@@ -149,8 +149,8 @@ class Driver(Elaboratable):
             # then the alu data should be output
             with m.If(Past(wr_rel) & Past(go_wr)):
                 # the alu data is output
-                comb += Assert((dut.data_o == alu_temp)
-                               | (dut.data_o == dut.alu.o))
+                comb += Assert((dut.o_data == alu_temp)
+                               | (dut.o_data == dut.alu.o))
                 # wr_rel is dropped
                 comb += Assert(wr_rel == 0)
                 # busy is dropped.
@@ -216,9 +216,9 @@ class FUTestCase(FHDLTestCase):
             ('alu', {'submodule': 'alu'}, [
                 ('prev port', 'in', [
                     'oper_i_None__insn_type', 'i1[15:0]',
-                    'valid_i', 'ready_o']),
+                    'i_valid', 'o_ready']),
                 ('next port', 'out', [
-                    'alu_o[15:0]', 'valid_o', 'ready_i'])])]
+                    'alu_o[15:0]', 'o_valid', 'i_ready'])])]
 
         write_gtkw('test_fu_formal_bmc.gtkw',
                    os.path.dirname(__file__) +
index 350a611ca9e6fb0cdeda4c1f3603dfc552793a48..133508b704ed89e508a3194ec33ceac9cf5f84f9 100644 (file)
@@ -28,12 +28,12 @@ class MaskGenTestCase(FHDLTestCase):
             yield
             while True:
                 yield
-                rd_rel = yield dut.rd.rel
+                rd_rel = yield dut.rd.rel_o
                 if rd_rel != 0:
                     break
-            yield dut.rd.go.eq(0xfff)
+            yield dut.rd.go_i.eq(0xfff)
             yield
-            yield dut.rd.go.eq(0)
+            yield dut.rd.go_i.eq(0)
             for i in range(10):
                 yield
                 
index 11eb8b3c7f84e81ed1dc5f038e305d12d2c7c3e6..e115c2158e3faec39d9aabf5375a196bcb34dd5e 100644 (file)
@@ -11,6 +11,7 @@ from openpower.decoder.power_decoder import create_pdecode
 from openpower.decoder.power_decoder2 import PowerDecode2, get_rdflags
 from openpower.decoder.power_enums import Function
 from openpower.decoder.isa.all import ISA
+from openpower.decoder.isa.mem import Mem
 
 from soc.experiment.compalu_multi import find_ok  # hack
 from soc.config.test.test_loadstore import TestMemPspec
@@ -88,8 +89,8 @@ def get_cu_outputs(cu, code):
     # pipelines (or FSMs) the write mask is only valid at that time.
     if hasattr(cu, "alu"): # ALU CompUnits
         while True:
-            valid_o = yield cu.alu.n.valid_o
-            if valid_o:
+            o_valid = yield cu.alu.n.o_valid
+            if o_valid:
                 break
             yield
     else: # LDST CompUnit
@@ -137,15 +138,17 @@ def get_l0_mem(l0):  # BLECH! this is awful! hunting around through structures
     return mem.mem
 
 
-def setup_test_memory(l0, sim):
+def setup_tst_memory(l0, test_mem):
+    # create independent Sim Mem from test values
+    sim_mem = Mem(initial_mem=test_mem)
     mem = get_l0_mem(l0)
     print("before, init mem", mem.depth, mem.width, mem)
     for i in range(mem.depth):
-        data = sim.mem.ld(i*8, 8, False)
+        data = sim_mem.ld(i*8, 8, False)
         print("init ", i, hex(data))
         yield mem._array[i].eq(data)
     yield Settle()
-    for k, v in sim.mem.mem.items():
+    for k, v in sim_mem.mem.items():
         print("    %6x %016x" % (k, v))
     print("before, nmigen mem dump")
     for i in range(mem.depth):
@@ -184,7 +187,7 @@ class TestRunner(FHDLTestCase):
         self.funit = funit
         self.bigendian = bigendian
 
-    def execute(self, cu, l0, instruction, pdecode2, simdec2, test):
+    def execute(self, m, cu, l0, instruction, pdecode2, simdec2, test):
 
         program = test.program
         print("test", test.name, test.mem)
@@ -199,7 +202,7 @@ class TestRunner(FHDLTestCase):
 
         # initialise memory
         if self.funit == Function.LDST:
-            yield from setup_test_memory(l0, sim)
+            yield from setup_tst_memory(l0, test.mem)
 
         pc = sim.pc.CIA.value
         index = pc//4
@@ -226,9 +229,9 @@ class TestRunner(FHDLTestCase):
                 fast_out2 = yield pdecode2.e.write_fast2.data
                 fast_out2_ok = yield pdecode2.e.write_fast2.ok
                 print("lk:", lk, fast_out2, fast_out2_ok)
-                op_lk = yield cu.alu.pipe1.p.data_i.ctx.op.lk
+                op_lk = yield cu.alu.pipe1.p.i_data.ctx.op.lk
                 print("op_lk:", op_lk)
-                print(dir(cu.alu.pipe1.n.data_o))
+                print(dir(cu.alu.pipe1.n.o_data))
             fn_unit = yield pdecode2.e.do.fn_unit
             fuval = self.funit.value
             self.assertEqual(fn_unit & fuval, fuval)
@@ -236,7 +239,7 @@ class TestRunner(FHDLTestCase):
             # set operand and get inputs
             yield from set_operand(cu, pdecode2, sim)
             # reset read-operand mask
-            rdmask = get_rdflags(pdecode2.e, cu)
+            rdmask = get_rdflags(m, pdecode2.e, cu)
             #print ("hardcoded rdmask", cu.rdflags(pdecode2.e))
             #print ("decoder rdmask", rdmask)
             yield cu.rdmaskn.eq(~rdmask)
@@ -298,8 +301,8 @@ class TestRunner(FHDLTestCase):
 
             # debugging issue with branch
             if self.funit == Function.BRANCH:
-                lr = yield cu.alu.pipe1.n.data_o.lr.data
-                lr_ok = yield cu.alu.pipe1.n.data_o.lr.ok
+                lr = yield cu.alu.pipe1.n.o_data.lr.data
+                lr_ok = yield cu.alu.pipe1.n.o_data.lr.ok
                 print("lr:", hex(lr), lr_ok)
 
             if self.funit == Function.LDST:
@@ -341,7 +344,7 @@ class TestRunner(FHDLTestCase):
             m.d.comb += cu.ad.go_i.eq(cu.ad.rel_o)  # link addr direct to rel
             m.d.comb += cu.st.go_i.eq(cu.st.rel_o)  # link store direct to rel
         else:
-            m.submodules.cu = cu = self.fukls(0)
+            m.submodules.cu = cu = self.fukls(0, parent_pspec=None)
             l0 = None
 
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
@@ -356,7 +359,7 @@ class TestRunner(FHDLTestCase):
             for test in self.test_data:
                 print(test.name)
                 with self.subTest(test.name):
-                    yield from self.execute(cu, l0, instruction,
+                    yield from self.execute(m, cu, l0, instruction,
                                             pdecode2, simdec2,
                                             test)
 
index ab6da5a0dc7d140f4a3cc5e4c0af12ba28776c3a..3b4d968c96df1664bd98666b69d3b7f0def8653c 100644 (file)
@@ -1,7 +1,7 @@
 import unittest
 from openpower.decoder.power_enums import (XER_bits, Function)
 
-from soc.fu.div.test.test_pipe_caller import get_cu_inputs
+from soc.fu.div.test.helper import get_cu_inputs
 from soc.fu.div.test.test_pipe_caller import DivTestCases  # creates the tests
 
 from openpower.test.common import ALUHelpers
index 0a46716530ef146993e9618f0e5595383cd48867..fa44c4d3fb685b7f89bcacca12beafb8b5ef65a1 100644 (file)
@@ -37,7 +37,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = CRMainStage(pspec)
 
         full_cr_in = Signal(32)
@@ -85,7 +85,6 @@ class Driver(Elaboratable):
                 # into cr_a
                 comb += dut.i.cr_a.eq(cr_input_arr[bc])
 
-
             # For OP_CROP, we need to input the corresponding CR
             # registers for BA, BB, and BT
             with m.Case(MicrOp.OP_CROP):
@@ -172,7 +171,7 @@ class Driver(Elaboratable):
                             comb += Assert(o[4*i:4*i+4] == cr[4*i:4*i+4])
                         with m.Else():
                             comb += Assert(o[4*i:4*i+4] == 0)
-                with m.Else(): # mfcrf
+                with m.Else():  # mfcrf
                     comb += Assert(o == cr)
                 comb += o_ok.eq(1)
 
@@ -237,7 +236,7 @@ class Driver(Elaboratable):
 
             with m.Case(MicrOp.OP_SETB):
                 with m.If(cr_arr[4*bfa]):
-                    comb += Assert(o == ((1<<64)-1))
+                    comb += Assert(o == ((1 << 64)-1))
                 with m.Elif(cr_arr[4*bfa+1]):
                     comb += Assert(o == 1)
                 with m.Else():
@@ -256,6 +255,7 @@ class CRTestCase(FHDLTestCase):
     def test_formal(self):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=2)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index edcad2e9aa6a1c84c53b0e73f5925eea7e981c30..f1c6d349201915764682ae5079cc1753f9c94b10 100644 (file)
@@ -30,5 +30,5 @@ class CROutputData(FUBaseData):
 
 
 class CRPipeSpec(CommonPipeSpec):
-    regspec = (CRInputData.regspec, CROutputData.regspec)
+    regspecklses = (CRInputData, CROutputData)
     opsubsetkls = CompCROpSubset
index 6352698006c7808cd808d83dab226a4d91a3dfe6..9a92d2d6dbdacfdf1478ac99a82ce839245d97ef 100644 (file)
@@ -24,7 +24,7 @@ from openpower.test.cr.cr_cases import CRTestCase
 class CRIlangCase(TestAccumulatorBase):
 
     def case_ilang(self):
-        pspec = CRPipeSpec(id_wid=2)
+        pspec = CRPipeSpec(id_wid=2, parent_pspec=None)
         alu = CRBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("cr_pipeline.il", "w") as f:
@@ -76,10 +76,10 @@ class TestRunner(unittest.TestCase):
 
         cr_en = yield dec2.e.write_cr.ok
         if whole_reg_ok:
-            full_cr = yield alu.n.data_o.full_cr.data & full_cr_mask
+            full_cr = yield alu.n.o_data.full_cr.data & full_cr_mask
             expected_cr = simulator.cr.value
-            print("CR whole: expected %x, actual: %x mask: %x" % \
-                (expected_cr, full_cr, full_cr_mask))
+            print("CR whole: expected %x, actual: %x mask: %x" %
+                  (expected_cr, full_cr, full_cr_mask))
             # HACK: only look at the bits that we expected to change
             self.assertEqual(expected_cr & full_cr_mask, full_cr, code)
         elif cr_en:
@@ -87,10 +87,10 @@ class TestRunner(unittest.TestCase):
             expected_cr = simulator.cr.value
             print(f"CR whole: {expected_cr:x}, sel {cr_sel}")
             expected_cr = simulator.crl[cr_sel].get_range().value
-            real_cr = yield alu.n.data_o.cr.data
+            real_cr = yield alu.n.o_data.cr.data
             print(f"CR part: expected {expected_cr:x}, actual: {real_cr:x}")
             self.assertEqual(expected_cr, real_cr, code)
-        alu_out = yield alu.n.data_o.o.data
+        alu_out = yield alu.n.o_data.o.data
         out_reg_valid = yield dec2.e.write_reg.ok
         if out_reg_valid:
             write_reg_idx = yield dec2.e.write_reg.data
@@ -118,7 +118,7 @@ class TestRunner(unittest.TestCase):
             yield instruction.eq(ins)          # raw binary instr.
             yield Settle()
             yield from self.set_inputs(alu, pdecode2, sim)
-            yield alu.p.valid_i.eq(1)
+            yield alu.p.i_valid.eq(1)
             fn_unit = yield pdecode2.e.do.fn_unit
             self.assertEqual(fn_unit, Function.CR.value, code)
             yield
@@ -126,10 +126,10 @@ class TestRunner(unittest.TestCase):
             yield from sim.call(opname)
             index = sim.pc.CIA.value//4
 
-            vld = yield alu.n.valid_o
+            vld = yield alu.n.o_valid
             while not vld:
                 yield
-                vld = yield alu.n.valid_o
+                vld = yield alu.n.o_valid
             yield
             yield from self.assert_outputs(alu, pdecode2, sim, code)
 
@@ -144,11 +144,11 @@ class TestRunner(unittest.TestCase):
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
         pdecode = pdecode2.dec
 
-        pspec = CRPipeSpec(id_wid=2)
+        pspec = CRPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.alu = alu = CRBasePipe(pspec)
 
-        comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
-        comb += alu.n.ready_i.eq(1)
+        comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+        comb += alu.n.i_ready.eq(1)
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
 
index 9f63a63117ac5e0d4a72f4e7d0dc901d721fd504..e271876b26e41b1f74a6bd919dd985691cae00b0 100644 (file)
@@ -3,7 +3,7 @@
 
 from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
 from nmutil.pipemodbase import PipeModBase
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
 from openpower.decoder.power_enums import MicrOp
 
 from openpower.decoder.power_fields import DecodeFields
diff --git a/src/soc/fu/div/experiment/__init__.py b/src/soc/fu/div/experiment/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/soc/fu/div/experiment/goldschmidt_div_sqrt.py b/src/soc/fu/div/experiment/goldschmidt_div_sqrt.py
new file mode 100644 (file)
index 0000000..3f7c248
--- /dev/null
@@ -0,0 +1,1552 @@
+# SPDX-License-Identifier: LGPL-3-or-later
+# Copyright 2022 Jacob Lifshay programmerjake@gmail.com
+
+# Funded by NLnet Assure Programme 2021-02-052, https://nlnet.nl/assure part
+# of Horizon 2020 EU Programme 957073.
+
+from collections import defaultdict
+import logging
+import math
+import enum
+from fractions import Fraction
+from types import FunctionType
+from functools import lru_cache
+from nmigen.hdl.ast import Signal, unsigned, signed, Const
+from nmigen.hdl.dsl import Module, Elaboratable
+from nmigen.hdl.mem import Memory
+from nmutil.clz import CLZ
+from nmutil.plain_data import plain_data, fields, replace
+
+try:
+    from functools import cached_property
+except ImportError:
+    from cached_property import cached_property
+
+# fix broken IDE type detection for cached_property
+from typing import TYPE_CHECKING, Any
+if TYPE_CHECKING:
+    from functools import cached_property
+
+
+_NOT_FOUND = object()
+
+
+def cache_on_self(func):
+    """like `functools.cached_property`, except for methods. unlike
+    `lru_cache` the cache is per-class instance rather than a global cache
+    per-method."""
+
+    assert isinstance(func, FunctionType), \
+        "non-plain methods are not supported"
+
+    cache_name = func.__name__ + "__cache"
+
+    def wrapper(self, *args, **kwargs):
+        # specifically access through `__dict__` to bypass frozen=True
+        cache = self.__dict__.get(cache_name, _NOT_FOUND)
+        if cache is _NOT_FOUND:
+            self.__dict__[cache_name] = cache = {}
+        key = (args, *kwargs.items())
+        retval = cache.get(key, _NOT_FOUND)
+        if retval is _NOT_FOUND:
+            retval = func(self, *args, **kwargs)
+            cache[key] = retval
+        return retval
+
+    wrapper.__doc__ = func.__doc__
+    return wrapper
+
+
+@enum.unique
+class RoundDir(enum.Enum):
+    DOWN = enum.auto()
+    UP = enum.auto()
+    NEAREST_TIES_UP = enum.auto()
+    ERROR_IF_INEXACT = enum.auto()
+
+
+@plain_data(frozen=True, eq=False, repr=False)
+class FixedPoint:
+    __slots__ = "bits", "frac_wid"
+
+    def __init__(self, bits, frac_wid):
+        self.bits = bits
+        self.frac_wid = frac_wid
+        assert isinstance(self.bits, int)
+        assert isinstance(self.frac_wid, int) and self.frac_wid >= 0
+
+    @staticmethod
+    def cast(value):
+        """convert `value` to a fixed-point number with enough fractional
+        bits to preserve its value."""
+        if isinstance(value, FixedPoint):
+            return value
+        if isinstance(value, int):
+            return FixedPoint(value, 0)
+        if isinstance(value, str):
+            value = value.strip()
+            neg = value.startswith("-")
+            if neg or value.startswith("+"):
+                value = value[1:]
+            if value.startswith(("0x", "0X")) and "." in value:
+                value = value[2:]
+                got_dot = False
+                bits = 0
+                frac_wid = 0
+                for digit in value:
+                    if digit == "_":
+                        continue
+                    if got_dot:
+                        if digit == ".":
+                            raise ValueError("too many `.` in string")
+                        frac_wid += 4
+                    if digit == ".":
+                        got_dot = True
+                        continue
+                    if not digit.isalnum():
+                        raise ValueError("invalid hexadecimal digit")
+                    bits <<= 4
+                    bits |= int("0x" + digit, base=16)
+            else:
+                bits = int(value, base=0)
+                frac_wid = 0
+            if neg:
+                bits = -bits
+            return FixedPoint(bits, frac_wid)
+
+        if isinstance(value, float):
+            n, d = value.as_integer_ratio()
+            log2_d = d.bit_length() - 1
+            assert d == 1 << log2_d, ("d isn't a power of 2 -- won't ever "
+                                      "fail with float being IEEE 754")
+            return FixedPoint(n, log2_d)
+        raise TypeError("can't convert type to FixedPoint")
+
+    @staticmethod
+    def with_frac_wid(value, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+        """convert `value` to the nearest fixed-point number with `frac_wid`
+        fractional bits, rounding according to `round_dir`."""
+        assert isinstance(frac_wid, int) and frac_wid >= 0
+        assert isinstance(round_dir, RoundDir)
+        if isinstance(value, Fraction):
+            numerator = value.numerator
+            denominator = value.denominator
+        else:
+            value = FixedPoint.cast(value)
+            numerator = value.bits
+            denominator = 1 << value.frac_wid
+        if denominator < 0:
+            numerator = -numerator
+            denominator = -denominator
+        bits, remainder = divmod(numerator << frac_wid, denominator)
+        if round_dir == RoundDir.DOWN:
+            pass
+        elif round_dir == RoundDir.UP:
+            if remainder != 0:
+                bits += 1
+        elif round_dir == RoundDir.NEAREST_TIES_UP:
+            if remainder * 2 >= denominator:
+                bits += 1
+        elif round_dir == RoundDir.ERROR_IF_INEXACT:
+            if remainder != 0:
+                raise ValueError("inexact conversion")
+        else:
+            assert False, "unimplemented round_dir"
+        return FixedPoint(bits, frac_wid)
+
+    def to_frac_wid(self, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+        """convert to the nearest fixed-point number with `frac_wid`
+        fractional bits, rounding according to `round_dir`."""
+        return FixedPoint.with_frac_wid(self, frac_wid, round_dir)
+
+    def __float__(self):
+        # use truediv to get correct result even when bits
+        # and frac_wid are huge
+        return float(self.bits / (1 << self.frac_wid))
+
+    def as_fraction(self):
+        return Fraction(self.bits, 1 << self.frac_wid)
+
+    def cmp(self, rhs):
+        """compare self with rhs, returning a positive integer if self is
+        greater than rhs, zero if self is equal to rhs, and a negative integer
+        if self is less than rhs."""
+        rhs = FixedPoint.cast(rhs)
+        common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+        lhs = self.to_frac_wid(common_frac_wid)
+        rhs = rhs.to_frac_wid(common_frac_wid)
+        return lhs.bits - rhs.bits
+
+    def __eq__(self, rhs):
+        return self.cmp(rhs) == 0
+
+    def __ne__(self, rhs):
+        return self.cmp(rhs) != 0
+
+    def __gt__(self, rhs):
+        return self.cmp(rhs) > 0
+
+    def __lt__(self, rhs):
+        return self.cmp(rhs) < 0
+
+    def __ge__(self, rhs):
+        return self.cmp(rhs) >= 0
+
+    def __le__(self, rhs):
+        return self.cmp(rhs) <= 0
+
+    def fract(self):
+        """return the fractional part of `self`.
+        that is `self - math.floor(self)`.
+        """
+        fract_mask = (1 << self.frac_wid) - 1
+        return FixedPoint(self.bits & fract_mask, self.frac_wid)
+
+    def __str__(self):
+        if self < 0:
+            return "-" + str(-self)
+        digit_bits = 4
+        frac_digit_count = (self.frac_wid + digit_bits - 1) // digit_bits
+        fract = self.fract().to_frac_wid(frac_digit_count * digit_bits)
+        frac_str = hex(fract.bits)[2:].zfill(frac_digit_count)
+        return hex(math.floor(self)) + "." + frac_str
+
+    def __repr__(self):
+        return f"FixedPoint.with_frac_wid({str(self)!r}, {self.frac_wid})"
+
+    def __add__(self, rhs):
+        rhs = FixedPoint.cast(rhs)
+        common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+        lhs = self.to_frac_wid(common_frac_wid)
+        rhs = rhs.to_frac_wid(common_frac_wid)
+        return FixedPoint(lhs.bits + rhs.bits, common_frac_wid)
+
+    def __radd__(self, lhs):
+        # symmetric
+        return self.__add__(lhs)
+
+    def __neg__(self):
+        return FixedPoint(-self.bits, self.frac_wid)
+
+    def __sub__(self, rhs):
+        rhs = FixedPoint.cast(rhs)
+        common_frac_wid = max(self.frac_wid, rhs.frac_wid)
+        lhs = self.to_frac_wid(common_frac_wid)
+        rhs = rhs.to_frac_wid(common_frac_wid)
+        return FixedPoint(lhs.bits - rhs.bits, common_frac_wid)
+
+    def __rsub__(self, lhs):
+        # a - b == -(b - a)
+        return -self.__sub__(lhs)
+
+    def __mul__(self, rhs):
+        rhs = FixedPoint.cast(rhs)
+        return FixedPoint(self.bits * rhs.bits, self.frac_wid + rhs.frac_wid)
+
+    def __rmul__(self, lhs):
+        # symmetric
+        return self.__mul__(lhs)
+
+    def __floor__(self):
+        return self.bits >> self.frac_wid
+
+    def div(self, rhs, frac_wid, round_dir=RoundDir.ERROR_IF_INEXACT):
+        assert isinstance(frac_wid, int) and frac_wid >= 0
+        assert isinstance(round_dir, RoundDir)
+        rhs = FixedPoint.cast(rhs)
+        return FixedPoint.with_frac_wid(self.as_fraction()
+                                        / rhs.as_fraction(),
+                                        frac_wid, round_dir)
+
+    def sqrt(self, round_dir=RoundDir.ERROR_IF_INEXACT):
+        assert isinstance(round_dir, RoundDir)
+        if self < 0:
+            raise ValueError("can't compute sqrt of negative number")
+        if self == 0:
+            return self
+        retval = FixedPoint(0, self.frac_wid)
+        int_part_wid = self.bits.bit_length() - self.frac_wid
+        first_bit_index = -(-int_part_wid // 2)  # division rounds up
+        last_bit_index = -self.frac_wid
+        for bit_index in range(first_bit_index, last_bit_index - 1, -1):
+            trial = retval + FixedPoint(1 << (bit_index + self.frac_wid),
+                                        self.frac_wid)
+            if trial * trial <= self:
+                retval = trial
+        if round_dir == RoundDir.DOWN:
+            pass
+        elif round_dir == RoundDir.UP:
+            if retval * retval < self:
+                retval += FixedPoint(1, self.frac_wid)
+        elif round_dir == RoundDir.NEAREST_TIES_UP:
+            half_way = retval + FixedPoint(1, self.frac_wid + 1)
+            if half_way * half_way <= self:
+                retval += FixedPoint(1, self.frac_wid)
+        elif round_dir == RoundDir.ERROR_IF_INEXACT:
+            if retval * retval != self:
+                raise ValueError("inexact sqrt")
+        else:
+            assert False, "unimplemented round_dir"
+        return retval
+
+    def rsqrt(self, round_dir=RoundDir.ERROR_IF_INEXACT):
+        """compute the reciprocal-sqrt of `self`"""
+        assert isinstance(round_dir, RoundDir)
+        if self < 0:
+            raise ValueError("can't compute rsqrt of negative number")
+        if self == 0:
+            raise ZeroDivisionError("can't compute rsqrt of zero")
+        retval = FixedPoint(0, self.frac_wid)
+        first_bit_index = -(-self.frac_wid // 2)  # division rounds up
+        last_bit_index = -self.frac_wid
+        for bit_index in range(first_bit_index, last_bit_index - 1, -1):
+            trial = retval + FixedPoint(1 << (bit_index + self.frac_wid),
+                                        self.frac_wid)
+            if trial * trial * self <= 1:
+                retval = trial
+        if round_dir == RoundDir.DOWN:
+            pass
+        elif round_dir == RoundDir.UP:
+            if retval * retval * self < 1:
+                retval += FixedPoint(1, self.frac_wid)
+        elif round_dir == RoundDir.NEAREST_TIES_UP:
+            half_way = retval + FixedPoint(1, self.frac_wid + 1)
+            if half_way * half_way * self <= 1:
+                retval += FixedPoint(1, self.frac_wid)
+        elif round_dir == RoundDir.ERROR_IF_INEXACT:
+            if retval * retval * self != 1:
+                raise ValueError("inexact rsqrt")
+        else:
+            assert False, "unimplemented round_dir"
+        return retval
+
+
+class ParamsNotAccurateEnough(Exception):
+    """raised when the parameters aren't accurate enough to have goldschmidt
+    division work."""
+
+
+def _assert_accuracy(condition, msg="not accurate enough"):
+    if condition:
+        return
+    raise ParamsNotAccurateEnough(msg)
+
+
+@plain_data(frozen=True, unsafe_hash=True)
+class GoldschmidtDivParamsBase:
+    """parameters for a Goldschmidt division algorithm, excluding derived
+    parameters.
+    """
+
+    __slots__ = ("io_width", "extra_precision", "table_addr_bits",
+                 "table_data_bits", "iter_count")
+
+    def __init__(self, io_width, extra_precision, table_addr_bits,
+                 table_data_bits, iter_count):
+        assert isinstance(io_width, int)
+        assert isinstance(extra_precision, int)
+        assert isinstance(table_addr_bits, int)
+        assert isinstance(table_data_bits, int)
+        assert isinstance(iter_count, int)
+        self.io_width = io_width
+        """bit-width of the input divisor and the result.
+        the input numerator is `2 * io_width`-bits wide.
+        """
+
+        self.extra_precision = extra_precision
+        """number of bits of additional precision used inside the algorithm."""
+
+        self.table_addr_bits = table_addr_bits
+        """the number of address bits used in the lookup-table."""
+
+        self.table_data_bits = table_data_bits
+        """the number of data bits used in the lookup-table."""
+
+        self.iter_count = iter_count
+        """the total number of iterations of the division algorithm's loop"""
+
+
+@plain_data(frozen=True, unsafe_hash=True)
+class GoldschmidtDivParams(GoldschmidtDivParamsBase):
+    """parameters for a Goldschmidt division algorithm.
+    Use `GoldschmidtDivParams.get` to find a efficient set of parameters.
+    """
+
+    __slots__ = "table", "ops"
+
+    def _shrink_bound(self, bound, round_dir):
+        """prevent fractions from having huge numerators/denominators by
+        rounding to a `FixedPoint` and converting back to a `Fraction`.
+
+        This is intended only for values used to compute bounds, and not for
+        values that end up in the hardware.
+        """
+        assert isinstance(bound, (Fraction, int))
+        assert round_dir is RoundDir.DOWN or round_dir is RoundDir.UP, \
+            "you shouldn't use that round_dir on bounds"
+        frac_wid = self.io_width * 4 + 100  # should be enough precision
+        fixed = FixedPoint.with_frac_wid(bound, frac_wid, round_dir)
+        return fixed.as_fraction()
+
+    def _shrink_min(self, min_bound):
+        """prevent fractions used as minimum bounds from having huge
+        numerators/denominators by rounding down to a `FixedPoint` and
+        converting back to a `Fraction`.
+
+        This is intended only for values used to compute bounds, and not for
+        values that end up in the hardware.
+        """
+        return self._shrink_bound(min_bound, RoundDir.DOWN)
+
+    def _shrink_max(self, max_bound):
+        """prevent fractions used as maximum bounds from having huge
+        numerators/denominators by rounding up to a `FixedPoint` and
+        converting back to a `Fraction`.
+
+        This is intended only for values used to compute bounds, and not for
+        values that end up in the hardware.
+        """
+        return self._shrink_bound(max_bound, RoundDir.UP)
+
+    @property
+    def table_addr_count(self):
+        """number of distinct addresses in the lookup-table."""
+        # used while computing self.table, so can't just do len(self.table)
+        return 1 << self.table_addr_bits
+
+    def table_input_exact_range(self, addr):
+        """return the range of inputs as `Fraction`s used for the table entry
+        with address `addr`."""
+        assert isinstance(addr, int)
+        assert 0 <= addr < self.table_addr_count
+        _assert_accuracy(self.io_width >= self.table_addr_bits)
+        addr_shift = self.io_width - self.table_addr_bits
+        min_numerator = (1 << self.io_width) + (addr << addr_shift)
+        denominator = 1 << self.io_width
+        values_per_table_entry = 1 << addr_shift
+        max_numerator = min_numerator + values_per_table_entry - 1
+        min_input = Fraction(min_numerator, denominator)
+        max_input = Fraction(max_numerator, denominator)
+        min_input = self._shrink_min(min_input)
+        max_input = self._shrink_max(max_input)
+        assert 1 <= min_input <= max_input < 2
+        return min_input, max_input
+
+    def table_value_exact_range(self, addr):
+        """return the range of values as `Fraction`s used for the table entry
+        with address `addr`."""
+        min_input, max_input = self.table_input_exact_range(addr)
+        # division swaps min/max
+        min_value = 1 / max_input
+        max_value = 1 / min_input
+        min_value = self._shrink_min(min_value)
+        max_value = self._shrink_max(max_value)
+        assert 0.5 < min_value <= max_value <= 1
+        return min_value, max_value
+
+    def table_exact_value(self, index):
+        min_value, max_value = self.table_value_exact_range(index)
+        # we round down
+        return min_value
+
+    def __init__(self, io_width, extra_precision, table_addr_bits,
+                 table_data_bits, iter_count):
+        super().__init__(io_width=io_width,
+                         extra_precision=extra_precision,
+                         table_addr_bits=table_addr_bits,
+                         table_data_bits=table_data_bits,
+                         iter_count=iter_count)
+        _assert_accuracy(self.io_width >= 1, "io_width out of range")
+        _assert_accuracy(self.extra_precision >= 0,
+                         "extra_precision out of range")
+        _assert_accuracy(self.table_addr_bits >= 1,
+                         "table_addr_bits out of range")
+        _assert_accuracy(self.table_data_bits >= 1,
+                         "table_data_bits out of range")
+        _assert_accuracy(self.iter_count >= 1, "iter_count out of range")
+        table = []
+        for addr in range(1 << self.table_addr_bits):
+            table.append(FixedPoint.with_frac_wid(self.table_exact_value(addr),
+                                                  self.table_data_bits,
+                                                  RoundDir.DOWN))
+
+        self.table = tuple(table)
+        """ the lookup-table.
+        type: tuple[FixedPoint, ...]
+        """
+
+        self.ops = tuple(self.__make_ops())
+        "the operations needed to perform the goldschmidt division algorithm."
+
+    @property
+    def expanded_width(self):
+        """the total number of bits of precision used inside the algorithm."""
+        return self.io_width + self.extra_precision
+
+    @property
+    def n_d_f_int_wid(self):
+        """the number of bits in the integer part of `state.n`, `state.d`, and
+        `state.f` during the main iteration loop.
+        """
+        return 2
+
+    @property
+    def n_d_f_total_wid(self):
+        """the total number of bits (both integer and fraction bits) in
+        `state.n`, `state.d`, and `state.f` during the main iteration loop.
+        """
+        return self.n_d_f_int_wid + self.expanded_width
+
+    @cache_on_self
+    def max_neps(self, i):
+        """maximum value of `neps[i]`.
+        `neps[i]` is defined to be `n[i] * N_prime[i - 1] * F_prime[i - 1]`.
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        return Fraction(1, 1 << self.expanded_width)
+
+    @cache_on_self
+    def max_deps(self, i):
+        """maximum value of `deps[i]`.
+        `deps[i]` is defined to be `d[i] * D_prime[i - 1] * F_prime[i - 1]`.
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        return Fraction(1, 1 << self.expanded_width)
+
+    @cache_on_self
+    def max_feps(self, i):
+        """maximum value of `feps[i]`.
+        `feps[i]` is defined to be `f[i] * (2 - D_prime[i - 1])`.
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        # zero, because the computation of `F_prime[i]` in
+        # `GoldschmidtDivOp.MulDByF.run(...)` is exact.
+        return Fraction(0)
+
+    @cached_property
+    def e0_range(self):
+        """minimum and maximum values of `e[0]`
+        (the relative error in `F_prime[-1]`)
+        """
+        min_e0 = Fraction(0)
+        max_e0 = Fraction(0)
+        for addr in range(self.table_addr_count):
+            # `F_prime[-1] = (1 - e[0]) / B`
+            # => `e[0] = 1 - B * F_prime[-1]`
+            min_b, max_b = self.table_input_exact_range(addr)
+            f_prime_m1 = self.table[addr].as_fraction()
+            assert min_b >= 0 and f_prime_m1 >= 0, \
+                "only positive quadrant of interval multiplication implemented"
+            min_product = min_b * f_prime_m1
+            max_product = max_b * f_prime_m1
+            # negation swaps min/max
+            cur_min_e0 = 1 - max_product
+            cur_max_e0 = 1 - min_product
+            min_e0 = min(min_e0, cur_min_e0)
+            max_e0 = max(max_e0, cur_max_e0)
+        min_e0 = self._shrink_min(min_e0)
+        max_e0 = self._shrink_max(max_e0)
+        return min_e0, max_e0
+
+    @cached_property
+    def min_e0(self):
+        """minimum value of `e[0]` (the relative error in `F_prime[-1]`)
+        """
+        min_e0, max_e0 = self.e0_range
+        return min_e0
+
+    @cached_property
+    def max_e0(self):
+        """maximum value of `e[0]` (the relative error in `F_prime[-1]`)
+        """
+        min_e0, max_e0 = self.e0_range
+        return max_e0
+
+    @cached_property
+    def max_abs_e0(self):
+        """maximum value of `abs(e[0])`."""
+        return max(abs(self.min_e0), abs(self.max_e0))
+
+    @cached_property
+    def min_abs_e0(self):
+        """minimum value of `abs(e[0])`."""
+        return Fraction(0)
+
+    @cache_on_self
+    def max_n(self, i):
+        """maximum value of `n[i]` (the relative error in `N_prime[i]`
+        relative to the previous iteration)
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        if i == 0:
+            # from Claim 10
+            # `n[0] = neps[0] / ((1 - e[0]) * (A / B))`
+            # `n[0] <= 2 * neps[0] / (1 - e[0])`
+
+            assert self.max_e0 < 1 and self.max_neps(0) >= 0, \
+                "only one quadrant of interval division implemented"
+            retval = 2 * self.max_neps(0) / (1 - self.max_e0)
+        elif i == 1:
+            # from Claim 10
+            # `n[1] <= neps[1] / ((1 - f[0]) * (1 - pi[0] - delta[0]))`
+            min_mpd = 1 - self.max_pi(0) - self.max_delta(0)
+            assert self.max_f(0) <= 1 and min_mpd >= 0, \
+                "only one quadrant of interval multiplication implemented"
+            prod = (1 - self.max_f(0)) * min_mpd
+            assert self.max_neps(1) >= 0 and prod > 0, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_neps(1) / prod
+        else:
+            # from Claim 6
+            # `0 <= n[i] <= 2 * max_neps[i] / (1 - pi[i - 1] - delta[i - 1])`
+            min_mpd = 1 - self.max_pi(i - 1) - self.max_delta(i - 1)
+            assert self.max_neps(i) >= 0 and min_mpd > 0, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_neps(i) / min_mpd
+
+        return self._shrink_max(retval)
+
+    @cache_on_self
+    def max_d(self, i):
+        """maximum value of `d[i]` (the relative error in `D_prime[i]`
+        relative to the previous iteration)
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        if i == 0:
+            # from Claim 10
+            # `d[0] = deps[0] / (1 - e[0])`
+
+            assert self.max_e0 < 1 and self.max_deps(0) >= 0, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_deps(0) / (1 - self.max_e0)
+        elif i == 1:
+            # from Claim 10
+            # `d[1] <= deps[1] / ((1 - f[0]) * (1 - delta[0] ** 2))`
+            assert self.max_f(0) <= 1 and self.max_delta(0) <= 1, \
+                "only one quadrant of interval multiplication implemented"
+            divisor = (1 - self.max_f(0)) * (1 - self.max_delta(0) ** 2)
+            assert self.max_deps(1) >= 0 and divisor > 0, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_deps(1) / divisor
+        else:
+            # from Claim 6
+            # `0 <= d[i] <= max_deps[i] / (1 - delta[i - 1])`
+            assert self.max_deps(i) >= 0 and self.max_delta(i - 1) < 1, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_deps(i) / (1 - self.max_delta(i - 1))
+
+        return self._shrink_max(retval)
+
+    @cache_on_self
+    def max_f(self, i):
+        """maximum value of `f[i]` (the relative error in `F_prime[i]`
+        relative to the previous iteration)
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        if i == 0:
+            # from Claim 10
+            # `f[0] = feps[0] / (1 - delta[0])`
+
+            assert self.max_delta(0) < 1 and self.max_feps(0) >= 0, \
+                "only one quadrant of interval division implemented"
+            retval = self.max_feps(0) / (1 - self.max_delta(0))
+        elif i == 1:
+            # from Claim 10
+            # `f[1] = feps[1]`
+            retval = self.max_feps(1)
+        else:
+            # from Claim 6
+            # `f[i] <= max_feps[i]`
+            retval = self.max_feps(i)
+
+        return self._shrink_max(retval)
+
+    @cache_on_self
+    def max_delta(self, i):
+        """ maximum value of `delta[i]`.
+        `delta[i]` is defined in Definition 4 of paper.
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        if i == 0:
+            # `delta[0] = abs(e[0]) + 3 * d[0] / 2`
+            retval = self.max_abs_e0 + Fraction(3, 2) * self.max_d(0)
+        else:
+            # `delta[i] = delta[i - 1] ** 2 + f[i - 1]`
+            prev_max_delta = self.max_delta(i - 1)
+            assert prev_max_delta >= 0
+            retval = prev_max_delta ** 2 + self.max_f(i - 1)
+
+        # `delta[i]` has to be smaller than one otherwise errors would go off
+        # to infinity
+        _assert_accuracy(retval < 1)
+
+        return self._shrink_max(retval)
+
+    @cache_on_self
+    def max_pi(self, i):
+        """ maximum value of `pi[i]`.
+        `pi[i]` is defined right below Theorem 5 of paper.
+        """
+        assert isinstance(i, int) and 0 <= i < self.iter_count
+        # `pi[i] = 1 - (1 - n[i]) * prod`
+        # where `prod` is the product of,
+        # for `j` in `0 <= j < i`, `(1 - n[j]) / (1 + d[j])`
+        min_prod = Fraction(1)
+        for j in range(i):
+            max_n_j = self.max_n(j)
+            max_d_j = self.max_d(j)
+            assert max_n_j <= 1 and max_d_j > -1, \
+                "only one quadrant of interval division implemented"
+            min_prod *= (1 - max_n_j) / (1 + max_d_j)
+        max_n_i = self.max_n(i)
+        assert max_n_i <= 1 and min_prod >= 0, \
+            "only one quadrant of interval multiplication implemented"
+        retval = 1 - (1 - max_n_i) * min_prod
+        return self._shrink_max(retval)
+
+    @cached_property
+    def max_n_shift(self):
+        """ maximum value of `state.n_shift`.
+        """
+        # numerator must be less than `denominator << self.io_width`, so
+        # `n_shift` is at most `self.io_width`
+        return self.io_width
+
+    @cached_property
+    def n_hat(self):
+        """ maximum value of, for all `i`, `max_n(i)` and `max_d(i)`
+        """
+        n_hat = Fraction(0)
+        for i in range(self.iter_count):
+            n_hat = max(n_hat, self.max_n(i), self.max_d(i))
+        return self._shrink_max(n_hat)
+
+    def __make_ops(self):
+        """ Goldschmidt division algorithm.
+
+            based on:
+            Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+            A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+            https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+            yields: GoldschmidtDivOp
+                the operations needed to perform the division.
+        """
+        # establish assumptions of the paper's error analysis (section 3.1):
+
+        # 1. normalize so A (numerator) and B (denominator) are in [1, 2)
+        yield GoldschmidtDivOp.Normalize
+
+        # 2. ensure all relative errors from directed rounding are <= 1 / 4.
+        # the assumption is met by multipliers with > 4-bits precision
+        _assert_accuracy(self.expanded_width > 4)
+
+        # 3. require `abs(e[0]) + 3 * d[0] / 2 + f[0] < 1 / 2`.
+        _assert_accuracy(self.max_abs_e0 + 3 * self.max_d(0) / 2
+                         + self.max_f(0) < Fraction(1, 2))
+
+        # 4. the initial approximation F'[-1] of 1/B is in [1/2, 1].
+        # (B is the denominator)
+
+        for addr in range(self.table_addr_count):
+            f_prime_m1 = self.table[addr]
+            _assert_accuracy(0.5 <= f_prime_m1 <= 1)
+
+        yield GoldschmidtDivOp.FEqTableLookup
+
+        # we use Setting I (section 4.1 of the paper):
+        # Require `n[i] <= n_hat` and `d[i] <= n_hat` and `f[i] = 0`:
+        # the conditions on n_hat are satisfied by construction.
+        for i in range(self.iter_count):
+            _assert_accuracy(self.max_f(i) == 0)
+            yield GoldschmidtDivOp.MulNByF
+            if i != self.iter_count - 1:
+                yield GoldschmidtDivOp.MulDByF
+                yield GoldschmidtDivOp.FEq2MinusD
+
+        # relative approximation error `p(N_prime[i])`:
+        # `p(N_prime[i]) = (A / B - N_prime[i]) / (A / B)`
+        # `0 <= p(N_prime[i])`
+        # `p(N_prime[i]) <= (2 * i) * n_hat \`
+        # ` + (abs(e[0]) + 3 * n_hat / 2) ** (2 ** i)`
+        i = self.iter_count - 1  # last used `i`
+        # compute power manually to prevent huge intermediate values
+        power = self._shrink_max(self.max_abs_e0 + 3 * self.n_hat / 2)
+        for _ in range(i):
+            power = self._shrink_max(power * power)
+
+        max_rel_error = (2 * i) * self.n_hat + power
+
+        min_a_over_b = Fraction(1, 2)
+        min_abs_error_for_correctness = min_a_over_b / (1 << self.max_n_shift)
+        min_rel_error_for_correctness = (min_abs_error_for_correctness
+                                         / min_a_over_b)
+
+        _assert_accuracy(
+            max_rel_error < min_rel_error_for_correctness,
+            f"not accurate enough: max_rel_error={max_rel_error}"
+            f" min_rel_error_for_correctness={min_rel_error_for_correctness}")
+
+        yield GoldschmidtDivOp.CalcResult
+
+    @cache_on_self
+    def default_cost_fn(self):
+        """ calculate the estimated cost on an arbitrary scale of implementing
+        goldschmidt division with the specified parameters. larger cost
+        values mean worse parameters.
+
+        This is the default cost function for `GoldschmidtDivParams.get`.
+
+        returns: float
+        """
+        rom_cells = self.table_data_bits << self.table_addr_bits
+        cost = float(rom_cells)
+        for op in self.ops:
+            if op == GoldschmidtDivOp.MulNByF \
+                    or op == GoldschmidtDivOp.MulDByF:
+                mul_cost = self.expanded_width ** 2
+                mul_cost *= self.expanded_width.bit_length()
+                cost += mul_cost
+        cost += 5e7 * self.iter_count
+        return cost
+
+    @staticmethod
+    @lru_cache(maxsize=1 << 16)
+    def __cached_new(base_params):
+        assert isinstance(base_params, GoldschmidtDivParamsBase)
+        kwargs = {}
+        for field in fields(GoldschmidtDivParamsBase):
+            kwargs[field] = getattr(base_params, field)
+        try:
+            return GoldschmidtDivParams(**kwargs), None
+        except ParamsNotAccurateEnough as e:
+            return None, e
+
+    @staticmethod
+    def __raise(e):  # type: (ParamsNotAccurateEnough) -> Any
+        raise e
+
+    @staticmethod
+    def cached_new(base_params, handle_error=__raise):
+        assert isinstance(base_params, GoldschmidtDivParamsBase)
+        params, error = GoldschmidtDivParams.__cached_new(base_params)
+        if error is None:
+            return params
+        else:
+            return handle_error(error)
+
+    @staticmethod
+    def get(io_width, cost_fn=default_cost_fn, max_table_addr_bits=12):
+        """ find efficient parameters for a goldschmidt division algorithm
+        with `params.io_width == io_width`.
+
+        arguments:
+        io_width: int
+            bit-width of the input divisor and the result.
+            the input numerator is `2 * io_width`-bits wide.
+        cost_fn: Callable[[GoldschmidtDivParams], float]
+            return the estimated cost on an arbitrary scale of implementing
+            goldschmidt division with the specified parameters. larger cost
+            values mean worse parameters.
+        max_table_addr_bits: int
+            maximum allowable value of `table_addr_bits`
+        """
+        assert isinstance(io_width, int) and io_width >= 1
+        assert callable(cost_fn)
+
+        last_error = None
+        last_error_params = None
+
+        def cached_new(base_params):
+            def handle_error(e):
+                nonlocal last_error, last_error_params
+                last_error = e
+                last_error_params = base_params
+                return None
+
+            retval = GoldschmidtDivParams.cached_new(base_params, handle_error)
+            if retval is None:
+                logging.debug(f"GoldschmidtDivParams.get: err: {base_params}")
+            else:
+                logging.debug(f"GoldschmidtDivParams.get: ok: {base_params}")
+            return retval
+
+        @lru_cache(maxsize=None)
+        def get_cost(base_params):
+            params = cached_new(base_params)
+            if params is None:
+                return math.inf
+            retval = cost_fn(params)
+            logging.debug(f"GoldschmidtDivParams.get: cost={retval}: {params}")
+            return retval
+
+        # start with parameters big enough to always work.
+        initial_extra_precision = io_width * 2 + 4
+        initial_params = GoldschmidtDivParamsBase(
+            io_width=io_width,
+            extra_precision=initial_extra_precision,
+            table_addr_bits=min(max_table_addr_bits, io_width),
+            table_data_bits=io_width + initial_extra_precision,
+            iter_count=1 + io_width.bit_length())
+
+        if cached_new(initial_params) is None:
+            raise ValueError(f"initial goldschmidt division algorithm "
+                             f"parameters are invalid: {initial_params}"
+                             ) from last_error
+
+        # find good initial `iter_count`
+        params = initial_params
+        for iter_count in range(1, initial_params.iter_count):
+            trial_params = replace(params, iter_count=iter_count)
+            if cached_new(trial_params) is not None:
+                params = trial_params
+                break
+
+        # now find `table_addr_bits`
+        cost = get_cost(params)
+        for table_addr_bits in range(1, max_table_addr_bits):
+            trial_params = replace(params, table_addr_bits=table_addr_bits)
+            trial_cost = get_cost(trial_params)
+            if trial_cost < cost:
+                params = trial_params
+                cost = trial_cost
+                break
+
+        # check one higher `iter_count` to see if it has lower cost
+        for table_addr_bits in range(1, max_table_addr_bits + 1):
+            trial_params = replace(params,
+                                   table_addr_bits=table_addr_bits,
+                                   iter_count=params.iter_count + 1)
+            trial_cost = get_cost(trial_params)
+            if trial_cost < cost:
+                params = trial_params
+                cost = trial_cost
+                break
+
+        # now shrink `table_data_bits`
+        while True:
+            trial_params = replace(params,
+                                   table_data_bits=params.table_data_bits - 1)
+            trial_cost = get_cost(trial_params)
+            if trial_cost < cost:
+                params = trial_params
+                cost = trial_cost
+            else:
+                break
+
+        # and shrink `extra_precision`
+        while True:
+            trial_params = replace(params,
+                                   extra_precision=params.extra_precision - 1)
+            trial_cost = get_cost(trial_params)
+            if trial_cost < cost:
+                params = trial_params
+                cost = trial_cost
+            else:
+                break
+
+        retval = cached_new(params)
+        assert isinstance(retval, GoldschmidtDivParams)
+        return retval
+
+
+def clz(v, wid):
+    """count leading zeros -- handy for debugging."""
+    assert isinstance(wid, int)
+    assert isinstance(v, int) and 0 <= v < (1 << wid)
+    return (1 << wid).bit_length() - v.bit_length()
+
+
+@enum.unique
+class GoldschmidtDivOp(enum.Enum):
+    Normalize = "n, d, n_shift = normalize(n, d)"
+    FEqTableLookup = "f = table_lookup(d)"
+    MulNByF = "n *= f"
+    MulDByF = "d *= f"
+    FEq2MinusD = "f = 2 - d"
+    CalcResult = "result = unnormalize_and_round(n)"
+
+    def run(self, params, state):
+        assert isinstance(params, GoldschmidtDivParams)
+        assert isinstance(state, GoldschmidtDivState)
+        expanded_width = params.expanded_width
+        table_addr_bits = params.table_addr_bits
+        if self == GoldschmidtDivOp.Normalize:
+            # normalize so 1 <= d < 2
+            # can easily be done with count-leading-zeros and left shift
+            while state.d < 1:
+                state.n = (state.n * 2).to_frac_wid(expanded_width)
+                state.d = (state.d * 2).to_frac_wid(expanded_width)
+
+            state.n_shift = 0
+            # normalize so 1 <= n < 2
+            while state.n >= 2:
+                state.n = (state.n * 0.5).to_frac_wid(expanded_width,
+                                                      round_dir=RoundDir.DOWN)
+                state.n_shift += 1
+        elif self == GoldschmidtDivOp.FEqTableLookup:
+            # compute initial f by table lookup
+            d_m_1 = state.d - 1
+            d_m_1 = d_m_1.to_frac_wid(table_addr_bits, RoundDir.DOWN)
+            assert 0 <= d_m_1.bits < (1 << params.table_addr_bits)
+            state.f = params.table[d_m_1.bits]
+            state.f = state.f.to_frac_wid(expanded_width,
+                                          round_dir=RoundDir.DOWN)
+        elif self == GoldschmidtDivOp.MulNByF:
+            assert state.f is not None
+            n = state.n * state.f
+            state.n = n.to_frac_wid(expanded_width, round_dir=RoundDir.DOWN)
+        elif self == GoldschmidtDivOp.MulDByF:
+            assert state.f is not None
+            d = state.d * state.f
+            state.d = d.to_frac_wid(expanded_width, round_dir=RoundDir.UP)
+        elif self == GoldschmidtDivOp.FEq2MinusD:
+            state.f = (2 - state.d).to_frac_wid(expanded_width)
+        elif self == GoldschmidtDivOp.CalcResult:
+            assert state.n_shift is not None
+            # scale to correct value
+            n = state.n * (1 << state.n_shift)
+
+            state.quotient = math.floor(n)
+            state.remainder = state.orig_n - state.quotient * state.orig_d
+            if state.remainder >= state.orig_d:
+                state.quotient += 1
+                state.remainder -= state.orig_d
+        else:
+            assert False, f"unimplemented GoldschmidtDivOp: {self}"
+
+    def gen_hdl(self, params, state, sync_rom):
+        """generate the hdl for this operation.
+
+        arguments:
+        params: GoldschmidtDivParams
+            the goldschmidt division parameters.
+        state: GoldschmidtDivHDLState
+            the input/output state
+        sync_rom: bool
+            true if the rom should be read synchronously rather than
+            combinatorially, incurring an extra clock cycle of latency.
+        """
+        assert isinstance(params, GoldschmidtDivParams)
+        assert isinstance(state, GoldschmidtDivHDLState)
+        m = state.m
+        if self == GoldschmidtDivOp.Normalize:
+            # normalize so 1 <= d < 2
+            assert state.d.width == params.io_width
+            assert state.n.width == 2 * params.io_width
+            d_leading_zeros = CLZ(params.io_width)
+            m.submodules.d_leading_zeros = d_leading_zeros
+            m.d.comb += d_leading_zeros.sig_in.eq(state.d)
+            d_shift_out = Signal.like(state.d)
+            m.d.comb += d_shift_out.eq(state.d << d_leading_zeros.lz)
+            d = Signal(params.n_d_f_total_wid)
+            m.d.comb += d.eq((d_shift_out << (1 + params.expanded_width))
+                             >> state.d.width)
+
+            # normalize so 1 <= n < 2
+            n_leading_zeros = CLZ(2 * params.io_width)
+            m.submodules.n_leading_zeros = n_leading_zeros
+            m.d.comb += n_leading_zeros.sig_in.eq(state.n)
+            signed_zero = Const(0, signed(1))  # force subtraction to be signed
+            n_shift_s_v = (params.io_width + signed_zero + d_leading_zeros.lz
+                           - n_leading_zeros.lz)
+            n_shift_s = Signal.like(n_shift_s_v)
+            n_shift_n_lz_out = Signal.like(state.n)
+            n_shift_d_lz_out = Signal.like(state.n << d_leading_zeros.lz)
+            m.d.comb += [
+                n_shift_s.eq(n_shift_s_v),
+                n_shift_d_lz_out.eq(state.n << d_leading_zeros.lz),
+                n_shift_n_lz_out.eq(state.n << n_leading_zeros.lz),
+            ]
+            state.n_shift = Signal(d_leading_zeros.lz.width)
+            n = Signal(params.n_d_f_total_wid)
+            with m.If(n_shift_s < 0):
+                m.d.comb += [
+                    state.n_shift.eq(0),
+                    n.eq((n_shift_d_lz_out << (1 + params.expanded_width))
+                         >> state.d.width),
+                ]
+            with m.Else():
+                m.d.comb += [
+                    state.n_shift.eq(n_shift_s),
+                    n.eq((n_shift_n_lz_out << (1 + params.expanded_width))
+                         >> state.n.width),
+                ]
+            state.n = n
+            state.d = d
+        elif self == GoldschmidtDivOp.FEqTableLookup:
+            assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+            # compute initial f by table lookup
+
+            # extra bit for table entries == 1.0
+            table_width = 1 + params.table_data_bits
+            table = Memory(width=table_width, depth=len(params.table),
+                           init=[i.bits for i in params.table])
+            addr = state.d[:-params.n_d_f_int_wid][-params.table_addr_bits:]
+            if sync_rom:
+                table_read = table.read_port()
+                m.d.comb += table_read.addr.eq(addr)
+                state.insert_pipeline_register()
+            else:
+                table_read = table.read_port(domain="comb")
+                m.d.comb += table_read.addr.eq(addr)
+            m.submodules.table_read = table_read
+            state.f = Signal(params.n_d_f_int_wid + params.expanded_width)
+            data_shift = params.expanded_width - params.table_data_bits
+            m.d.comb += state.f.eq(table_read.data << data_shift)
+        elif self == GoldschmidtDivOp.MulNByF:
+            assert state.n.width == params.n_d_f_total_wid, "invalid n width"
+            assert state.f is not None
+            assert state.f.width == params.n_d_f_total_wid, "invalid f width"
+            n = Signal.like(state.n)
+            m.d.comb += n.eq((state.n * state.f) >> params.expanded_width)
+            state.n = n
+        elif self == GoldschmidtDivOp.MulDByF:
+            assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+            assert state.f is not None
+            assert state.f.width == params.n_d_f_total_wid, "invalid f width"
+            d = Signal.like(state.d)
+            d_times_f = Signal.like(state.d * state.f)
+            m.d.comb += [
+                d_times_f.eq(state.d * state.f),
+                # round the multiplication up
+                d.eq((d_times_f >> params.expanded_width)
+                     + (d_times_f[:params.expanded_width] != 0)),
+            ]
+            state.d = d
+        elif self == GoldschmidtDivOp.FEq2MinusD:
+            assert state.d.width == params.n_d_f_total_wid, "invalid d width"
+            f = Signal.like(state.d)
+            m.d.comb += f.eq((2 << params.expanded_width) - state.d)
+            state.f = f
+        elif self == GoldschmidtDivOp.CalcResult:
+            assert state.n.width == params.n_d_f_total_wid, "invalid n width"
+            assert state.n_shift is not None
+            # scale to correct value
+            n = state.n * (1 << state.n_shift)
+            q_approx = Signal(params.io_width)
+            # extra bit for if it's bigger than orig_d
+            r_approx = Signal(params.io_width + 1)
+            adjusted_r = Signal(signed(1 + params.io_width))
+            m.d.comb += [
+                q_approx.eq((state.n << state.n_shift)
+                            >> params.expanded_width),
+                r_approx.eq(state.orig_n - q_approx * state.orig_d),
+                adjusted_r.eq(r_approx - state.orig_d),
+            ]
+            state.quotient = Signal(params.io_width)
+            state.remainder = Signal(params.io_width)
+
+            with m.If(adjusted_r >= 0):
+                m.d.comb += [
+                    state.quotient.eq(q_approx + 1),
+                    state.remainder.eq(adjusted_r),
+                ]
+            with m.Else():
+                m.d.comb += [
+                    state.quotient.eq(q_approx),
+                    state.remainder.eq(r_approx),
+                ]
+        else:
+            assert False, f"unimplemented GoldschmidtDivOp: {self}"
+
+
+@plain_data(repr=False)
+class GoldschmidtDivState:
+    __slots__ = ("orig_n", "orig_d", "n", "d",
+                 "f", "quotient", "remainder", "n_shift")
+
+    def __init__(self, orig_n, orig_d, n, d,
+                 f=None, quotient=None, remainder=None, n_shift=None):
+        assert isinstance(orig_n, int)
+        assert isinstance(orig_d, int)
+        assert isinstance(n, FixedPoint)
+        assert isinstance(d, FixedPoint)
+        assert f is None or isinstance(f, FixedPoint)
+        assert quotient is None or isinstance(quotient, int)
+        assert remainder is None or isinstance(remainder, int)
+        assert n_shift is None or isinstance(n_shift, int)
+        self.orig_n = orig_n
+        """original numerator"""
+
+        self.orig_d = orig_d
+        """original denominator"""
+
+        self.n = n
+        """numerator -- N_prime[i] in the paper's algorithm 2"""
+
+        self.d = d
+        """denominator -- D_prime[i] in the paper's algorithm 2"""
+
+        self.f = f
+        """current factor -- F_prime[i] in the paper's algorithm 2"""
+
+        self.quotient = quotient
+        """final quotient"""
+
+        self.remainder = remainder
+        """final remainder"""
+
+        self.n_shift = n_shift
+        """amount the numerator needs to be left-shifted at the end of the
+        algorithm.
+        """
+
+    def __repr__(self):
+        fields_str = []
+        for field in fields(GoldschmidtDivState):
+            value = getattr(self, field)
+            if value is None:
+                continue
+            if isinstance(value, int) and field != "n_shift":
+                fields_str.append(f"{field}={hex(value)}")
+            else:
+                fields_str.append(f"{field}={value!r}")
+        return f"GoldschmidtDivState({', '.join(fields_str)})"
+
+
+def goldschmidt_div(n, d, params, trace=lambda state: None):
+    """ Goldschmidt division algorithm.
+
+        based on:
+        Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+        A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+        https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+        arguments:
+        n: int
+            numerator. a `2*width`-bit unsigned integer.
+            must be less than `d << width`, otherwise the quotient wouldn't
+            fit in `width` bits.
+        d: int
+            denominator. a `width`-bit unsigned integer. must not be zero.
+        width: int
+            the bit-width of the inputs/outputs. must be a positive integer.
+        trace: Function[[GoldschmidtDivState], None]
+            called with the initial state and the state after executing each
+            operation in `params.ops`.
+
+        returns: tuple[int, int]
+            the quotient and remainder. a tuple of two `width`-bit unsigned
+            integers.
+    """
+    assert isinstance(params, GoldschmidtDivParams)
+    assert isinstance(d, int) and 0 < d < (1 << params.io_width)
+    assert isinstance(n, int) and 0 <= n < (d << params.io_width)
+
+    # this whole algorithm is done with fixed-point arithmetic where values
+    # have `width` fractional bits
+
+    state = GoldschmidtDivState(
+        orig_n=n,
+        orig_d=d,
+        n=FixedPoint(n, params.io_width),
+        d=FixedPoint(d, params.io_width),
+    )
+
+    trace(state)
+    for op in params.ops:
+        op.run(params, state)
+        trace(state)
+
+    assert state.quotient is not None
+    assert state.remainder is not None
+
+    return state.quotient, state.remainder
+
+
+@plain_data(eq=False)
+class GoldschmidtDivHDLState:
+    __slots__ = ("m", "orig_n", "orig_d", "n", "d",
+                 "f", "quotient", "remainder", "n_shift")
+
+    __signal_name_prefix = "state_"
+
+    def __init__(self, m, orig_n, orig_d, n, d,
+                 f=None, quotient=None, remainder=None, n_shift=None):
+        assert isinstance(m, Module)
+        assert isinstance(orig_n, Signal)
+        assert isinstance(orig_d, Signal)
+        assert isinstance(n, Signal)
+        assert isinstance(d, Signal)
+        assert f is None or isinstance(f, Signal)
+        assert quotient is None or isinstance(quotient, Signal)
+        assert remainder is None or isinstance(remainder, Signal)
+        assert n_shift is None or isinstance(n_shift, Signal)
+
+        self.m = m
+        """The HDL Module"""
+
+        self.orig_n = orig_n
+        """original numerator"""
+
+        self.orig_d = orig_d
+        """original denominator"""
+
+        self.n = n
+        """numerator -- N_prime[i] in the paper's algorithm 2"""
+
+        self.d = d
+        """denominator -- D_prime[i] in the paper's algorithm 2"""
+
+        self.f = f
+        """current factor -- F_prime[i] in the paper's algorithm 2"""
+
+        self.quotient = quotient
+        """final quotient"""
+
+        self.remainder = remainder
+        """final remainder"""
+
+        self.n_shift = n_shift
+        """amount the numerator needs to be left-shifted at the end of the
+        algorithm.
+        """
+
+        # old_signals must be set last
+        self.old_signals = defaultdict(list)
+
+    def __setattr__(self, name, value):
+        assert isinstance(name, str)
+        if name.startswith("_"):
+            return super().__setattr__(name, value)
+        try:
+            old_signals = self.old_signals[name]
+        except AttributeError:
+            # haven't yet finished __post_init__
+            return super().__setattr__(name, value)
+        assert name != "m" and name != "old_signals", f"can't write to {name}"
+        assert isinstance(value, Signal)
+        value.name = f"{self.__signal_name_prefix}{name}_{len(old_signals)}"
+        old_signal = getattr(self, name, None)
+        if old_signal is not None:
+            assert isinstance(old_signal, Signal)
+            old_signals.append(old_signal)
+        return super().__setattr__(name, value)
+
+    def insert_pipeline_register(self):
+        old_prefix = self.__signal_name_prefix
+        try:
+            for field in fields(GoldschmidtDivHDLState):
+                if field.startswith("_") or field == "m":
+                    continue
+                old_sig = getattr(self, field, None)
+                if old_sig is None:
+                    continue
+                assert isinstance(old_sig, Signal)
+                new_sig = Signal.like(old_sig)
+                setattr(self, field, new_sig)
+                self.m.d.sync += new_sig.eq(old_sig)
+        finally:
+            self.__signal_name_prefix = old_prefix
+
+
+class GoldschmidtDivHDL(Elaboratable):
+    """ Goldschmidt division algorithm.
+
+        based on:
+        Even, G., Seidel, P. M., & Ferguson, W. E. (2003).
+        A Parametric Error Analysis of Goldschmidt's Division Algorithm.
+        https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.1238&rep=rep1&type=pdf
+
+        attributes:
+        params: GoldschmidtDivParams
+            the goldschmidt division algorithm parameters.
+        pipe_reg_indexes: list[int]
+            the operation indexes where pipeline registers should be inserted.
+            duplicate values mean multiple registers should be inserted for
+            that operation index -- this is useful to allow yosys to spread a
+            multiplication across those multiple pipeline stages.
+        sync_rom: bool
+            true if the rom should be read synchronously rather than
+            combinatorially, incurring an extra clock cycle of latency.
+        n: Signal(unsigned(2 * params.io_width))
+            input numerator. a `2 * params.io_width`-bit unsigned integer.
+            must be less than `d << params.io_width`, otherwise the quotient
+            wouldn't fit in `params.io_width` bits.
+        d: Signal(unsigned(params.io_width))
+            input denominator. a `params.io_width`-bit unsigned integer.
+            must not be zero.
+        q: Signal(unsigned(params.io_width))
+            output quotient. only valid when `n < (d << params.io_width)`.
+        r: Signal(unsigned(params.io_width))
+            output remainder. only valid when `n < (d << params.io_width)`.
+        trace: list[GoldschmidtDivHDLState]
+            list of the initial state and the state after executing each
+            operation in `params.ops`.
+    """
+
+    @property
+    def total_pipeline_registers(self):
+        """the total number of pipeline registers"""
+        return len(self.pipe_reg_indexes) + self.sync_rom
+
+    def __init__(self, params, pipe_reg_indexes=(), sync_rom=False):
+        assert isinstance(params, GoldschmidtDivParams)
+        assert isinstance(sync_rom, bool)
+        self.params = params
+        self.pipe_reg_indexes = sorted(int(i) for i in pipe_reg_indexes)
+        self.sync_rom = sync_rom
+        self.n = Signal(unsigned(2 * params.io_width))
+        self.d = Signal(unsigned(params.io_width))
+        self.q = Signal(unsigned(params.io_width))
+        self.r = Signal(unsigned(params.io_width))
+
+        # in constructor so we get trace without needing to call elaborate
+        state = GoldschmidtDivHDLState(
+            m=Module(),
+            orig_n=self.n,
+            orig_d=self.d,
+            n=self.n,
+            d=self.d)
+
+        self.trace = [replace(state)]
+
+        # copy and reverse
+        pipe_reg_indexes = list(reversed(self.pipe_reg_indexes))
+
+        for op_index, op in enumerate(self.params.ops):
+            while len(pipe_reg_indexes) > 0 \
+                    and pipe_reg_indexes[-1] <= op_index:
+                pipe_reg_indexes.pop()
+                state.insert_pipeline_register()
+            op.gen_hdl(self.params, state, self.sync_rom)
+            self.trace.append(replace(state))
+
+        while len(pipe_reg_indexes) > 0:
+            pipe_reg_indexes.pop()
+            state.insert_pipeline_register()
+
+        state.m.d.comb += [
+            self.q.eq(state.quotient),
+            self.r.eq(state.remainder),
+        ]
+
+    def elaborate(self, platform):
+        return self.trace[0].m
+
+
+GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID = 2
+
+
+@lru_cache()
+def goldschmidt_sqrt_rsqrt_table(table_addr_bits, table_data_bits):
+    """Generate the look-up table needed for Goldschmidt's square-root and
+    reciprocal-square-root algorithm.
+
+    arguments:
+    table_addr_bits: int
+        the number of address bits for the look-up table.
+    table_data_bits: int
+        the number of data bits for the look-up table.
+    """
+    assert isinstance(table_addr_bits, int) and \
+        table_addr_bits >= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+    assert isinstance(table_data_bits, int) and table_data_bits >= 1
+    table = []
+    table_len = 1 << table_addr_bits
+    for addr in range(table_len):
+        if addr == 0:
+            value = FixedPoint(0, table_data_bits)
+        elif (addr << 2) < table_len:
+            value = None  # table entries should be unused
+        else:
+            table_addr_frac_wid = table_addr_bits
+            table_addr_frac_wid -= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+            max_input_value = FixedPoint(addr + 1, table_addr_bits - 2)
+            max_frac_wid = max(max_input_value.frac_wid, table_data_bits)
+            value = max_input_value.to_frac_wid(max_frac_wid)
+            value = value.rsqrt(RoundDir.DOWN)
+            value = value.to_frac_wid(table_data_bits, RoundDir.DOWN)
+        table.append(value)
+
+    # tuple for immutability
+    return tuple(table)
+
+# FIXME: add code to calculate error bounds and check that the algorithm will
+# actually work (like in the goldschmidt division algorithm).
+# FIXME: add code to calculate a good set of parameters based on the error
+# bounds checking.
+
+
+def goldschmidt_sqrt_rsqrt(radicand, io_width, frac_wid, extra_precision,
+                           table_addr_bits, table_data_bits, iter_count):
+    """Goldschmidt's square-root and reciprocal-square-root algorithm.
+
+    uses algorithm based on second method at:
+    https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Goldschmidt%E2%80%99s_algorithm
+
+    arguments:
+    radicand: FixedPoint(frac_wid=frac_wid)
+        the input value to take the square-root and reciprocal-square-root of.
+    io_width: int
+        the number of bits in the input (`radicand`) and output values.
+    frac_wid: int
+        the number of fraction bits in the input (`radicand`) and output
+        values.
+    extra_precision: int
+        the number of bits of internal extra precision.
+    table_addr_bits: int
+        the number of address bits for the look-up table.
+    table_data_bits: int
+        the number of data bits for the look-up table.
+
+    returns: tuple[FixedPoint, FixedPoint]
+        the square-root and reciprocal-square-root, rounded down to the
+        nearest representable value. If `radicand == 0`, then the
+        reciprocal-square-root value returned is zero.
+    """
+    assert (isinstance(radicand, FixedPoint)
+            and radicand.frac_wid == frac_wid
+            and 0 <= radicand.bits < (1 << io_width))
+    assert isinstance(io_width, int) and io_width >= 1
+    assert isinstance(frac_wid, int) and 0 <= frac_wid < io_width
+    assert isinstance(extra_precision, int) and extra_precision >= io_width
+    assert isinstance(table_addr_bits, int) and table_addr_bits >= 1
+    assert isinstance(table_data_bits, int) and table_data_bits >= 1
+    assert isinstance(iter_count, int) and iter_count >= 0
+    expanded_frac_wid = frac_wid + extra_precision
+    s = radicand.to_frac_wid(expanded_frac_wid)
+    sqrt_rshift = extra_precision
+    rsqrt_rshift = extra_precision
+    while s != 0 and s < 1:
+        s = (s * 4).to_frac_wid(expanded_frac_wid)
+        sqrt_rshift += 1
+        rsqrt_rshift -= 1
+    while s >= 4:
+        s = s.div(4, expanded_frac_wid)
+        sqrt_rshift -= 1
+        rsqrt_rshift += 1
+    table = goldschmidt_sqrt_rsqrt_table(table_addr_bits=table_addr_bits,
+                                         table_data_bits=table_data_bits)
+    # core goldschmidt sqrt/rsqrt algorithm:
+    # initial setup:
+    table_addr_frac_wid = table_addr_bits
+    table_addr_frac_wid -= GOLDSCHMIDT_SQRT_RSQRT_TABLE_ADDR_INT_WID
+    addr = s.to_frac_wid(table_addr_frac_wid, RoundDir.DOWN)
+    assert 0 <= addr.bits < (1 << table_addr_bits), "table addr out of range"
+    f = table[addr.bits]
+    assert f is not None, "accessed invalid table entry"
+    # use with_frac_wid to fix IDE type deduction
+    f = FixedPoint.with_frac_wid(f, expanded_frac_wid, RoundDir.DOWN)
+    x = (s * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+    h = (f * 0.5).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+    for _ in range(iter_count):
+        # iteration step:
+        f = (1.5 - x * h).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+        x = (x * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+        h = (h * f).to_frac_wid(expanded_frac_wid, RoundDir.DOWN)
+    r = 2 * h
+    # now `x` is approximately `sqrt(s)` and `r` is approximately `rsqrt(s)`
+
+    sqrt = FixedPoint(x.bits >> sqrt_rshift, frac_wid)
+    rsqrt = FixedPoint(r.bits >> rsqrt_rshift, frac_wid)
+
+    next_sqrt = FixedPoint(sqrt.bits + 1, frac_wid)
+    if next_sqrt * next_sqrt <= radicand:
+        sqrt = next_sqrt
+
+    next_rsqrt = FixedPoint(rsqrt.bits + 1, frac_wid)
+    if next_rsqrt * next_rsqrt * radicand <= 1 and radicand != 0:
+        rsqrt = next_rsqrt
+    return sqrt, rsqrt
diff --git a/src/soc/fu/div/experiment/test/__init__.py b/src/soc/fu/div/experiment/test/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/soc/fu/div/experiment/test/test_goldschmidt_div_sqrt.py b/src/soc/fu/div/experiment/test/test_goldschmidt_div_sqrt.py
new file mode 100644 (file)
index 0000000..28e795f
--- /dev/null
@@ -0,0 +1,480 @@
+# SPDX-License-Identifier: LGPL-3-or-later
+# Copyright 2022 Jacob Lifshay programmerjake@gmail.com
+
+# Funded by NLnet Assure Programme 2021-02-052, https://nlnet.nl/assure part
+# of Horizon 2020 EU Programme 957073.
+
+from nmutil.plain_data import fields, replace
+import math
+import unittest
+from nmutil.formaltest import FHDLTestCase
+from nmutil.sim_util import do_sim, hash_256
+from nmigen.sim import Tick, Delay
+from nmigen.hdl.ast import Signal
+from nmigen.hdl.dsl import Module
+from soc.fu.div.experiment.goldschmidt_div_sqrt import (
+    GoldschmidtDivHDL, GoldschmidtDivHDLState, GoldschmidtDivParams,
+    GoldschmidtDivState, ParamsNotAccurateEnough, goldschmidt_div,
+    FixedPoint, RoundDir, goldschmidt_sqrt_rsqrt)
+
+
+class TestFixedPoint(FHDLTestCase):
+    def test_str_roundtrip(self):
+        for frac_wid in range(8):
+            for bits in range(-1 << 9, 1 << 9):
+                with self.subTest(bits=hex(bits), frac_wid=frac_wid):
+                    value = FixedPoint(bits, frac_wid)
+                    round_trip_value = FixedPoint.cast(str(value))
+                    self.assertEqual(value, round_trip_value)
+
+    @staticmethod
+    def trap(f):
+        try:
+            return f(), None
+        except (ValueError, ZeroDivisionError) as e:
+            return None, e.__class__.__name__
+
+    def test_sqrt(self):
+        for frac_wid in range(8):
+            for bits in range(1 << 9):
+                for round_dir in RoundDir:
+                    radicand = FixedPoint(bits, frac_wid)
+                    expected_f = math.sqrt(float(radicand))
+                    expected = self.trap(lambda: FixedPoint.with_frac_wid(
+                        expected_f, frac_wid, round_dir))
+                    with self.subTest(radicand=repr(radicand),
+                                      round_dir=str(round_dir),
+                                      expected=repr(expected)):
+                        result = self.trap(lambda: radicand.sqrt(round_dir))
+                        self.assertEqual(result, expected)
+
+    def test_rsqrt(self):
+        for frac_wid in range(8):
+            for bits in range(1, 1 << 9):
+                for round_dir in RoundDir:
+                    radicand = FixedPoint(bits, frac_wid)
+                    expected_f = 1 / math.sqrt(float(radicand))
+                    expected = self.trap(lambda: FixedPoint.with_frac_wid(
+                        expected_f, frac_wid, round_dir))
+                    with self.subTest(radicand=repr(radicand),
+                                      round_dir=str(round_dir),
+                                      expected=repr(expected)):
+                        result = self.trap(lambda: radicand.rsqrt(round_dir))
+                        self.assertEqual(result, expected)
+
+
+class TestGoldschmidtDiv(FHDLTestCase):
+    def test_case1(self):
+        with self.assertRaises(ParamsNotAccurateEnough):
+            GoldschmidtDivParams(io_width=3, extra_precision=2,
+                                 table_addr_bits=3, table_data_bits=5,
+                                 iter_count=2)
+
+    def test_case2(self):
+        with self.assertRaises(ParamsNotAccurateEnough):
+            GoldschmidtDivParams(io_width=4, extra_precision=1,
+                                 table_addr_bits=1, table_data_bits=5,
+                                 iter_count=1)
+
+    @staticmethod
+    def cases(io_width, cases=None):
+        assert isinstance(io_width, int) and io_width >= 1
+        if cases is not None:
+            for n, d in cases:
+                assert isinstance(d, int) \
+                    and 0 < d < (1 << io_width), "invalid case"
+                assert isinstance(n, int) \
+                    and 0 <= n < (d << io_width), "invalid case"
+                yield (n, d)
+        elif io_width > 6:
+            assert io_width * 2 <= 256, \
+                "can't generate big enough numbers for test cases"
+            for i in range(10000):
+                d = hash_256(f'd {i}') % (1 << io_width)
+                if d == 0:
+                    d = 1
+                n = hash_256(f'n {i}') % (d << io_width)
+                yield (n, d)
+        else:
+            for d in range(1, 1 << io_width):
+                for n in range(d << io_width):
+                    yield (n, d)
+
+    def tst(self, io_width, cases=None):
+        assert isinstance(io_width, int)
+        params = GoldschmidtDivParams.get(io_width)
+        with self.subTest(params=str(params)):
+            for n, d in self.cases(io_width, cases):
+                expected_q, expected_r = divmod(n, d)
+                with self.subTest(n=hex(n), d=hex(d),
+                                  expected_q=hex(expected_q),
+                                  expected_r=hex(expected_r)):
+                    trace = []
+
+                    def trace_fn(state):
+                        assert isinstance(state, GoldschmidtDivState)
+                        trace.append((replace(state)))
+                    q, r = goldschmidt_div(n, d, params, trace=trace_fn)
+                    with self.subTest(q=hex(q), r=hex(r), trace=repr(trace)):
+                        self.assertEqual((q, r), (expected_q, expected_r))
+
+    def tst_sim(self, io_width, cases=None, pipe_reg_indexes=(),
+                sync_rom=False):
+        assert isinstance(io_width, int)
+        params = GoldschmidtDivParams.get(io_width)
+        m = Module()
+        dut = GoldschmidtDivHDL(params, pipe_reg_indexes=pipe_reg_indexes,
+                                sync_rom=sync_rom)
+        m.submodules.dut = dut
+        # make sync domain get added
+        m.d.sync += Signal().eq(0)
+
+        def inputs_proc():
+            yield Tick()
+            for n, d in self.cases(io_width, cases):
+                yield dut.n.eq(n)
+                yield dut.d.eq(d)
+                yield Tick()
+
+        def check_interals(n, d):
+            # check internals only if dut is completely combinatorial
+            # so we don't have to figure out how to read values in
+            # previous clock cycles
+            if dut.total_pipeline_registers != 0:
+                return
+            ref_trace = []
+
+            def ref_trace_fn(state):
+                assert isinstance(state, GoldschmidtDivState)
+                ref_trace.append((replace(state)))
+            goldschmidt_div(n=n, d=d, params=params, trace=ref_trace_fn)
+            self.assertEqual(len(dut.trace), len(ref_trace))
+            for index, state in enumerate(dut.trace):
+                ref_state = ref_trace[index]
+                last_op = None if index == 0 else params.ops[index - 1]
+                with self.subTest(index=index, state=repr(state),
+                                  ref_state=repr(ref_state),
+                                  last_op=str(last_op)):
+                    for field in fields(GoldschmidtDivHDLState):
+                        sig = getattr(state, field)
+                        if not isinstance(sig, Signal):
+                            continue
+                        ref_value = getattr(ref_state, field)
+                        ref_value_str = repr(ref_value)
+                        if isinstance(ref_value, int):
+                            ref_value_str = hex(ref_value)
+                        value = yield sig
+                        with self.subTest(field_name=field,
+                                          sig=repr(sig),
+                                          sig_shape=repr(sig.shape()),
+                                          value=hex(value),
+                                          ref_value=ref_value_str):
+                            if isinstance(ref_value, int):
+                                self.assertEqual(value, ref_value)
+                            else:
+                                assert isinstance(ref_value, FixedPoint)
+                                self.assertEqual(value, ref_value.bits)
+
+        def check_outputs():
+            yield Tick()
+            for _ in range(dut.total_pipeline_registers):
+                yield Tick()
+            for n, d in self.cases(io_width, cases):
+                yield Delay(0.1e-6)
+                expected_q, expected_r = divmod(n, d)
+                with self.subTest(n=hex(n), d=hex(d),
+                                  expected_q=hex(expected_q),
+                                  expected_r=hex(expected_r)):
+                    q = yield dut.q
+                    r = yield dut.r
+                    with self.subTest(q=hex(q), r=hex(r)):
+                        self.assertEqual((q, r), (expected_q, expected_r))
+                    yield from check_interals(n, d)
+
+                yield Tick()
+
+        with self.subTest(params=str(params)):
+            with do_sim(self, m, (dut.n, dut.d, dut.q, dut.r)) as sim:
+                sim.add_clock(1e-6)
+                sim.add_process(inputs_proc)
+                sim.add_process(check_outputs)
+                sim.run()
+
+    def test_1_through_4(self):
+        for io_width in range(1, 4 + 1):
+            with self.subTest(io_width=io_width):
+                self.tst(io_width)
+
+    def test_5(self):
+        self.tst(5)
+
+    def test_6(self):
+        self.tst(6)
+
+    def test_8(self):
+        self.tst(8)
+
+    def test_16(self):
+        self.tst(16)
+
+    def test_32(self):
+        self.tst(32)
+
+    def test_64(self):
+        self.tst(64)
+
+    def test_sim_5(self):
+        self.tst_sim(5)
+
+    def test_sim_8(self):
+        self.tst_sim(8)
+
+    def test_sim_16(self):
+        self.tst_sim(16)
+
+    def test_sim_32(self):
+        self.tst_sim(32)
+
+    def test_sim_64(self):
+        self.tst_sim(64)
+
+    def tst_params(self, io_width):
+        assert isinstance(io_width, int)
+        params = GoldschmidtDivParams.get(io_width)
+        print()
+        print(params)
+
+    def test_params_1(self):
+        self.tst_params(1)
+
+    def test_params_2(self):
+        self.tst_params(2)
+
+    def test_params_3(self):
+        self.tst_params(3)
+
+    def test_params_4(self):
+        self.tst_params(4)
+
+    def test_params_5(self):
+        self.tst_params(5)
+
+    def test_params_6(self):
+        self.tst_params(6)
+
+    def test_params_7(self):
+        self.tst_params(7)
+
+    def test_params_8(self):
+        self.tst_params(8)
+
+    def test_params_9(self):
+        self.tst_params(9)
+
+    def test_params_10(self):
+        self.tst_params(10)
+
+    def test_params_11(self):
+        self.tst_params(11)
+
+    def test_params_12(self):
+        self.tst_params(12)
+
+    def test_params_13(self):
+        self.tst_params(13)
+
+    def test_params_14(self):
+        self.tst_params(14)
+
+    def test_params_15(self):
+        self.tst_params(15)
+
+    def test_params_16(self):
+        self.tst_params(16)
+
+    def test_params_17(self):
+        self.tst_params(17)
+
+    def test_params_18(self):
+        self.tst_params(18)
+
+    def test_params_19(self):
+        self.tst_params(19)
+
+    def test_params_20(self):
+        self.tst_params(20)
+
+    def test_params_21(self):
+        self.tst_params(21)
+
+    def test_params_22(self):
+        self.tst_params(22)
+
+    def test_params_23(self):
+        self.tst_params(23)
+
+    def test_params_24(self):
+        self.tst_params(24)
+
+    def test_params_25(self):
+        self.tst_params(25)
+
+    def test_params_26(self):
+        self.tst_params(26)
+
+    def test_params_27(self):
+        self.tst_params(27)
+
+    def test_params_28(self):
+        self.tst_params(28)
+
+    def test_params_29(self):
+        self.tst_params(29)
+
+    def test_params_30(self):
+        self.tst_params(30)
+
+    def test_params_31(self):
+        self.tst_params(31)
+
+    def test_params_32(self):
+        self.tst_params(32)
+
+    def test_params_33(self):
+        self.tst_params(33)
+
+    def test_params_34(self):
+        self.tst_params(34)
+
+    def test_params_35(self):
+        self.tst_params(35)
+
+    def test_params_36(self):
+        self.tst_params(36)
+
+    def test_params_37(self):
+        self.tst_params(37)
+
+    def test_params_38(self):
+        self.tst_params(38)
+
+    def test_params_39(self):
+        self.tst_params(39)
+
+    def test_params_40(self):
+        self.tst_params(40)
+
+    def test_params_41(self):
+        self.tst_params(41)
+
+    def test_params_42(self):
+        self.tst_params(42)
+
+    def test_params_43(self):
+        self.tst_params(43)
+
+    def test_params_44(self):
+        self.tst_params(44)
+
+    def test_params_45(self):
+        self.tst_params(45)
+
+    def test_params_46(self):
+        self.tst_params(46)
+
+    def test_params_47(self):
+        self.tst_params(47)
+
+    def test_params_48(self):
+        self.tst_params(48)
+
+    def test_params_49(self):
+        self.tst_params(49)
+
+    def test_params_50(self):
+        self.tst_params(50)
+
+    def test_params_51(self):
+        self.tst_params(51)
+
+    def test_params_52(self):
+        self.tst_params(52)
+
+    def test_params_53(self):
+        self.tst_params(53)
+
+    def test_params_54(self):
+        self.tst_params(54)
+
+    def test_params_55(self):
+        self.tst_params(55)
+
+    def test_params_56(self):
+        self.tst_params(56)
+
+    def test_params_57(self):
+        self.tst_params(57)
+
+    def test_params_58(self):
+        self.tst_params(58)
+
+    def test_params_59(self):
+        self.tst_params(59)
+
+    def test_params_60(self):
+        self.tst_params(60)
+
+    def test_params_61(self):
+        self.tst_params(61)
+
+    def test_params_62(self):
+        self.tst_params(62)
+
+    def test_params_63(self):
+        self.tst_params(63)
+
+    def test_params_64(self):
+        self.tst_params(64)
+
+
+class TestGoldschmidtSqrtRSqrt(FHDLTestCase):
+    def tst(self, io_width, frac_wid, extra_precision,
+            table_addr_bits, table_data_bits, iter_count):
+        assert isinstance(io_width, int)
+        assert isinstance(frac_wid, int)
+        assert isinstance(extra_precision, int)
+        assert isinstance(table_addr_bits, int)
+        assert isinstance(table_data_bits, int)
+        assert isinstance(iter_count, int)
+        with self.subTest(io_width=io_width, frac_wid=frac_wid,
+                          extra_precision=extra_precision,
+                          table_addr_bits=table_addr_bits,
+                          table_data_bits=table_data_bits,
+                          iter_count=iter_count):
+            for bits in range(1 << io_width):
+                radicand = FixedPoint(bits, frac_wid)
+                expected_sqrt = radicand.sqrt(RoundDir.DOWN)
+                expected_rsqrt = FixedPoint(0, frac_wid)
+                if radicand > 0:
+                    expected_rsqrt = radicand.rsqrt(RoundDir.DOWN)
+                with self.subTest(radicand=repr(radicand),
+                                  expected_sqrt=repr(expected_sqrt),
+                                  expected_rsqrt=repr(expected_rsqrt)):
+                    sqrt, rsqrt = goldschmidt_sqrt_rsqrt(
+                        radicand=radicand, io_width=io_width,
+                        frac_wid=frac_wid,
+                        extra_precision=extra_precision,
+                        table_addr_bits=table_addr_bits,
+                        table_data_bits=table_data_bits,
+                        iter_count=iter_count)
+                    with self.subTest(sqrt=repr(sqrt), rsqrt=repr(rsqrt)):
+                        self.assertEqual((sqrt, rsqrt),
+                                         (expected_sqrt, expected_rsqrt))
+
+    def test1(self):
+        self.tst(io_width=16, frac_wid=8, extra_precision=20,
+                 table_addr_bits=4, table_data_bits=28, iter_count=4)
+
+
+if __name__ == "__main__":
+    unittest.main()
index 2a78b19ad79258419c719e1dcba47789b529e674..b90da07f63177b7dcc94e74ababbf129d524531d 100644 (file)
@@ -1,7 +1,6 @@
 import enum
 from nmigen import Elaboratable, Module, Signal, Shape, unsigned, Cat, Mux
 from soc.fu.div.pipe_data import CoreInputData, CoreOutputData, DivPipeSpec
-from nmutil.iocontrol import PrevControl, NextControl
 from nmutil.singlepipe import ControlBase
 from ieee754.div_rem_sqrt_rsqrt.core import DivPipeCoreOperation
 
@@ -132,25 +131,37 @@ class DivState:
 
 class FSMDivCoreStage(ControlBase):
     def __init__(self, pspec):
-        super().__init__()
-        self.pspec = pspec
-        self.p.data_i = CoreInputData(pspec)
-        self.n.data_o = CoreOutputData(pspec)
-        self.saved_input_data = CoreInputData(pspec)
+        self.pspec = pspec # store now: used in ispec and ospec
+        super().__init__(stage=self)
+        self.saved_input_data = self.ispec()
         self.empty = Signal(reset=1)
         self.saved_state = DivState(64, name="saved_state")
         self.div_state_next = DivStateNext(64)
         self.div_state_init = DivStateInit(64)
         self.divisor = Signal(unsigned(64))
 
+    def ispec(self):
+        return CoreInputData(self.pspec)
+
+    def ospec(self):
+        return CoreOutputData(self.pspec)
+
+    # an extremely rare (and catastrophic) coredump in the binary executable
+    # known as "python 3.7" requires the addition of this function.
+    # no, that's not a "crash which most n00bs call an exception", being
+    # thrown: that's an *actual* coredump created by /usr/bin/python3.7 which
+    # actually segfaults if this function is not added.  no idea why.
+    def setup(self, m, i):
+        pass
+
     def elaborate(self, platform):
         m = super().elaborate(platform)
         m.submodules.div_state_next = self.div_state_next
         m.submodules.div_state_init = self.div_state_init
-        data_i = self.p.data_i
-        data_o = self.n.data_o
-        core_i = data_i.core
-        core_o = data_o.core
+        i_data = self.p.i_data
+        o_data = self.n.o_data
+        core_i = i_data.core
+        core_o = o_data.core
 
         core_saved_i = self.saved_input_data.core
 
@@ -158,7 +169,7 @@ class FSMDivCoreStage(ControlBase):
 
         m.d.comb += self.div_state_init.dividend.eq(core_i.dividend)
 
-        m.d.comb += data_o.eq_without_core(self.saved_input_data)
+        m.d.comb += o_data.eq_without_core(self.saved_input_data)
         m.d.comb += core_o.quotient_root.eq(self.div_state_next.o.quotient)
         # fract width of `DivPipeCoreOutputData.remainder`
         remainder_fract_width = 64 * 3
@@ -167,22 +178,22 @@ class FSMDivCoreStage(ControlBase):
         rem_start = remainder_fract_width - dividend_fract_width
         m.d.comb += core_o.remainder.eq(self.div_state_next.o.remainder
                                         << rem_start)
-        m.d.comb += self.n.valid_o.eq(
+        m.d.comb += self.n.o_valid.eq(
             ~self.empty & self.saved_state.will_be_done_after(1))
-        m.d.comb += self.p.ready_o.eq(self.empty)
+        m.d.comb += self.p.o_ready.eq(self.empty)
         m.d.sync += self.saved_state.eq(self.div_state_next.o)
 
         with m.If(self.empty):
             m.d.comb += self.div_state_next.i.eq(self.div_state_init.o)
             m.d.comb += self.div_state_next.divisor.eq(core_i.divisor_radicand)
-            with m.If(self.p.valid_i):
+            with m.If(self.p.i_valid):
                 m.d.sync += self.empty.eq(0)
-                m.d.sync += self.saved_input_data.eq(data_i)
+                m.d.sync += self.saved_input_data.eq(i_data)
         with m.Else():
             m.d.comb += [
                 self.div_state_next.i.eq(self.saved_state),
                 self.div_state_next.divisor.eq(core_saved_i.divisor_radicand)]
-            with m.If(self.n.ready_i & self.n.valid_o):
+            with m.If(self.n.i_ready & self.n.o_valid):
                 m.d.sync += self.empty.eq(1)
 
         return m
index 903770ddd0b50b0dc23c647654adcab265f6126e..31218c66ef0af904704b7cfbbabd397d76290e33 100644 (file)
@@ -8,7 +8,7 @@ from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array, signed)
 from nmutil.pipemodbase import PipeModBase
 from soc.fu.logical.pipe_data import LogicalInputData
 from soc.fu.div.pipe_data import DivMulOutputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
 from openpower.decoder.power_enums import MicrOp
 
 from openpower.decoder.power_fields import DecodeFields
index 4c70fdf177d35e8d18144cfec25751a82563b43d..1c807dc05492f5b654cf70f1443e55cef99774bb 100644 (file)
@@ -10,28 +10,33 @@ from ieee754.div_rem_sqrt_rsqrt.core import (
 
 
 class DivInputData(FUBaseData):
-    regspec = [('INT', 'ra', '0:63'),  # RA
-               ('INT', 'rb', '0:63'),  # RB/immediate
-               ('XER', 'xer_so', '32'), ]  # XER bit 32: SO
-
     def __init__(self, pspec):
         super().__init__(pspec, False)
         # convenience
         self.a, self.b = self.ra, self.rb
 
+    @property
+    def regspec(self):
+        return [('INT', 'ra', self.intrange),  # RA
+               ('INT', 'rb', self.intrange),  # RB/immediate
+               ('XER', 'xer_so', '32'), ]  # XER bit 32: SO
+
 
 # output stage shared between div and mul: like ALUOutputData but no CA/32
 class DivMulOutputData(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),
-               ('CR', 'cr_a', '0:3'),
-               ('XER', 'xer_ov', '33,44'),  # bit0: ov, bit1: ov32
-               ('XER', 'xer_so', '32')]
 
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ('XER', 'xer_ov', '33,44'),  # bit0: ov, bit1: ov32
+               ('XER', 'xer_so', '32')]
+
 
 class DivPipeKindConfigBase:
     def __init__(self,
@@ -129,28 +134,34 @@ class DivPipeKind(enum.Enum):
 
 
 class DivPipeSpec(CommonPipeSpec):
-    def __init__(self, id_wid, div_pipe_kind):
-        super().__init__(id_wid=id_wid)
+    def __init__(self, id_wid, parent_pspec, div_pipe_kind):
+        super().__init__(id_wid=id_wid, parent_pspec=parent_pspec)
         self.div_pipe_kind = div_pipe_kind
         self.core_config = div_pipe_kind.config.core_config
 
-    regspec = (DivInputData.regspec, DivMulOutputData.regspec)
+    regspecklses = (DivInputData, DivMulOutputData)
     opsubsetkls = CompLogicalOpSubset
 
 
 class DivPipeSpecDivPipeCore(DivPipeSpec):
-    def __init__(self, id_wid):
-        super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.DivPipeCore)
+    def __init__(self, id_wid, parent_pspec):
+        super().__init__(id_wid=id_wid,
+                         parent_pspec=parent_pspec,
+                         div_pipe_kind=DivPipeKind.DivPipeCore)
 
 
 class DivPipeSpecFSMDivCore(DivPipeSpec):
-    def __init__(self, id_wid):
-        super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.FSMDivCore)
+    def __init__(self, id_wid, parent_pspec):
+        super().__init__(id_wid=id_wid,
+                         parent_pspec=parent_pspec,
+                         div_pipe_kind=DivPipeKind.FSMDivCore)
 
 
 class DivPipeSpecSimOnly(DivPipeSpec):
-    def __init__(self, id_wid):
-        super().__init__(id_wid=id_wid, div_pipe_kind=DivPipeKind.SimOnly)
+    def __init__(self, id_wid, parent_pspec):
+        super().__init__(id_wid=id_wid,
+                         parent_pspec=parent_pspec,
+                         div_pipe_kind=DivPipeKind.SimOnly)
 
 
 class CoreBaseData(DivInputData):
index 56308942c1c02fb8ccca9d65acbf3563f8692caa..71c5c01fb03fb8dc08adda2522cc5bc551db486f 100644 (file)
@@ -12,13 +12,18 @@ from soc.fu.div.pipe_data import DivPipeKindConfigCombPipe
 class DivStagesStart(PipeModBaseChain):
     def get_chain(self):
         alu_input = DivMulInputStage(self.pspec)
+        return [alu_input]
+
+
+class DivStagesSetup(PipeModBaseChain):
+    def get_chain(self):
         div_setup = DivSetupStage(self.pspec)
         if isinstance(self.pspec.div_pipe_kind.config,
                       DivPipeKindConfigCombPipe):
             core_setup = [DivCoreSetupStage(self.pspec)]
         else:
             core_setup = ()
-        return [alu_input, div_setup, *core_setup]
+        return [div_setup, *core_setup]
 
 
 class DivStagesMiddle(PipeModBaseChain):
@@ -45,9 +50,14 @@ class DivStagesEnd(PipeModBaseChain):
         else:
             core_final = ()
         div_out = DivOutputStage(self.pspec)
-        alu_out = DivMulOutputStage(self.pspec)
         self.div_out = div_out  # debugging - bug #425
-        return [*core_final, div_out, alu_out]
+        return [*core_final, div_out]
+
+
+class DivStagesFinalise(PipeModBaseChain):
+    def get_chain(self):
+        alu_out = DivMulOutputStage(self.pspec)
+        return [alu_out]
 
 
 class DivBasePipe(ControlBase):
@@ -55,6 +65,7 @@ class DivBasePipe(ControlBase):
         ControlBase.__init__(self)
         self.pspec = pspec
         self.pipe_start = DivStagesStart(pspec)
+        self.pipe_setup = DivStagesSetup(pspec)
         self.pipe_middles = []
         if isinstance(self.pspec.div_pipe_kind.config,
                       DivPipeKindConfigCombPipe):
@@ -66,16 +77,21 @@ class DivBasePipe(ControlBase):
             self.pipe_middles.append(
                 self.pspec.div_pipe_kind.config.core_stage_class(pspec))
         self.pipe_end = DivStagesEnd(pspec)
+        self.pipe_final = DivStagesFinalise(pspec)
         self._eqs = self.connect([self.pipe_start,
+                                  self.pipe_setup,
                                   *self.pipe_middles,
-                                  self.pipe_end])
+                                  self.pipe_end,
+                                  self.pipe_final])
 
     def elaborate(self, platform):
         m = ControlBase.elaborate(self, platform)
         m.submodules.pipe_start = self.pipe_start
+        m.submodules.pipe_setup = self.pipe_setup
         for i in range(len(self.pipe_middles)):
             name = f"pipe_middle_{i}"
             setattr(m.submodules, name, self.pipe_middles[i])
         m.submodules.pipe_end = self.pipe_end
+        m.submodules.pipe_final = self.pipe_final
         m.d.comb += self._eqs
         return m
index 937bcbb029a8ce00231522dd17353637b7d36bc0..5fe049786ae9074e30e39a48fe4939b0e7382ba3 100644 (file)
@@ -4,7 +4,7 @@
 from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
 from nmutil.pipemodbase import PipeModBase
 from soc.fu.div.pipe_data import DivInputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
 from openpower.decoder.power_enums import MicrOp
 
 from openpower.decoder.power_fields import DecodeFields
@@ -27,6 +27,7 @@ class DivSetupStage(PipeModBase):
         return CoreInputData(self.pspec)
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
         # convenience variables
@@ -42,14 +43,15 @@ class DivSetupStage(PipeModBase):
 
         # work out if a/b are negative (check 32-bit / signed)
         comb += dividend_neg_o.eq(Mux(op.is_32bit,
-                                      a[31], a[63]) & op.is_signed)
-        comb += divisor_neg_o.eq(Mux(op.is_32bit, b[31], b[63]) & op.is_signed)
+                                      a[31], a[XLEN-1]) & op.is_signed)
+        comb += divisor_neg_o.eq(Mux(op.is_32bit,
+                                      b[31], b[XLEN-1]) & op.is_signed)
 
         # negation of a 64-bit value produces the same lower 32-bit
         # result as negation of just the lower 32-bits, so we don't
         # need to do anything special before negating
-        abs_dor = Signal(64, reset_less=True)  # absolute of divisor
-        abs_dend = Signal(64, reset_less=True)  # absolute of dividend
+        abs_dor = Signal(XLEN, reset_less=True)  # absolute of divisor
+        abs_dend = Signal(XLEN, reset_less=True)  # absolute of dividend
         comb += abs_dor.eq(Mux(divisor_neg_o, -b, b))
         comb += abs_dend.eq(Mux(dividend_neg_o, -a, a))
 
@@ -78,7 +80,7 @@ class DivSetupStage(PipeModBase):
                 with m.If(op.is_32bit):
                     comb += dividend_o.eq(abs_dend[0:32] << 32)
                 with m.Else():
-                    comb += dividend_o.eq(abs_dend[0:64] << 64)
+                    comb += dividend_o.eq(abs_dend[0:XLEN] << XLEN)
 
         ###### sticky overflow and context, both pass-through #####
 
index 15f9d8269d5c0d3a3feadcfbcb47e817664fdad7..3a854975b1aa2b4999fcdd2ee6c3789a96c38847 100644 (file)
@@ -41,7 +41,7 @@ def get_cu_inputs(dec2, sim):
 def set_alu_inputs(alu, dec2, sim):
     # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
     # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
-    # and place it into data_i.b
+    # and place it into i_data.b
 
     inp = yield from get_cu_inputs(dec2, sim)
     yield from ALUHelpers.set_int_ra(alu, dec2, inp)
@@ -95,9 +95,9 @@ class DivTestHelper(unittest.TestCase):
             # note that it is critically important to do this
             # for DIV otherwise it starts trying to produce
             # multiple results.
-            yield alu.p.valid_i.eq(1)
+            yield alu.p.i_valid.eq(1)
             yield
-            yield alu.p.valid_i.eq(0)
+            yield alu.p.i_valid.eq(0)
 
             opname = code.split(' ')[0]
             fnname = opname.replace(".", "_")
@@ -109,7 +109,7 @@ class DivTestHelper(unittest.TestCase):
             yield from isa_sim.call(opname)
             index = isa_sim.pc.CIA.value//4
 
-            vld = yield alu.n.valid_o
+            vld = yield alu.n.o_valid
             while not vld:
                 yield
                 yield Delay(0.1e-6)
@@ -119,7 +119,7 @@ class DivTestHelper(unittest.TestCase):
                     print(f"time: {sim._engine.now * 1e6}us")
                 except AttributeError:
                     pass
-                vld = yield alu.n.valid_o
+                vld = yield alu.n.o_valid
                 # bug #425 investigation
                 do = alu.pipe_end.div_out
                 ctx_op = do.i.ctx.op
@@ -163,11 +163,15 @@ class DivTestHelper(unittest.TestCase):
 
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
 
-        pspec = DivPipeSpec(id_wid=2, div_pipe_kind=div_pipe_kind)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = DivPipeSpec(
+            id_wid=2, div_pipe_kind=div_pipe_kind, parent_pspec=pps)
         m.submodules.alu = alu = DivBasePipe(pspec)
 
-        comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
-        comb += alu.n.ready_i.eq(1)
+        comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+        comb += alu.n.i_ready.eq(1)
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
 
@@ -232,8 +236,8 @@ class DivTestHelper(unittest.TestCase):
         print("oe, oe_ok", oe, oe_ok)
         if not oe or not oe_ok:
             # if OE not enabled, XER SO and OV must not be activated
-            so_ok = yield alu.n.data_o.xer_so.ok
-            ov_ok = yield alu.n.data_o.xer_ov.ok
+            so_ok = yield alu.n.o_data.xer_so.ok
+            ov_ok = yield alu.n.o_data.xer_ov.ok
             print("so, ov", so_ok, ov_ok)
             self.assertEqual(ov_ok, False, code)
             self.assertEqual(so_ok, False, code)
index a5b343910827a6f1ddebc43492b68b4fca4dd899..215b3a65d7e54b48e21c66bf33e36fb15b3f246a 100644 (file)
@@ -6,7 +6,11 @@ from soc.fu.div.pipeline import DivBasePipe
 
 class TestPipeIlang(unittest.TestCase):
     def write_ilang(self, div_pipe_kind):
-        pspec = DivPipeSpec(id_wid=2, div_pipe_kind=div_pipe_kind)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = DivPipeSpec(
+            id_wid=2, div_pipe_kind=div_pipe_kind, parent_pspec=pps)
         alu = DivBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open(f"div_pipeline_{div_pipe_kind.name}.il", "w") as f:
index 8ba8f0255c8e7b62d6607360bd6a6cd32123773e..928ab9922f11f88a2a1b2120c5e31cacb94127bf 100644 (file)
@@ -25,6 +25,7 @@ class CompLDSTOpSubset(CompOpSubsetBase):
                   ('is_signed', 1),
                   ('data_len', 4),
                   ('byte_reverse', 1),
+                  ('reserve', 1),     # atomic update
                   ('sign_extend', 1),
                   ('ldst_mode', LDSTMode),
                   ('insn', 32),
index 7cff898743703d902cee7ddeaf0028a7a514cc23..6e868b1eebdd7eeffbc6f1812c29e508a0a0ad9c 100644 (file)
@@ -19,12 +19,13 @@ Links:
 
 from nmigen import (Elaboratable, Module, Signal, Shape, unsigned, Cat, Mux,
                     Record, Memory,
-                    Const)
+                    Const, C)
 from nmutil.iocontrol import RecordObject
-from nmutil.util import rising_edge
+from nmutil.util import rising_edge, Display
 from enum import Enum, unique
 
 from soc.experiment.dcache import DCache
+from soc.experiment.icache import ICache
 from soc.experiment.pimem import PortInterfaceBase
 from soc.experiment.mem_types import LoadStore1ToMMUType
 from soc.experiment.mem_types import MMUToLoadStore1Type
@@ -39,7 +40,14 @@ class State(Enum):
     IDLE = 0       # ready for instruction
     ACK_WAIT = 1   # waiting for ack from dcache
     MMU_LOOKUP = 2 # waiting for MMU to look up translation
-    TLBIE_WAIT = 3 # waiting for MMU to finish doing a tlbie
+    #SECOND_REQ = 3 # second request for unaligned transfer
+
+@unique
+class Misalign(Enum):
+    ONEWORD = 0    # only one word needed, all good
+    NEED2WORDS = 1 # need to send/receive two words
+    WAITFIRST = 2  # waiting for the first word
+    WAITSECOND = 3 # waiting for the second word
 
 
 # captures the LDSTRequest from the PortInterface, which "blips" most
@@ -50,13 +58,20 @@ class LDSTRequest(RecordObject):
 
         self.load          = Signal()
         self.dcbz          = Signal()
-        self.addr          = Signal(64)
+        self.raddr          = Signal(64)
         # self.store_data    = Signal(64) # this is already sync (on a delay)
-        self.byte_sel      = Signal(8)
+        self.byte_sel      = Signal(16)
         self.nc            = Signal()              # non-cacheable access
         self.virt_mode     = Signal()
         self.priv_mode     = Signal()
+        self.mode_32bit    = Signal() # XXX UNUSED AT PRESENT
+        self.alignstate    = Signal(Misalign) # progress of alignment request
         self.align_intr    = Signal()
+        # atomic (LR/SC reservation)
+        self.reserve       = Signal()
+        self.atomic        = Signal()
+        self.atomic_last   = Signal()
+
 
 # glue logic for microwatt mmu and dcache
 class LoadStore1(PortInterfaceBase):
@@ -68,77 +83,112 @@ class LoadStore1(PortInterfaceBase):
         addrwid = pspec.addr_wid
 
         super().__init__(regwid, addrwid)
-        self.dcache = DCache()
+        self.dcache = DCache(pspec)
+        self.icache = ICache(pspec)
         # these names are from the perspective of here (LoadStore1)
         self.d_out  = self.dcache.d_in     # in to dcache is out for LoadStore
         self.d_in = self.dcache.d_out      # out from dcache is in for LoadStore
-        self.m_out  = LoadStore1ToMMUType() # out *to* MMU
-        self.m_in = MMUToLoadStore1Type()   # in *from* MMU
+        self.i_out  = self.icache.i_in     # in to icache is out for LoadStore
+        self.i_in = self.icache.i_out      # out from icache is in for LoadStore
+        self.m_out  = LoadStore1ToMMUType("m_out") # out *to* MMU
+        self.m_in = MMUToLoadStore1Type("m_in")   # in *from* MMU
         self.req = LDSTRequest(name="ldst_req")
 
         # TODO, convert dcache wb_in/wb_out to "standard" nmigen Wishbone bus
         self.dbus = Record(make_wb_layout(pspec))
+        self.ibus = Record(make_wb_layout(pspec))
 
         # for creating a single clock blip to DCache
         self.d_valid = Signal()
         self.d_w_valid = Signal()
         self.d_validblip = Signal()
 
-        # DSISR and DAR cached values.  note that the MMU FSM is where
-        # these are accessed by OP_MTSPR/OP_MFSPR, on behalf of LoadStore1.
-        # by contrast microwatt has the spr set/get done *in* loadstore1.vhdl
-        self.dsisr = Signal(64)
-        self.dar = Signal(64)
-
         # state info for LD/ST
         self.done          = Signal()
+        self.done_delay    = Signal()
         # latch most of the input request
         self.load          = Signal()
         self.tlbie         = Signal()
         self.dcbz          = Signal()
-        self.addr          = Signal(64)
-        self.store_data    = Signal(64)
-        self.load_data     = Signal(64)
-        self.byte_sel      = Signal(8)
+        self.raddr          = Signal(64)
+        self.maddr          = Signal(64)
+        self.store_data    = Signal(64)   # first half (aligned)
+        self.store_data2   = Signal(64)   # second half (misaligned)
+        self.load_data     = Signal(128)   # 128 to cope with misalignment
+        self.load_data_delay = Signal(128) # perform 2 LD/STs
+        self.byte_sel      = Signal(16)    # also for misaligned, 16-bit
+        self.alignstate    = Signal(Misalign) # progress of alignment request
+        self.next_addr      = Signal(64)      # 2nd (aligned) read/write addr
         #self.xerc         : xer_common_t;
-        #self.reserve       = Signal()
-        #self.atomic        = Signal()
-        #self.atomic_last   = Signal()
         #self.rc            = Signal()
         self.nc            = Signal()              # non-cacheable access
-        self.virt_mode     = Signal()
-        self.priv_mode     = Signal()
-        self.state        = Signal(State)
-        self.instr_fault   = Signal()
+        self.mode_32bit    = Signal() # XXX UNUSED AT PRESENT
+        self.state         = Signal(State)
+        self.instr_fault   = Signal()  # indicator to request i-cache MMU lookup
+        self.r_instr_fault  = Signal() # accessed in external_busy
+        self.priv_mode     = Signal() # only for instruction fetch (not LDST)
         self.align_intr    = Signal()
         self.busy          = Signal()
         self.wait_dcache   = Signal()
         self.wait_mmu      = Signal()
-        #self.mode_32bit    = Signal()
+        self.lrsc_misalign = Signal()
         #self.intr_vec     : integer range 0 to 16#fff#;
         #self.nia           = Signal(64)
         #self.srr1          = Signal(16)
-
-    def set_wr_addr(self, m, addr, mask, misalign, msr_pr):
+        # use these to set the dsisr or dar respectively
+        self.mmu_set_spr    = Signal()
+        self.mmu_set_dsisr  = Signal()
+        self.mmu_set_dar    = Signal()
+        self.sprval_in      = Signal(64)
+
+        # ONLY access these read-only, do NOT attempt to change
+        self.dsisr          = Signal(32)
+        self.dar            = Signal(64)
+
+    # when external_busy set, do not allow PortInterface to proceed
+    def external_busy(self, m):
+        return self.instr_fault | self.r_instr_fault
+
+    def set_wr_addr(self, m, addr, mask, misalign, msr, is_dcbz, is_nc):
+        m.d.comb += self.req.nc.eq(is_nc)
         m.d.comb += self.req.load.eq(0) # store operation
         m.d.comb += self.req.byte_sel.eq(mask)
-        m.d.comb += self.req.addr.eq(addr)
-        m.d.comb += self.req.priv_mode.eq(~msr_pr) # not-problem  ==> priv
-        m.d.comb += self.req.virt_mode.eq(msr_pr) # problem-state ==> virt
-        m.d.comb += self.req.align_intr.eq(misalign)
+        m.d.comb += self.req.raddr.eq(addr)
+        m.d.comb += self.req.priv_mode.eq(~msr.pr) # not-problem  ==> priv
+        m.d.comb += self.req.virt_mode.eq(msr.dr) # DR ==> virt
+        m.d.comb += self.req.mode_32bit.eq(~msr.sf) # not-sixty-four ==> 32bit
+        m.d.comb += self.req.dcbz.eq(is_dcbz)
+        with m.If(misalign):
+            m.d.comb += self.req.alignstate.eq(Misalign.NEED2WORDS)
+            m.d.sync += self.next_addr.eq(Cat(C(0, 3), addr[3:]+1))
+
+        # m.d.comb += Display("set_wr_addr %i dcbz %i",addr,is_dcbz)
+
         # option to disable the cache entirely for write
         if self.disable_cache:
             m.d.comb += self.req.nc.eq(1)
+
+        # dcbz cannot do no-cache
+        with m.If(is_dcbz & self.req.nc):
+            m.d.comb += self.req.align_intr.eq(1)
+
+        # hmm, rather than add yet another argument to set_wr_addr
+        # read direct from PortInterface
+        m.d.comb += self.req.reserve.eq(self.pi.reserve) # atomic request
+        m.d.comb += self.req.atomic.eq(~self.lrsc_misalign)
+        m.d.comb += self.req.atomic_last.eq(~self.lrsc_misalign)
+
         return None
 
-    def set_rd_addr(self, m, addr, mask, misalign, msr_pr):
+    def set_rd_addr(self, m, addr, mask, misalign, msr, is_nc):
         m.d.comb += self.d_valid.eq(1)
         m.d.comb += self.req.load.eq(1) # load operation
         m.d.comb += self.req.byte_sel.eq(mask)
-        m.d.comb += self.req.align_intr.eq(misalign)
-        m.d.comb += self.req.addr.eq(addr)
-        m.d.comb += self.req.priv_mode.eq(~msr_pr) # not-problem  ==> priv
-        m.d.comb += self.req.virt_mode.eq(msr_pr) # problem-state ==> virt
+        m.d.comb += self.req.raddr.eq(addr)
+        m.d.comb += self.req.priv_mode.eq(~msr.pr) # not-problem  ==> priv
+        m.d.comb += self.req.virt_mode.eq(msr.dr) # DR ==> virt
+        m.d.comb += self.req.mode_32bit.eq(~msr.sf) # not-sixty-four ==> 32bit
+        m.d.comb += self.req.nc.eq(is_nc)
         # BAD HACK! disable cacheing on LD when address is 0xCxxx_xxxx
         # this is for peripherals. same thing done in Microwatt loadstore1.vhdl
         with m.If(addr[28:] == Const(0xc, 4)):
@@ -146,6 +196,17 @@ class LoadStore1(PortInterfaceBase):
         # option to disable the cache entirely for read
         if self.disable_cache:
             m.d.comb += self.req.nc.eq(1)
+        with m.If(misalign):
+            # need two reads: prepare next address in advance
+            m.d.comb += self.req.alignstate.eq(Misalign.NEED2WORDS)
+            m.d.sync += self.next_addr.eq(Cat(C(0, 3), addr[3:]+1))
+
+        # hmm, rather than add yet another argument to set_rd_addr
+        # read direct from PortInterface
+        m.d.comb += self.req.reserve.eq(self.pi.reserve) # atomic request
+        m.d.comb += self.req.atomic.eq(~self.lrsc_misalign)
+        m.d.comb += self.req.atomic_last.eq(~self.lrsc_misalign)
+
         return None #FIXME return value
 
     def set_wr_data(self, m, data, wen):
@@ -154,51 +215,82 @@ class LoadStore1(PortInterfaceBase):
         # put data into comb which is picked up in main elaborate()
         m.d.comb += self.d_w_valid.eq(1)
         m.d.comb += self.store_data.eq(data)
-        #m.d.sync += self.d_out.byte_sel.eq(wen) # this might not be needed
+        m.d.sync += self.store_data2.eq(data[64:128])
         st_ok = self.done # TODO indicates write data is valid
+        m.d.comb += self.pi.store_done.data.eq(self.d_in.store_done)
+        m.d.comb += self.pi.store_done.ok.eq(1)
         return st_ok
 
     def get_rd_data(self, m):
-        ld_ok = self.done     # indicates read data is valid
-        data = self.load_data # actual read data
+        ld_ok = self.done_delay # indicates read data is valid
+        data = self.load_data_delay   # actual read data
         return data, ld_ok
 
     def elaborate(self, platform):
         m = super().elaborate(platform)
         comb, sync = m.d.comb, m.d.sync
 
-        # create dcache module
+        # microwatt takes one more cycle before next operation can be issued
+        sync += self.done_delay.eq(self.done)
+        #sync += self.load_data_delay[0:64].eq(self.load_data[0:64])
+
+        # create dcache and icache module
         m.submodules.dcache = dcache = self.dcache
+        m.submodules.icache = icache = self.icache
 
         # temp vars
         d_out, d_in, dbus = self.d_out, self.d_in, self.dbus
+        i_out, i_in, ibus = self.i_out, self.i_in, self.ibus
         m_out, m_in = self.m_out, self.m_in
         exc = self.pi.exc_o
         exception = exc.happened
         mmureq = Signal()
 
-        # copy of address, but gets over-ridden for OP_FETCH_FAILED
+        # copy of address, but gets over-ridden for instr_fault
         maddr = Signal(64)
-        m.d.comb += maddr.eq(self.addr)
+        m.d.comb += maddr.eq(self.raddr)
+
+        # check for LR/SC misalignment, used in set_rd/wr_addr above
+        comb += self.lrsc_misalign.eq(((self.pi.data_len[0:3]-1) &
+                                        self.req.raddr[0:3]).bool())
+        with m.If(self.lrsc_misalign & self.req.reserve):
+            m.d.comb += self.req.align_intr.eq(1)
 
         # create a blip (single pulse) on valid read/write request
         # this can be over-ridden in the FSM to get dcache to re-run
         # a request when MMU_LOOKUP completes.
         m.d.comb += self.d_validblip.eq(rising_edge(m, self.d_valid))
         ldst_r = LDSTRequest("ldst_r")
+        sync += Display("MMUTEST: LoadStore1 d_in.error=%i",d_in.error)
 
         # fsm skeleton
         with m.Switch(self.state):
             with m.Case(State.IDLE):
-                with m.If(self.d_validblip & ~exc.happened):
+                sync += self.load_data_delay.eq(0) # clear out
+                with m.If((self.d_validblip | self.instr_fault) &
+                          ~exc.happened):
                     comb += self.busy.eq(1)
                     sync += self.state.eq(State.ACK_WAIT)
                     sync += ldst_r.eq(self.req) # copy of LDSTRequest on "blip"
+                    # sync += Display("validblip self.req.virt_mode=%i",
+                    #                 self.req.virt_mode)
+                    with m.If(self.instr_fault):
+                        comb += mmureq.eq(1)
+                        sync += self.r_instr_fault.eq(1)
+                        comb += maddr.eq(self.maddr)
+                        sync += self.state.eq(State.MMU_LOOKUP)
+                    with m.Else():
+                        sync += self.r_instr_fault.eq(0)
+                    # if the LD/ST requires two dwords, move to waiting
+                    # for first word
+                    with m.If(self.req.alignstate == Misalign.NEED2WORDS):
+                        sync += ldst_r.alignstate.eq(Misalign.WAITFIRST)
                 with m.Else():
                     sync += ldst_r.eq(0)
 
             # waiting for completion
             with m.Case(State.ACK_WAIT):
+                sync += Display("MMUTEST: ACK_WAIT")
                 comb += self.busy.eq(~exc.happened)
 
                 with m.If(d_in.error):
@@ -208,10 +300,12 @@ class LoadStore1(PortInterfaceBase):
                         comb += exception.eq(1)
                         sync += self.state.eq(State.IDLE)
                         sync += ldst_r.eq(0)
-                        sync += self.dsisr[63 - 38].eq(~self.load)
+                        sync += Display("cache error -> update dsisr")
+                        sync += self.dsisr[63 - 38].eq(~ldst_r.load)
                         # XXX there is no architected bit for this
                         # (probably should be a machine check in fact)
                         sync += self.dsisr[63 - 35].eq(d_in.cache_paradox)
+                        sync += self.r_instr_fault.eq(0)
 
                     with m.Else():
                         # Look up the translation for TLB miss
@@ -220,76 +314,137 @@ class LoadStore1(PortInterfaceBase):
                         comb += mmureq.eq(1)
                         sync += self.state.eq(State.MMU_LOOKUP)
                 with m.If(d_in.valid):
-                    m.d.comb += self.done.eq(~mmureq) # done if not doing MMU
                     with m.If(self.done):
-                        sync += Display("ACK_WAIT, done %x", self.addr)
-                    sync += self.state.eq(State.IDLE)
-                    sync += ldst_r.eq(0)
-                    with m.If(self.load):
-                        m.d.comb += self.load_data.eq(d_in.data)
+                        sync += Display("ACK_WAIT, done %x", self.raddr)
+                    with m.If(ldst_r.alignstate == Misalign.ONEWORD):
+                        # done if there is only one dcache operation
+                        sync += self.state.eq(State.IDLE)
+                        sync += ldst_r.eq(0)
+                        with m.If(ldst_r.load):
+                            m.d.comb += self.load_data.eq(d_in.data)
+                            sync += self.load_data_delay[0:64].eq(d_in.data)
+                        m.d.comb += self.done.eq(~mmureq) # done if not MMU
+                    with m.Elif(ldst_r.alignstate == Misalign.WAITFIRST):
+                        # first LD done: load data, initiate 2nd request.
+                        # leave in ACK_WAIT state
+                        with m.If(ldst_r.load):
+                            m.d.comb += self.load_data[0:63].eq(d_in.data)
+                            sync += self.load_data_delay[0:64].eq(d_in.data)
+                        with m.Else():
+                            m.d.sync += d_out.data.eq(self.store_data2)
+                        # mmm kinda cheating, make a 2nd blip.
+                        # use an aligned version of the address
+                        m.d.comb += self.d_validblip.eq(1)
+                        comb += self.req.eq(ldst_r) # from copy of request
+                        comb += self.req.raddr.eq(self.next_addr)
+                        comb += self.req.byte_sel.eq(ldst_r.byte_sel[8:])
+                        comb += self.req.alignstate.eq(Misalign.WAITSECOND)
+                        sync += ldst_r.raddr.eq(self.next_addr)
+                        sync += ldst_r.byte_sel.eq(ldst_r.byte_sel[8:])
+                        sync += ldst_r.alignstate.eq(Misalign.WAITSECOND)
+                        sync += Display("    second req %x", self.req.raddr)
+                    with m.Elif(ldst_r.alignstate == Misalign.WAITSECOND):
+                        sync += Display("    done second %x", d_in.data)
+                        # done second load
+                        sync += self.state.eq(State.IDLE)
+                        sync += ldst_r.eq(0)
+                        with m.If(ldst_r.load):
+                            m.d.comb += self.load_data[64:128].eq(d_in.data)
+                            sync += self.load_data_delay[64:128].eq(d_in.data)
+                        m.d.comb += self.done.eq(~mmureq) # done if not MMU
 
             # waiting here for the MMU TLB lookup to complete.
             # either re-try the dcache lookup or throw MMU exception
             with m.Case(State.MMU_LOOKUP):
-                comb += self.busy.eq(1)
+                comb += self.busy.eq(~exception)
                 with m.If(m_in.done):
-                    with m.If(~self.instr_fault):
+                    with m.If(~self.r_instr_fault):
                         sync += Display("MMU_LOOKUP, done %x -> %x",
-                                        self.addr, d_out.addr)
+                                        self.raddr, d_out.addr)
                         # retry the request now that the MMU has
                         # installed a TLB entry, if not exception raised
                         m.d.comb += self.d_out.valid.eq(~exception)
                         sync += self.state.eq(State.ACK_WAIT)
-                        sync += ldst_r.eq(0)
                     with m.Else():
-                        sync += Display("MMU_LOOKUP, exception %x", self.addr)
-                        # instruction lookup fault: store address in DAR
-                        comb += exc.happened.eq(1)
-                        sync += self.dar.eq(self.addr)
+                        sync += self.state.eq(State.IDLE)
+                        sync += self.r_instr_fault.eq(0)
+                        comb += self.done.eq(1)
 
                 with m.If(m_in.err):
-                    # MMU RADIX exception thrown
+                    # MMU RADIX exception thrown. XXX
+                    # TODO: critical that the write here has to
+                    # notify the MMU FSM of the change to dsisr
                     comb += exception.eq(1)
+                    comb += self.done.eq(1)
+                    sync += Display("MMU RADIX exception thrown")
                     sync += self.dsisr[63 - 33].eq(m_in.invalid)
-                    sync += self.dsisr[63 - 36].eq(m_in.perm_error)
-                    sync += self.dsisr[63 - 38].eq(self.load)
+                    sync += self.dsisr[63 - 36].eq(m_in.perm_error) # noexec
+                    sync += self.dsisr[63 - 38].eq(~ldst_r.load)
                     sync += self.dsisr[63 - 44].eq(m_in.badtree)
                     sync += self.dsisr[63 - 45].eq(m_in.rc_error)
+                    sync += self.state.eq(State.IDLE)
+                    # exception thrown, clear out instruction fault state
+                    sync += self.r_instr_fault.eq(0)
 
-            with m.Case(State.TLBIE_WAIT):
-                pass
+        # MMU FSM communicating a request to update DSISR or DAR (OP_MTSPR)
+        with m.If(self.mmu_set_spr):
+            with m.If(self.mmu_set_dsisr):
+                sync += self.dsisr.eq(self.sprval_in)
+            with m.If(self.mmu_set_dar):
+                sync += self.dar.eq(self.sprval_in)
 
-        # alignment error: store address in DAR
+        # hmmm, alignment occurs in set_rd_addr/set_wr_addr, note exception
         with m.If(self.align_intr):
             comb += exc.happened.eq(1)
-            sync += self.dar.eq(self.addr)
+        # check for updating DAR
+        with m.If(exception):
+            sync += Display("exception %x", self.raddr)
+            # alignment error: store address in DAR
+            with m.If(self.align_intr):
+                sync += Display("alignment error: addr in DAR %x", self.raddr)
+                sync += self.dar.eq(self.raddr)
+            with m.Elif(~self.r_instr_fault):
+                sync += Display("not instr fault, addr in DAR %x", self.raddr)
+                sync += self.dar.eq(self.raddr)
+
+        # when done or exception, return to idle state
+        with m.If(self.done | exception):
+            sync += self.state.eq(State.IDLE)
+            comb += self.busy.eq(0)
 
         # happened, alignment, instr_fault, invalid.
         # note that all of these flow through - eventually to the TRAP
         # pipeline, via PowerDecoder2.
+        comb += self.align_intr.eq(self.req.align_intr)
         comb += exc.invalid.eq(m_in.invalid)
         comb += exc.alignment.eq(self.align_intr)
-        comb += exc.instr_fault.eq(self.instr_fault)
+        comb += exc.instr_fault.eq(self.r_instr_fault)
         # badtree, perm_error, rc_error, segment_fault
         comb += exc.badtree.eq(m_in.badtree)
         comb += exc.perm_error.eq(m_in.perm_error)
         comb += exc.rc_error.eq(m_in.rc_error)
         comb += exc.segment_fault.eq(m_in.segerr)
+        # conditions for 0x400 trap need these in SRR1
+        with m.If(exception & ~exc.alignment & exc.instr_fault):
+            comb += exc.srr1[14].eq(exc.invalid)      # 47-33
+            comb += exc.srr1[12].eq(exc.perm_error)   # 47-35
+            comb += exc.srr1[3].eq(exc.badtree)       # 47-44
+            comb += exc.srr1[2].eq(exc.rc_error)      # 47-45
 
         # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
-        comb += dbus.adr.eq(dcache.wb_out.adr)
-        comb += dbus.dat_w.eq(dcache.wb_out.dat)
-        comb += dbus.sel.eq(dcache.wb_out.sel)
-        comb += dbus.cyc.eq(dcache.wb_out.cyc)
-        comb += dbus.stb.eq(dcache.wb_out.stb)
-        comb += dbus.we.eq(dcache.wb_out.we)
-
-        comb += dcache.wb_in.dat.eq(dbus.dat_r)
-        comb += dcache.wb_in.ack.eq(dbus.ack)
+        comb += dbus.adr.eq(dcache.bus.adr)
+        comb += dbus.dat_w.eq(dcache.bus.dat_w)
+        comb += dbus.sel.eq(dcache.bus.sel)
+        comb += dbus.cyc.eq(dcache.bus.cyc)
+        comb += dbus.stb.eq(dcache.bus.stb)
+        comb += dbus.we.eq(dcache.bus.we)
+
+        comb += dcache.bus.dat_r.eq(dbus.dat_r)
+        comb += dcache.bus.ack.eq(dbus.ack)
         if hasattr(dbus, "stall"):
-            comb += dcache.wb_in.stall.eq(dbus.stall)
+            comb += dcache.bus.stall.eq(dbus.stall)
 
-        # update out d data when flag set
+        # update out d data when flag set, for first half (second done in FSM)
         with m.If(self.d_w_valid):
             m.d.sync += d_out.data.eq(self.store_data)
         #with m.Else():
@@ -305,30 +460,39 @@ class LoadStore1(PortInterfaceBase):
             m.d.comb += self.d_out.valid.eq(~exc.happened)
             m.d.comb += d_out.load.eq(self.req.load)
             m.d.comb += d_out.byte_sel.eq(self.req.byte_sel)
-            m.d.comb += self.addr.eq(self.req.addr)
+            m.d.comb += self.raddr.eq(self.req.raddr)
             m.d.comb += d_out.nc.eq(self.req.nc)
             m.d.comb += d_out.priv_mode.eq(self.req.priv_mode)
             m.d.comb += d_out.virt_mode.eq(self.req.virt_mode)
-            m.d.comb += self.align_intr.eq(self.req.align_intr)
+            m.d.comb += d_out.reserve.eq(self.req.reserve)
+            m.d.comb += d_out.atomic.eq(self.req.atomic)
+            m.d.comb += d_out.atomic_last.eq(self.req.atomic_last)
+            #m.d.comb += Display("validblip dcbz=%i addr=%x",
+            #self.req.dcbz,self.req.addr)
+            m.d.comb += d_out.dcbz.eq(self.req.dcbz)
         with m.Else():
             m.d.comb += d_out.load.eq(ldst_r.load)
             m.d.comb += d_out.byte_sel.eq(ldst_r.byte_sel)
-            m.d.comb += self.addr.eq(ldst_r.addr)
+            m.d.comb += self.raddr.eq(ldst_r.raddr)
             m.d.comb += d_out.nc.eq(ldst_r.nc)
             m.d.comb += d_out.priv_mode.eq(ldst_r.priv_mode)
             m.d.comb += d_out.virt_mode.eq(ldst_r.virt_mode)
-            m.d.comb += self.align_intr.eq(ldst_r.align_intr)
-
-        # XXX these should be possible to remove but for some reason
-        # cannot be... yet. TODO, investigate
-        m.d.comb += self.load_data.eq(d_in.data)
-        m.d.comb += d_out.addr.eq(self.addr)
+            m.d.comb += d_out.reserve.eq(ldst_r.reserve)
+            m.d.comb += d_out.atomic.eq(ldst_r.atomic)
+            m.d.comb += d_out.atomic_last.eq(ldst_r.atomic_last)
+            #m.d.comb += Display("no_validblip dcbz=%i addr=%x",
+            #ldst_r.dcbz,ldst_r.addr)
+            m.d.comb += d_out.dcbz.eq(ldst_r.dcbz)
+        m.d.comb += d_out.addr.eq(self.raddr)
 
         # Update outputs to MMU
         m.d.comb += m_out.valid.eq(mmureq)
         m.d.comb += m_out.iside.eq(self.instr_fault)
         m.d.comb += m_out.load.eq(ldst_r.load)
-        # m_out.priv <= r.priv_mode; TODO
+        with m.If(self.instr_fault):
+            m.d.comb += m_out.priv.eq(self.priv_mode)
+        with m.Else():
+            m.d.comb += m_out.priv.eq(ldst_r.priv_mode)
         m.d.comb += m_out.tlbie.eq(self.tlbie)
         # m_out.mtspr <= mmu_mtspr; # TODO
         # m_out.sprn <= sprn; # TODO
index c2d8a43cb47c0096d31e34e60e243ac7f5aba8b9..caf8bf5a15fca0dc01692225738ca70b00ae60bf 100644 (file)
@@ -22,7 +22,7 @@ class LDSTOutputData(FUBaseData):
     # LDSTCompUnit is unusual in that it's non-standard to RegSpecAPI
     regspec = [('INT', 'o', '0:63'),   # RT
                ('INT', 'o1', '0:63'),  # RA (effective address, update mode)
-               # TODO, later ('CR', 'cr_a', '0:3'),
+               ('CR', 'cr_a', '0:3'),
                # TODO, later ('XER', 'xer_so', '32')
                 ]
     def __init__(self, pspec):
@@ -32,5 +32,5 @@ class LDSTOutputData(FUBaseData):
 
 
 class LDSTPipeSpec(CommonPipeSpec):
-    regspec = (LDSTInputData.regspec, LDSTOutputData.regspec)
+    regspecklses = (LDSTInputData, LDSTOutputData)
     opsubsetkls = CompLDSTOpSubset
index dc086faefb70a5f217a6ee3cb894c4553a18371f..83eaf989ca08df99b4f23eba8dc026215a3dbab7 100644 (file)
@@ -58,15 +58,16 @@ class Bpermd(Elaboratable):
     def elaborate(self, platform):
         m = Module()
         perm = Signal(self.width, reset_less=True)
-        rb64 = [Signal(1, reset_less=True, name=f"rb64_{i}") for i in range(64)]
-        for i in range(64):
-            m.d.comb += rb64[i].eq(self.rb[63-i])
+        rb64 = [Signal(1, reset_less=True, name=f"rb64_{i}")
+                for i in range(self.width)]
+        for i in range(self.width):
+            m.d.comb += rb64[i].eq(self.rb[self.width-1-i])
         rb64 = Array(rb64)
-        for i in range(8):
+        for i in range(self.width//8):
             index = self.rs[8*i:8*i+8]
             idx = Signal(8, name=f"idx_{i}", reset_less=True)
             m.d.comb += idx.eq(index)
-            with m.If(idx < 64):
+            with m.If(idx < self.width):
                 m.d.comb += perm[i].eq(rb64[idx])
         m.d.comb += self.ra[0:8].eq(perm)
         return m
index d11f832df0b7e4d68957e85c40e83ca013b5aaf8..aa9b937d937ac52061912f994b295c7c7d4b1f6c 100644 (file)
@@ -32,7 +32,7 @@ class Driver(Elaboratable):
             recwidth += width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = ALUInputStage(pspec)
 
         a = Signal(64)
@@ -41,7 +41,7 @@ class Driver(Elaboratable):
                  dut.i.b.eq(b),
                  a.eq(AnyConst(64)),
                  b.eq(AnyConst(64))]
-                      
+
         comb += dut.i.ctx.op.eq(rec)
 
         # Assert that op gets copied from the input to output
@@ -70,6 +70,7 @@ class GTCombinerTestCase(FHDLTestCase):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=4)
         self.assertFormal(module, mode="cover", depth=4)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index 179d9ba26926ebe63afefc57eb5ce56add73f5fb..87d87283de4563e7c0ec2a8f6646159d601da4fc 100644 (file)
@@ -47,7 +47,7 @@ class Driver(Elaboratable):
             width = p.width
             comb += p.eq(AnyConst(width))
 
-        pspec = ALUPipeSpec(id_wid=2)
+        pspec = ALUPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = LogicalMainStage(pspec)
 
         # convenience variables
@@ -60,7 +60,7 @@ class Driver(Elaboratable):
         # setup random inputs
         comb += [a.eq(AnyConst(64)),
                  b.eq(AnyConst(64)),
-                 #carry_in.eq(AnyConst(0b11)),
+                 # carry_in.eq(AnyConst(0b11)),
                  ]
 
         comb += dut.i.ctx.op.eq(rec)
@@ -78,7 +78,7 @@ class Driver(Elaboratable):
         comb += a_signed_32.eq(a[0:32])
 
         o_ok = Signal()
-        comb += o_ok.eq(1) # will be set to zero if no op takes place
+        comb += o_ok.eq(1)  # will be set to zero if no op takes place
 
         # main assertion of arithmetic operations
         with m.Switch(rec.insn_type):
@@ -125,10 +125,10 @@ class Driver(Elaboratable):
                         comb += peo.eq(32)
                     with m.Else():
                         comb += peo.eq(pe32.o)
-                    with m.If(XO[-1]): # cnttzw
+                    with m.If(XO[-1]):  # cnttzw
                         comb += pe32.i.eq(a[0:32])
                         comb += Assert(o == peo)
-                    with m.Else(): # cntlzw
+                    with m.Else():  # cntlzw
                         comb += pe32.i.eq(a[0:32][::-1])
                         comb += Assert(o == peo)
                 with m.Else():
@@ -138,10 +138,10 @@ class Driver(Elaboratable):
                         comb += peo64.eq(64)
                     with m.Else():
                         comb += peo64.eq(pe64.o)
-                    with m.If(XO[-1]): # cnttzd
+                    with m.If(XO[-1]):  # cnttzd
                         comb += pe64.i.eq(a[0:64])
                         comb += Assert(o == peo64)
-                    with m.Else(): # cntlzd
+                    with m.Else():  # cntlzd
                         comb += pe64.i.eq(a[0:64][::-1])
                         comb += Assert(o == peo64)
 
@@ -180,6 +180,7 @@ class LogicalTestCase(FHDLTestCase):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=2)
         self.assertFormal(module, mode="cover", depth=2)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index e56f3445f4cd538ede6c72b36995eaf620a5c19e..6a90395783e798165bd55576c769c9bf73144952 100644 (file)
@@ -6,6 +6,8 @@
 # to the output stage
 
 # Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
+# Copyright (C) 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+
 from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
 from nmutil.pipemodbase import PipeModBase
 from nmutil.clz import CLZ
@@ -13,7 +15,7 @@ from soc.fu.logical.pipe_data import LogicalInputData
 from soc.fu.logical.bpermd import Bpermd
 from soc.fu.logical.popcount import Popcount
 from soc.fu.logical.pipe_data import LogicalOutputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
 from openpower.decoder.power_enums import MicrOp
 
 from openpower.decoder.power_fields import DecodeFields
@@ -33,14 +35,15 @@ class LogicalMainStage(PipeModBase):
         return LogicalOutputData(self.pspec)
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
         op, a, b, o = self.i.ctx.op, self.i.a, self.i.b, self.o.o
 
         comb += o.ok.eq(1) # overridden if no op activates
 
-        m.submodules.bpermd = bpermd = Bpermd(64)
-        m.submodules.popcount = popcount = Popcount()
+        m.submodules.bpermd = bpermd = Bpermd(XLEN)
+        m.submodules.popcount = popcount = Popcount(XLEN)
 
         ##########################
         # main switch for logic ops AND, OR and XOR, cmpb, parity, and popcount
@@ -84,12 +87,14 @@ class LogicalMainStage(PipeModBase):
                 par0 = Signal(reset_less=True)
                 par1 = Signal(reset_less=True)
                 comb += par0.eq(Cat(a[0], a[8], a[16], a[24]).xor())
-                comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
+                if XLEN == 64:
+                    comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
                 with m.If(op.data_len[3] == 1):
                     comb += o.data.eq(par0 ^ par1)
                 with m.Else():
                     comb += o[0].eq(par0)
-                    comb += o[32].eq(par1)
+                    if XLEN == 64:
+                        comb += o[32].eq(par1)
 
             ###################
             ###### cntlz v3.0B p99
@@ -99,7 +104,7 @@ class LogicalMainStage(PipeModBase):
                 count_right = Signal(reset_less=True)
                 comb += count_right.eq(XO[-1])
 
-                cntz_i = Signal(64, reset_less=True)
+                cntz_i = Signal(XLEN, reset_less=True)
                 a32 = Signal(32, reset_less=True)
                 comb += a32.eq(a[0:32])
 
@@ -108,7 +113,7 @@ class LogicalMainStage(PipeModBase):
                 with m.Else():
                     comb += cntz_i.eq(Mux(count_right, a[::-1], a))
 
-                m.submodules.clz = clz = CLZ(64)
+                m.submodules.clz = clz = CLZ(XLEN)
                 comb += clz.sig_in.eq(cntz_i)
                 comb += o.data.eq(Mux(op.is_32bit, clz.lz-32, clz.lz))
 
index 73b48d1eecdd33a58245c5fba91685b53c07e52c..81a1c5247de848509bea6b2e577ea78401225e60 100644 (file)
@@ -6,7 +6,7 @@ from nmutil.pipemodbase import PipeModBase
 from soc.fu.common_output_stage import CommonOutputStage
 from soc.fu.logical.pipe_data import (LogicalInputData, LogicalOutputData,
                                       LogicalOutputDataFinal)
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
 from openpower.decoder.power_enums import MicrOp
 
 
index 3d9077aaf1721b0aea1bbc65c29023e6d8638164..359a2a595689ed66b5b15f2789e92b7a90b90998 100644 (file)
@@ -5,40 +5,47 @@ from soc.fu.logical.logical_input_record import CompLogicalOpSubset
 
 # input (and output) for logical initial stage (common input)
 class LogicalInputData(FUBaseData):
-    regspec = [('INT', 'ra', '0:63'), # RA
-               ('INT', 'rb', '0:63'), # RB/immediate
-               ('XER', 'xer_so', '32'),    # bit0: so
-               ]
     def __init__(self, pspec):
         super().__init__(pspec, False)
         # convenience
         self.a, self.b = self.ra, self.rb
 
+    @property
+    def regspec(self):
+        return [('INT', 'ra', self.intrange),  # RA
+               ('INT', 'rb', self.intrange),  # RB/immediate
+               ('XER', 'xer_so', '32'),    # bit0: so
+               ]
 
 # input to logical final stage (common output)
 class LogicalOutputData(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),        # RT
-               ('CR', 'cr_a', '0:3'),
-               ('XER', 'xer_so', '32'),    # bit0: so
-               ]
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ('XER', 'xer_so', '32'),    # bit0: so
+               ]
+
 
 # output from logical final stage (common output) - note that XER.so
 # is *not* included (the only reason it's in the input is because of CR0)
 class LogicalOutputDataFinal(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),        # RT
-               ('CR', 'cr_a', '0:3'),
-               ]
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ]
 
 
 class LogicalPipeSpec(CommonPipeSpec):
-    regspec = (LogicalInputData.regspec, LogicalOutputDataFinal.regspec)
+    regspecklses = (LogicalInputData, LogicalOutputDataFinal)
     opsubsetkls = CompLogicalOpSubset
index a16bd78acab1c6c65702368ddd28b5a2f07f1dc1..a0f00d1dcd6f473b77cb852d6200ccb78c982272 100644 (file)
@@ -8,11 +8,15 @@ from soc.fu.logical.output_stage import LogicalOutputStage
 class LogicalStages1(PipeModBaseChain):
     def get_chain(self):
         inp = LogicalInputStage(self.pspec)
+        return [inp]
+
+class LogicalStages2(PipeModBaseChain):
+    def get_chain(self):
         main = LogicalMainStage(self.pspec)
-        return [inp, main]
+        return [main]
 
 
-class LogicalStages2(PipeModBaseChain):
+class LogicalStages3(PipeModBaseChain):
     def get_chain(self):
         out = LogicalOutputStage(self.pspec)
         return [out]
@@ -24,11 +28,13 @@ class LogicalBasePipe(ControlBase):
         self.pspec = pspec
         self.pipe1 = LogicalStages1(pspec)
         self.pipe2 = LogicalStages2(pspec)
-        self._eqs = self.connect([self.pipe1, self.pipe2])
+        self.pipe3 = LogicalStages3(pspec)
+        self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
 
     def elaborate(self, platform):
         m = ControlBase.elaborate(self, platform)
         m.submodules.logical_pipe1 = self.pipe1
         m.submodules.logical_pipe2 = self.pipe2
+        m.submodules.logical_pipe3 = self.pipe3
         m.d.comb += self._eqs
         return m
index ca90112d495c326996e16b1b11d93ef2649dfb12..5975149db345bdb28822d7ee683a47148e80c61a 100644 (file)
@@ -23,11 +23,13 @@ def array_of(count, bitwidth):
 
 
 class Popcount(Elaboratable):
-    def __init__(self):
-        self.a = Signal(64, reset_less=True)
-        self.b = Signal(64, reset_less=True)
+    def __init__(self, width=64):
+        self.width = width
+        self.a = Signal(width, reset_less=True)
+        self.b = Signal(width, reset_less=True)
         self.data_len = Signal(4, reset_less=True) # data len up to... err.. 8?
-        self.o = Signal(64, reset_less=True)
+        self.o = Signal(width, reset_less=True)
+        assert width in [32, 64], "only 32 or 64 bit supported for now"
 
     def elaborate(self, platform):
         m = Module()
@@ -38,11 +40,13 @@ class Popcount(Elaboratable):
         # creating arrays big enough to store the sum, each time
         pc = [a]
         # QTY32 2-bit (to take 2x 1-bit sums) etc.
-        work = [(32, 2), (16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
+        work = [(16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
+        if self.width == 64:
+            work = [(32, 2)] + work
         for l, bw in work: # l=number of add-reductions, bw=bitwidth
             pc.append(array_of(l, bw))
-        pc8 = pc[3]     # array of 8 8-bit counts (popcntb)
-        pc32 = pc[5]    # array of 2 32-bit counts (popcntw)
+        pc8 = pc[-4]     # array of 8 8-bit counts (popcntb)
+        pc32 = pc[-2]    # array of 2 32-bit counts (popcntw)
         popcnt = pc[-1]  # array of 1 64-bit count (popcntd)
         # cascade-tree of adds
         for idx, (l, bw) in enumerate(work):
@@ -54,12 +58,15 @@ class Popcount(Elaboratable):
         # decode operation length (1-hot)
         with m.If(data_len == 1):
             # popcntb - pack 8x 4-bit answers into 8x 8-bit output fields
-            for i in range(8):
+            for i in range(self.width//8):
                 comb += o[i*8:(i+1)*8].eq(pc8[i])
         with m.Elif(data_len == 4):
-            # popcntw - pack 2x 5-bit answers into 2x 32-bit output fields
-            for i in range(2):
-                comb += o[i*32:(i+1)*32].eq(pc32[i])
+            if self.width == 64:
+                # popcntw - pack 2x 5-bit answers into 2x 32-bit output fields
+                for i in range(2):
+                    comb += o[i*32:(i+1)*32].eq(pc32[i])
+            else:
+                comb += o.eq(popcnt[0])
         with m.Else():
             # popcntd - put 1x 6-bit answer into 64-bit output
             comb += o.eq(popcnt[0])
index ca14b271d6958534961e9814a86b27df7a7e7b50..8e7c67e83005081fed50756e46c1019411533f70 100644 (file)
@@ -39,10 +39,10 @@ def get_cu_inputs(dec2, sim):
 def set_alu_inputs(alu, dec2, sim):
     # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
     # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
-    # and place it into data_i.b
+    # and place it into i_data.b
 
     inp = yield from get_cu_inputs(dec2, sim)
-    print ("set alu inputs", inp)
+    print("set alu inputs", inp)
     yield from ALUHelpers.set_int_ra(alu, dec2, inp)
     yield from ALUHelpers.set_int_rb(alu, dec2, inp)
     yield from ALUHelpers.set_xer_so(alu, dec2, inp)
@@ -51,19 +51,19 @@ def set_alu_inputs(alu, dec2, sim):
 class LogicalIlangCase(TestAccumulatorBase):
 
     def case_ilang(self):
-        pspec = LogicalPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = LogicalPipeSpec(id_wid=2, parent_pspec=pps)
         alu = LogicalBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("logical_pipeline.il", "w") as f:
             f.write(vl)
 
 
-class TestRunner(FHDLTestCase):
-    def __init__(self, test_data):
-        super().__init__("run_all")
-        self.test_data = test_data
+class TestRunner(unittest.TestCase):
 
-    def execute(self, alu,instruction, pdecode2, test):
+    def execute(self, alu, instruction, pdecode2, test):
         print(test.name)
         program = test.program
         self.subTest(test.name)
@@ -89,25 +89,27 @@ class TestRunner(FHDLTestCase):
             yield from set_alu_inputs(alu, pdecode2, simulator)
 
             # set valid for one cycle, propagate through pipeline...
-            yield alu.p.valid_i.eq(1)
+            yield alu.p.i_valid.eq(1)
             yield
-            yield alu.p.valid_i.eq(0)
+            yield alu.p.i_valid.eq(0)
 
             opname = code.split(' ')[0]
             yield from simulator.call(opname)
             index = simulator.pc.CIA.value//4
 
-            vld = yield alu.n.valid_o
+            vld = yield alu.n.o_valid
             while not vld:
                 yield
-                vld = yield alu.n.valid_o
+                vld = yield alu.n.o_valid
             yield
 
             yield from self.check_alu_outputs(alu, pdecode2,
                                               simulator, code)
             yield Settle()
 
-    def run_all(self):
+    def test_it(self):
+        test_data = LogicalIlangCase().test_data + \
+            LogicalTestCase({'soc'}).test_data
         m = Module()
         comb = m.d.comb
         instruction = Signal(32)
@@ -116,18 +118,21 @@ class TestRunner(FHDLTestCase):
 
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
 
-        pspec = LogicalPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = LogicalPipeSpec(id_wid=2, parent_pspec=pps)
         m.submodules.alu = alu = LogicalBasePipe(pspec)
 
-        comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
-        comb += alu.n.ready_i.eq(1)
+        comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+        comb += alu.n.i_ready.eq(1)
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
 
         sim.add_clock(1e-6)
 
         def process():
-            for test in self.test_data:
+            for test in test_data:
                 print(test.name)
                 program = test.program
                 with self.subTest(test.name):
@@ -163,10 +168,4 @@ class TestRunner(FHDLTestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(exit=False)
-    suite = unittest.TestSuite()
-    suite.addTest(TestRunner(LogicalIlangCase().test_data))
-    suite.addTest(TestRunner(LogicalTestCase().test_data))
-
-    runner = unittest.TextTestRunner()
-    runner.run(suite)
+    unittest.main()
index 1a3026e79e8e760822df1108edaabcb031c0e6a1..24be3f5402710bed1fb3a016e672ef12cc13671b 100644 (file)
@@ -24,6 +24,7 @@ from soc.experiment.mem_types import LoadStore1ToMMUType
 from soc.experiment.mem_types import MMUToLoadStore1Type
 
 from soc.fu.ldst.loadstore import LoadStore1, TestSRAMLoadStore1
+from nmutil.util import Display
 
 
 class FSMMMUStage(ControlBase):
@@ -42,8 +43,9 @@ class FSMMMUStage(ControlBase):
         self.pspec = pspec
 
         # set up p/n data
-        self.p.data_i = MMUInputData(pspec)
-        self.n.data_o = MMUOutputData(pspec)
+        self.p.i_data = MMUInputData(pspec)
+        self.n.o_data = MMUOutputData(pspec)
+        self.exc_o = self.n.o_data.exception # AllFunctionUnits needs this
 
         self.mmu = MMU()
 
@@ -52,7 +54,7 @@ class FSMMMUStage(ControlBase):
         self.illegal = Signal()
 
         # for SPR field number access
-        i = self.p.data_i
+        i = self.p.i_data
         self.fields = DecodeFields(SignalBitRange, [i.ctx.op.insn])
         self.fields.create_specs()
 
@@ -64,53 +66,47 @@ class FSMMMUStage(ControlBase):
         # incoming PortInterface
         self.ldst = ldst
         self.dcache = self.ldst.dcache
+        self.icache = self.ldst.icache
         self.pi = self.ldst.pi
 
     def elaborate(self, platform):
         assert hasattr(self, "dcache"), "remember to call set_ldst_interface"
         m = super().elaborate(platform)
         comb, sync = m.d.comb, m.d.sync
-        dcache = self.dcache
+        dcache, icache = self.dcache, self.icache
+        ldst = self.ldst # managed externally: do not add here
 
-        # link mmu and dcache together
+        # link mmu, dcache and icache together
         m.submodules.mmu = mmu = self.mmu
-        ldst = self.ldst # managed externally: do not add here
         m.d.comb += dcache.m_in.eq(mmu.d_out) # MMUToDCacheType
         m.d.comb += mmu.d_in.eq(dcache.m_out) # DCacheToMMUType
+        m.d.comb += icache.m_in.eq(mmu.i_out) # MMUToICacheType
 
         l_in, l_out = mmu.l_in, mmu.l_out
         d_in, d_out = dcache.d_in, dcache.d_out
-        wb_out, wb_in = dcache.wb_out, dcache.wb_in
 
         # link ldst and MMU together
         comb += l_in.eq(ldst.m_out)
         comb += ldst.m_in.eq(l_out)
 
-        data_i, data_o = self.p.data_i, self.n.data_o
-        a_i, b_i, o, spr1_o = data_i.ra, data_i.rb, data_o.o, data_o.spr1
-        op = data_i.ctx.op
+        i_data, o_data = self.p.i_data, self.n.o_data
+        op = i_data.ctx.op
+        cia_i = op.cia
         msr_i = op.msr
-        spr1_i = data_i.spr1
-
-        # these are set / got here *ON BEHALF* of LoadStore1
-        dsisr, dar = ldst.dsisr, ldst.dar
+        a_i, b_i, spr1_i = i_data.ra, i_data.rb, i_data.spr1
+        o, exc_o, spr1_o = o_data.o, o_data.exception, o_data.spr1
 
         # busy/done signals
-        busy = Signal()
-        done = Signal()
-        m.d.comb += self.n.valid_o.eq(busy & done)
-        m.d.comb += self.p.ready_o.eq(~busy)
+        busy = Signal(name="mmu_fsm_busy")
+        done = Signal(name="mmu_fsm_done")
+        m.d.comb += self.n.o_valid.eq(busy & done)
+        m.d.comb += self.p.o_ready.eq(~busy)
 
         # take copy of X-Form SPR field
         x_fields = self.fields.FormXFX
         spr = Signal(len(x_fields.SPR))
         comb += spr.eq(decode_spr_num(x_fields.SPR))
 
-        # based on MSR bits, set priv and virt mode.  TODO: 32-bit mode
-        comb += d_in.priv_mode.eq(~msr_i[MSR.PR])
-        comb += d_in.virt_mode.eq(msr_i[MSR.DR])
-        #comb += d_in.mode_32bit.eq(msr_i[MSR.SF]) # ?? err
-
         # ok so we have to "pulse" the MMU (or dcache) rather than
         # hold the valid hi permanently.  guess what this does...
         valid = Signal()
@@ -118,7 +114,7 @@ class FSMMMUStage(ControlBase):
         m.d.comb += blip.eq(rising_edge(m, valid))
 
         with m.If(~busy):
-            with m.If(self.p.valid_i):
+            with m.If(self.p.i_valid):
                 sync += busy.eq(1)
         with m.Else():
 
@@ -127,10 +123,16 @@ class FSMMMUStage(ControlBase):
             # enabled ("valid") and we twiddle our thumbs until it
             # responds ("done").
 
-            # FIXME: properly implement MicrOp.OP_MTSPR and MicrOp.OP_MFSPR
+            # WIP: properly implement MicrOp.OP_MTSPR and MicrOp.OP_MFSPR
 
             with m.Switch(op.insn_type):
+
+                ##########
+                # OP_MTSPR
+                ##########
+
                 with m.Case(MicrOp.OP_MTSPR):
+                    comb += Display("MMUTEST: OP_MTSPR: spr=%i", spr)
                     # despite redirection this FU **MUST** behave exactly
                     # like the SPR FU.  this **INCLUDES** updating the SPR
                     # regfile because the CSV file entry for OP_MTSPR
@@ -145,13 +147,17 @@ class FSMMMUStage(ControlBase):
                     with m.If(~spr[9] & ~spr[5]):
                         comb += self.debug0.eq(3)
                         #if matched update local cached value
+                        #commented out because there is a driver conflict
+                        comb += ldst.sprval_in.eq(a_i)
+                        comb += ldst.mmu_set_spr.eq(1)
                         with m.If(spr[0]):
-                            sync += dsisr.eq(a_i[:32])
+                            comb += ldst.mmu_set_dar.eq(1)
                         with m.Else():
-                            sync += dar.eq(a_i)
+                            comb += ldst.mmu_set_dsisr.eq(1)
                         comb += done.eq(1)
                     # pass it over to the MMU instead
                     with m.Else():
+                        # PGTBL and PID
                         comb += self.debug0.eq(4)
                         # blip the MMU and wait for it to complete
                         comb += valid.eq(1)   # start "pulse"
@@ -161,50 +167,42 @@ class FSMMMUStage(ControlBase):
                         comb += l_in.rs.eq(a_i)    # incoming operand (RS)
                         comb += done.eq(1) # FIXME l_out.done
 
+                ##########
+                # OP_MFSPR
+                ##########
+
                 with m.Case(MicrOp.OP_MFSPR):
-                    # subset SPR: first check a few bits
-                    #with m.If(~spr[9] & ~spr[5]):
-                    #    comb += self.debug0.eq(5)
-                        #with m.If(spr[0]):
-                        #    comb += o.data.eq(dsisr)
-                        #with m.Else():
-                        #    comb += o.data.eq(dar)
-                    #do NOT return cached values
-                    comb += o.data.eq(spr1_i)
+                    comb += Display("MMUTEST: OP_MFSPR: spr=%i returns=%i",
+                                    spr, spr1_i)
+                    # partial SPR number decoding perfectly fine
+                    with m.If(spr[9] | spr[5]):
+                        # identified as an MMU OP_MFSPR, contact the MMU.
+                        # interestingly, the read is combinatorial: no need
+                        # to set "valid", just set the SPR number
+                        comb += l_in.sprn.eq(spr)  # which SPR
+                        comb += o.data.eq(l_out.sprval)
+                    with m.Else():
+                        # identified as DSISR or DAR.  again: read the SPR
+                        # directly, combinatorial access
+                        with m.If(spr[0]):
+                            comb += o.data.eq(ldst.dar)
+                        with m.Else():
+                            comb += o.data.eq(ldst.dsisr)
+
                     comb += o.ok.eq(1)
                     comb += done.eq(1)
-                    # pass it over to the MMU instead
-                    #with m.Else():
-                    #    comb += self.debug0.eq(6)
-                    #    # blip the MMU and wait for it to complete
-                    #    comb += valid.eq(1)   # start "pulse"
-                    #    comb += l_in.valid.eq(blip)   # start
-                    #    comb += l_in.mtspr.eq(0)   # mfspr!=mtspr
-                    #    comb += l_in.sprn.eq(spr)  # which SPR
-                    #    comb += l_in.rs.eq(a_i)    # incoming operand (RS)
-                    #    comb += o.data.eq(l_out.sprval) # SPR from MMU
-                    #    comb += o.ok.eq(l_out.done) # only when l_out valid
-                    #    comb += done.eq(1) # FIXME l_out.done
-
-                # XXX this one is going to have to go through LDSTCompUnit
-                # because it's LDST that has control over dcache
-                # (through PortInterface).  or, another means is devised
-                # so as not to have double-drivers of d_in.valid and addr
-                #
-                #with m.Case(MicrOp.OP_DCBZ):
-                #    # activate dcbz mode (spec: v3.0B p850)
-                #    comb += valid.eq(1)   # start "pulse"
-                #    comb += d_in.valid.eq(blip)     # start
-                #    comb += d_in.dcbz.eq(1)         # dcbz mode
-                #    comb += d_in.addr.eq(a_i + b_i) # addr is (RA|0) + RB
-                #    comb += done.eq(d_out.store_done)     # TODO
-                #    comb += self.debug0.eq(1)
+
+                ##########
+                # OP_TLBIE
+                ##########
 
                 with m.Case(MicrOp.OP_TLBIE):
+                    comb += Display("MMUTEST: OP_TLBIE: insn_bits=%i", spr)
                     # pass TLBIE request to MMU (spec: v3.0B p1034)
                     # note that the spr is *not* an actual spr number, it's
                     # just that those bits happen to match with field bits
                     # RIC, PRS, R
+                    comb += Display("TLBIE: %i %i", spr, l_out.done)
                     comb += valid.eq(1)   # start "pulse"
                     comb += l_in.valid.eq(blip)   # start
                     comb += l_in.tlbie.eq(1)   # mtspr mode
@@ -213,10 +211,37 @@ class FSMMMUStage(ControlBase):
                     comb += done.eq(l_out.done) # zzzz
                     comb += self.debug0.eq(2)
 
+                ##########
+                # OP_FETCH_FAILED
+                ##########
+
+                with m.Case(MicrOp.OP_FETCH_FAILED):
+                    comb += Display("MMUTEST: OP_FETCH_FAILED: @%x", cia_i)
+                    # trigger an instruction fetch failed MMU event.
+                    # PowerDecoder2 drops svstate.pc into NIA for us
+                    # really, this should be direct communication with the
+                    # MMU, rather than going through LoadStore1.  but, doing
+                    # so allows for the opportunity to prevent LoadStore1
+                    # from accepting any other LD/ST requests.
+                    comb += valid.eq(1)   # start "pulse"
+                    comb += ldst.instr_fault.eq(blip)
+                    comb += ldst.priv_mode.eq(~msr_i[MSR.PR])
+                    comb += ldst.maddr.eq(cia_i)
+                    # XXX should not access this!
+                    comb += done.eq(ldst.done)
+                    comb += self.debug0.eq(3)
+                    # LDST unit contains exception data, which (messily)
+                    # is copied over, here.  not ideal but it will do for now
+                    comb += exc_o.eq(ldst.pi.exc_o)
+
+                ############
+                # OP_ILLEGAL
+                ############
+
                 with m.Case(MicrOp.OP_ILLEGAL):
                     comb += self.illegal.eq(1)
 
-            with m.If(self.n.ready_i & self.n.valid_o):
+            with m.If(self.n.i_ready & self.n.o_valid):
                 sync += busy.eq(0)
 
         return m
index 109d2d389327f646404df4dfcaf5ba324b42c466..aea08bc8a31c9ea6dadd4cf2cbad2af6ccd69202 100644 (file)
@@ -13,7 +13,8 @@ class CompMMUOpSubset(CompOpSubsetBase):
         layout = (('insn_type', MicrOp),
                   ('fn_unit', Function),
                   ('insn', 32),
-                  ('msr', 64), # TODO: a lot less bits.  only need PR, DR, SF
+                  ('cia', 64), # for instruction fault (MMU PTE lookup)
+                  ('msr', 64), # ditto, to set priv_mode etc.
                   ('zero_a', 1),
                   )
         super().__init__(layout, name=name)
index bc86e29151d060679cca0818cf485641843d184b..7272a2256a3117fa2f554fe617e1eec75d7e1d84 100644 (file)
@@ -13,6 +13,7 @@ Links:
 from soc.fu.pipe_data import FUBaseData
 from soc.fu.mmu.mmu_input_record import CompMMUOpSubset
 from soc.fu.alu.pipe_data import CommonPipeSpec
+from openpower.exceptions import LDSTException
 
 
 class MMUInputData(FUBaseData):
@@ -32,9 +33,9 @@ class MMUOutputData(FUBaseData):
                ('SPR', 'spr1', '0:63'),     # MMU (slow)
                ]
     def __init__(self, pspec):
-        super().__init__(pspec, True)
+        super().__init__(pspec, True, LDSTException)
 
 
 class MMUPipeSpec(CommonPipeSpec):
-    regspec = (MMUInputData.regspec, MMUOutputData.regspec)
+    regspecklses = (MMUInputData, MMUOutputData)
     opsubsetkls = CompMMUOpSubset
index 0bb6ecad6a25318fadeb2e78b277e622915f5c2b..f5919b9a9dc9bc050bda852b1128c85d8e2e5ead 100644 (file)
@@ -13,33 +13,66 @@ class MMUTestCase(TestAccumulatorBase):
     # libre-soc has own SPR unit
     # other instructions here -> must be load/store
 
-    def case_mmu_ldst(self):
+    def cse_dcbz(self):
         lst = [
                 "dcbz 1,2",
+              ]
+
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x2
+        initial_regs[2] = 0x2020
+
+        self.add_case(Program(lst, bigendian),
+                      initial_regs, initial_mem={})
+
+    def case_mmu_dar(self):
+        lst = [
+                "mfspr 1, 720",     # DAR to reg 1
+                "mtspr 19, 3",      # reg 3 to DAR
+              ]
+
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x2
+        initial_regs[3] = 0x5
+
+        initial_sprs = {'DAR': 0x87654321,
+                        }
+        self.add_case(Program(lst, bigendian),
+                      initial_regs, initial_sprs, initial_mem={})
+
+    def case_mmu_ldst(self):
+        lst = [
+                "dcbz 1,0",
                 "tlbie 0,0,0,0,0", # RB,RS,RIC,PRS,R
                 "mtspr 18, 1",     # reg 1 to DSISR
                 "mtspr 19, 2",     # reg 2 to DAR
-                "mfspr 1, 18",     # DSISR to reg 1
-                "mfspr 2, 19",     # DAR to reg 2
+                "mfspr 5, 18",     # DSISR to reg 5
+                "mfspr 6, 19",     # DAR to reg 6
                 "mtspr 48, 3",    # set MMU PID
                 "mtspr 720, 4",    # set MMU PRTBL
-                "lhz 3, 0(1)"      # load some data
+                "lhz 3, 0(1)",     # load some data
+                "addi 7, 0, 1"
               ]
 
         initial_regs = [0] * 32
-        initial_regs[3] = 1
+        initial_regs[1] = 0x2
+        initial_regs[2] = 0x2020
+        initial_regs[3] = 5
         initial_regs[4] = 0xDEADBEEF
-        #initial_regs[1] = 0xDEADBEEF
 
-        #FIXME initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
-        initial_sprs = {}
+        initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321,
+                        'PIDR': 0xabcd, 'PRTBL': 0x0def}
         self.add_case(Program(lst, bigendian),
-                      initial_regs, initial_sprs)
+                      initial_regs, initial_sprs, initial_mem={})
 
 
 if __name__ == "__main__":
+    mem = {}
     unittest.main(exit=False)
     suite = unittest.TestSuite()
-    suite.addTest(TestRunner(MMUTestCase().test_data,microwatt_mmu=True))
+    suite.addTest(TestRunner(MMUTestCase().test_data,
+                             microwatt_mmu=True,
+                             svp64=False,
+                             rom=mem))
     runner = unittest.TextTestRunner()
     runner.run(suite)
index 5a77a455c10b78417cd9b7939ac8a3cdd0081fe9..e234ac22f524d9262a4085506aa19b91e7958d2b 100644 (file)
@@ -30,26 +30,27 @@ from soc.simple.test.test_core import (setup_regs, check_regs,
 
 debughang = 2
 
+
 class MMUTestCase(TestAccumulatorBase):
     # MMU handles MTSPR, MFSPR, DCBZ and TLBIE.
     # other instructions here -> must be load/store
 
     def case_mfspr_after_invalid_load(self):
-        lst = [ # TODO -- set SPR on both sinulator and port interface
-                "mfspr 1, 18", # DSISR to reg 1
-                "mfspr 2, 19", # DAR to reg 2
-                # TODO -- verify returned sprvals
-              ]
+        lst = [  # TODO -- set SPR on both sinulator and port interface
+            "mfspr 1, 18",  # DSISR to reg 1
+            "mfspr 2, 19",  # DAR to reg 2
+            # TODO -- verify returned sprvals
+        ]
 
         initial_regs = [0] * 32
 
-        #THOSE are currently broken -- initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
+        # THOSE are currently broken -- initial_sprs = {'DSISR': 0x12345678, 'DAR': 0x87654321}
         initial_sprs = {}
         self.add_case(Program(lst, bigendian),
                       initial_regs, initial_sprs)
 
-    #def case_ilang(self):
-    #    pspec = SPRPipeSpec(id_wid=2)
+    # def case_ilang(self):
+    #    pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
     #    alu = SPRBasePipe(pspec)
     #    vl = rtlil.convert(alu, ports=alu.ports())
     #    with open("trap_pipeline.il", "w") as f:
@@ -102,12 +103,14 @@ class TestRunner(unittest.TestCase):
 
             fsm = core.fus.fus["mmu0"].alu
 
-            vld = yield fsm.n.valid_o
+            vld = yield fsm.n.o_valid
             while not vld:
                 yield
-                if debughang:  print("not valid -- hang")
-                vld = yield fsm.n.valid_o
-                if debughang==2: vld=1
+                if debughang:
+                    print("not valid -- hang")
+                vld = yield fsm.n.o_valid
+                if debughang == 2:
+                    vld = 1
             yield
 
     def run_all(self):
@@ -126,10 +129,10 @@ class TestRunner(unittest.TestCase):
                              reg_wid=64)
 
         m.submodules.core = core = NonProductionCore(pspec
-                                     # XXX NO absolutely do not do this.
-                                     # all options must go into the pspec
-                                     #, microwatt_mmu=True
-                                                        )
+                                                     # XXX NO absolutely do not do this.
+                                                     # all options must go into the pspec
+                                                     # , microwatt_mmu=True
+                                                     )
 
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
@@ -149,6 +152,7 @@ class TestRunner(unittest.TestCase):
                            traces=[]):
             sim.run()
 
+
 if __name__ == "__main__":
     unittest.main(exit=False)
     suite = unittest.TestSuite()
index a8fcfac5b8fcd7cc65b0e504b5e6fe1552fe9137..e81dd174263a910be21531f13f080bad54cb7cb7 100644 (file)
@@ -31,10 +31,11 @@ import power_instruction_analyzer as pia
 
 debughang = 1
 
+
 def set_fsm_inputs(alu, dec2, sim):
     # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
     # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
-    # and place it into data_i.b
+    # and place it into i_data.b
 
     print("Error here")
     inp = yield from get_cu_inputs(dec2, sim)
@@ -45,16 +46,16 @@ def set_fsm_inputs(alu, dec2, sim):
     # yield from ALUHelpers.set_spr_spr1(alu, dec2, inp)
 
     overflow = None
-    a=None
-    b=None
+    a = None
+    b = None
     # TODO
     if 'xer_so' in inp:
         print("xer_so::::::::::::::::::::::::::::::::::::::::::::::::")
         so = inp['xer_so']
         print(so)
         overflow = pia.OverflowFlags(so=bool(so),
-                                      ov=False,
-                                      ov32=False)
+                                     ov=False,
+                                     ov32=False)
     if 'ra' in inp:
         a = inp['ra']
     if 'rb' in inp:
@@ -65,12 +66,14 @@ def set_fsm_inputs(alu, dec2, sim):
 
 def check_fsm_outputs(fsm, pdecode2, sim, code):
     # check that MMUOutputData is correct
-    return None #TODO
+    return None  # TODO
+
+# incomplete test - connect fsm inputs first
+
 
-#incomplete test - connect fsm inputs first
 class MMUIlangCase(TestAccumulatorBase):
-    #def case_ilang(self):
-    #    pspec = SPRPipeSpec(id_wid=2)
+    # def case_ilang(self):
+    #    pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
     #    alu = SPRBasePipe(pspec)
     #    vl = rtlil.convert(alu, ports=alu.ports())
     #    with open("trap_pipeline.il", "w") as f:
@@ -82,6 +85,8 @@ class TestRunner(unittest.TestCase):
     def __init__(self, test_data):
         super().__init__("run_all")
         self.test_data = test_data
+        # hack here -- all unit tests are affected
+        self.run_all()
 
     def check_fsm_outputs(self, alu, dec2, sim, code, pia_res):
 
@@ -96,26 +101,25 @@ class TestRunner(unittest.TestCase):
         sim_o = {}
         res = {}
 
-        #MMUOutputData does not have xer
+        # MMUOutputData does not have xer
 
         yield from ALUHelpers.get_cr_a(res, alu, dec2)
-        #yield from ALUHelpers.get_xer_ov(res, alu, dec2)
+        # yield from ALUHelpers.get_xer_ov(res, alu, dec2)
         yield from ALUHelpers.get_int_o(res, alu, dec2)
-        #yield from ALUHelpers.get_xer_so(res, alu, dec2)
-
+        # yield from ALUHelpers.get_xer_so(res, alu, dec2)
 
         print("res output", res)
 
         yield from ALUHelpers.get_sim_int_o(sim_o, sim, dec2)
         yield from ALUHelpers.get_wr_sim_cr_a(sim_o, sim, dec2)
-        #yield from ALUHelpers.get_sim_xer_ov(sim_o, sim, dec2)
-        #yield from ALUHelpers.get_sim_xer_so(sim_o, sim, dec2)
+        # yield from ALUHelpers.get_sim_xer_ov(sim_o, sim, dec2)
+        # yield from ALUHelpers.get_sim_xer_so(sim_o, sim, dec2)
 
         print("sim output", sim_o)
 
         print("power-instruction-analyzer result:")
         print(pia_res)
-        #if pia_res is not None:
+        # if pia_res is not None:
         #    with self.subTest(check="pia", sim_o=sim_o, pia_res=str(pia_res)):
         #        pia_o = pia_res_to_output(pia_res)
         #        ALUHelpers.check_int_o(self, res, pia_o, code)
@@ -124,18 +128,18 @@ class TestRunner(unittest.TestCase):
         #        #ALUHelpers.check_xer_so(self, res, pia_o, code)
 
         with self.subTest(check="sim", sim_o=sim_o, pia_res=str(pia_res)):
-            #ALUHelpers.check_int_o(self, res, sim_o, code) # mmu is not an alu
+            # ALUHelpers.check_int_o(self, res, sim_o, code) # mmu is not an alu
             ALUHelpers.check_cr_a(self, res, sim_o, code)
             #ALUHelpers.check_xer_ov(self, res, sim_o, code)
             #ALUHelpers.check_xer_so(self, res, sim_o, code)
 
-        #oe = yield dec2.e.do.oe.oe
-        #oe_ok = yield dec2.e.do.oe.ok
+        # oe = yield dec2.e.do.oe.oe
+        # oe_ok = yield dec2.e.do.oe.ok
         #print("oe, oe_ok", oe, oe_ok)
-        #if not oe or not oe_ok:
+        # if not oe or not oe_ok:
         #    # if OE not enabled, XER SO and OV must not be activated
-        #    so_ok = yield alu.n.data_o.xer_so.ok
-        #    ov_ok = yield alu.n.data_o.xer_ov.ok
+        #    so_ok = yield alu.n.o_data.xer_so.ok
+        #    ov_ok = yield alu.n.o_data.xer_ov.ok
         #    print("so, ov", so_ok, ov_ok)
         #    self.assertEqual(ov_ok, False, code)
         #    self.assertEqual(so_ok, False, code)
@@ -179,7 +183,7 @@ class TestRunner(unittest.TestCase):
             print("dec2 spr/fast in", fast_out, spr_out)
 
             fn_unit = yield pdecode2.e.do.fn_unit
-            #FIXME this fails -- self.assertEqual(fn_unit, Function.SPR.value)
+            # FIXME this fails -- self.assertEqual(fn_unit, Function.SPR.value)
             pia_res = yield from set_fsm_inputs(fsm, pdecode2, sim)
             yield
             opname = code.split(' ')[0]
@@ -189,14 +193,15 @@ class TestRunner(unittest.TestCase):
             index = pc//4
             print("pc after %08x" % (pc))
 
-            vld = yield fsm.n.valid_o #fsm
+            vld = yield fsm.n.o_valid  # fsm
             while not vld:
                 yield
                 if debughang:
                     print("not valid -- hang")
                     return
-                vld = yield fsm.n.valid_o
-                if debughang==2: vld=1
+                vld = yield fsm.n.o_valid
+                if debughang == 2:
+                    vld = 1
             yield
 
             yield from self.check_fsm_outputs(fsm, pdecode2, sim, code, pia_res)
@@ -206,7 +211,7 @@ class TestRunner(unittest.TestCase):
         comb = m.d.comb
         instruction = Signal(32)
 
-        pspec = TestMemPspec(addr_wid=48,
+        pspec = TestMemPspec(addr_wid=64,
                              mask_wid=8,
                              reg_wid=64,
                              )
@@ -215,18 +220,18 @@ class TestRunner(unittest.TestCase):
 
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
 
-        pipe_spec = MMUPipeSpec(id_wid=2)
+        pipe_spec = MMUPipeSpec(id_wid=2, parent_pspec=None)
         ldst = LoadStore1(pspec)
         fsm = FSMMMUStage(pipe_spec)
         fsm.set_ldst_interface(ldst)
         m.submodules.fsm = fsm
         m.submodules.ldst = ldst
 
-        #FIXME connect fsm inputs
+        # FIXME connect fsm inputs
 
-        comb += fsm.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
-        comb += fsm.p.valid_i.eq(1)
-        comb += fsm.n.ready_i.eq(1)
+        comb += fsm.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+        comb += fsm.p.i_valid.eq(1)
+        comb += fsm.n.i_ready.eq(1)
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
 
@@ -245,6 +250,7 @@ class TestRunner(unittest.TestCase):
                            traces=[]):
             sim.run()
 
+
 if __name__ == "__main__":
     unittest.main(exit=False)
     suite = unittest.TestSuite()
index f1837baa2e0c5e3183a2fe84778f5c82f8785cee..a78294606b82f5c0a3aaced9051d8ee5eeeb6fe3 100644 (file)
@@ -84,18 +84,19 @@ class Driver(Elaboratable):
 
         # set up the mul stages.  do not add them to m.submodules, this
         # is handled by StageChain.setup().
-        pspec = MulPipeSpec(id_wid=2)
+        pspec = MulPipeSpec(id_wid=2, parent_pspec=None)
         pipe1 = MulMainStage1(pspec)
         pipe2 = MulMainStage2(pspec)
         pipe3 = MulMainStage3(pspec)
 
-        class Dummy: pass
-        dut = Dummy() # make a class into which dut.i and dut.o can be dropped
+        class Dummy:
+            pass
+        dut = Dummy()  # make a class into which dut.i and dut.o can be dropped
         dut.i = pipe1.ispec()
-        chain = [pipe1, pipe2, pipe3] # chain of 3 mul stages
+        chain = [pipe1, pipe2, pipe3]  # chain of 3 mul stages
 
-        StageChain(chain).setup(m, dut.i) # input linked here, through chain
-        dut.o = chain[-1].o # output is the last thing in the chain...
+        StageChain(chain).setup(m, dut.i)  # input linked here, through chain
+        dut.o = chain[-1].o  # output is the last thing in the chain...
 
         # convenience variables
         a = dut.i.ra
@@ -145,7 +146,7 @@ class Driver(Elaboratable):
         # setup random inputs
         comb += [a.eq(AnyConst(64)),
                  b.eq(AnyConst(64)),
-                ]
+                 ]
 
         comb += dut.i.ctx.op.eq(rec)
 
@@ -169,7 +170,7 @@ class Driver(Elaboratable):
             ###### HI-32 #####
 
             with m.Case(MicrOp.OP_MUL_H32):
-                comb += Assume(rec.is_32bit) # OP_MUL_H32 is a 32-bit op
+                comb += Assume(rec.is_32bit)  # OP_MUL_H32 is a 32-bit op
 
                 exp_prod = Signal(64)
                 expected_o = Signal.like(exp_prod)
@@ -186,7 +187,7 @@ class Driver(Elaboratable):
                     # differ, we negate the product.  This implies that
                     # the product is calculated from the absolute values
                     # of the inputs.
-                    prod = Signal.like(exp_prod) # intermediate product
+                    prod = Signal.like(exp_prod)  # intermediate product
                     comb += prod.eq(abs32_a * abs32_b)
                     comb += exp_prod.eq(Mux(ab32_sne, -prod, prod))
                     comb += expected_o.eq(Repl(exp_prod[32:64], 2))
@@ -210,7 +211,7 @@ class Driver(Elaboratable):
                     # differ, we negate the product.  This implies that
                     # the product is calculated from the absolute values
                     # of the inputs.
-                    prod = Signal.like(exp_prod) # intermediate product
+                    prod = Signal.like(exp_prod)  # intermediate product
                     comb += prod.eq(abs64_a * abs64_b)
                     comb += exp_prod.eq(Mux(ab64_sne, -prod, prod))
                     comb += Assert(o[0:64] == exp_prod[64:128])
@@ -285,6 +286,7 @@ class MulTestCase(FHDLTestCase):
         module = Driver()
         self.assertFormal(module, mode="bmc", depth=2)
         self.assertFormal(module, mode="cover", depth=2)
+
     def test_ilang(self):
         dut = Driver()
         vl = rtlil.convert(dut, ports=[])
index 68bcf47d5df861ec71708dac7b3b26121a3e4dc4..e2a2727f2c1de56cbdbfa9c72e7dc320166f12cb 100644 (file)
@@ -3,7 +3,7 @@
 from nmigen import Module
 from nmutil.pipemodbase import PipeModBase
 from soc.fu.mul.pipe_data import MulIntermediateData, MulOutputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
 
 
 class MulMainStage2(PipeModBase):
index a55e80d1d335d19bdb1ee04475291aaabc0d06fa..a5047be722cc36559018cfd9485c7b7b82ce70ac 100644 (file)
@@ -15,8 +15,6 @@ class MulIntermediateData(DivInputData):
 
 
 class MulOutputData(FUBaseData):
-    regspec = [('INT', 'o', '0:128'),
-               ('XER', 'xer_so', '32')] # XER bit 32: SO
     def __init__(self, pspec):
         super().__init__(pspec, False) # still input style
 
@@ -25,7 +23,12 @@ class MulOutputData(FUBaseData):
         self.data.append(self.neg_res)
         self.data.append(self.neg_res32)
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', "0:%d" % (self.pspec.XLEN*2)), # 2xXLEN
+               ('XER', 'xer_so', '32')] # XER bit 32: SO
+
 
 class MulPipeSpec(CommonPipeSpec):
-    regspec = (DivInputData.regspec, DivMulOutputData.regspec)
+    regspecklses = (DivInputData, DivMulOutputData)
     opsubsetkls = CompMULOpSubset
index 0b45c791ade830433a5cde5ce2638d2eb7e07d6e..d7e8df417a4a699861b59ff86705a11b11281dc4 100644 (file)
@@ -10,7 +10,7 @@ from nmigen import (Module, Signal, Cat, Repl, Mux, signed)
 from nmutil.pipemodbase import PipeModBase
 from soc.fu.div.pipe_data import DivMulOutputData
 from soc.fu.mul.pipe_data import MulOutputData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
 from openpower.decoder.power_enums import MicrOp
 
 
index f22964dd5b50be0930fe6fa77c1841486b1363e1..a8a7fb4e5201ad479d22a9a61c7d7bb7dfa14034 100644 (file)
@@ -4,7 +4,7 @@ from nmigen import (Module, Signal, Mux)
 from nmutil.pipemodbase import PipeModBase
 from soc.fu.div.pipe_data import DivInputData
 from soc.fu.mul.pipe_data import MulIntermediateData
-from ieee754.part.partsig import PartitionedSignal
+from ieee754.part.partsig import SimdSignal
 from nmutil.util import eq32
 
 class MulMainStage1(PipeModBase):
@@ -18,6 +18,7 @@ class MulMainStage1(PipeModBase):
         return MulIntermediateData(self.pspec) # pipeline stage output format
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
 
@@ -35,8 +36,8 @@ class MulMainStage1(PipeModBase):
         comb += is_32bit.eq(op.is_32bit)
 
         # work out if a/b are negative (check 32-bit / signed)
-        comb += sign_a.eq(Mux(op.is_32bit, a[31], a[63]) & op.is_signed)
-        comb += sign_b.eq(Mux(op.is_32bit, b[31], b[63]) & op.is_signed)
+        comb += sign_a.eq(Mux(op.is_32bit, a[31], a[XLEN-1]) & op.is_signed)
+        comb += sign_b.eq(Mux(op.is_32bit, b[31], b[XLEN-1]) & op.is_signed)
         comb += sign32_a.eq(a[31] & op.is_signed)
         comb += sign32_b.eq(b[31] & op.is_signed)
 
@@ -47,8 +48,8 @@ class MulMainStage1(PipeModBase):
         # negation of a 64-bit value produces the same lower 32-bit
         # result as negation of just the lower 32-bits, so we don't
         # need to do anything special before negating
-        abs_a = Signal(64, reset_less=True)
-        abs_b = Signal(64, reset_less=True)
+        abs_a = Signal(XLEN, reset_less=True)
+        abs_b = Signal(XLEN, reset_less=True)
         comb += abs_a.eq(Mux(sign_a, -a, a))
         comb += abs_b.eq(Mux(sign_b, -b, b))
 
index 06f1a56fa64ff0ad54e7242284b4c92a48a1e1cb..30cb94966d3292916d94b7e809ad70f88e55e609 100644 (file)
@@ -42,7 +42,7 @@ def get_cu_inputs(dec2, sim):
 def set_alu_inputs(alu, dec2, sim, has_third_input):
     # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
     # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
-    # and place it into data_i.b
+    # and place it into i_data.b
 
     inp = yield from get_cu_inputs(dec2, sim)
     print("set alu inputs", inp)
@@ -59,8 +59,14 @@ def set_alu_inputs(alu, dec2, sim, has_third_input):
         overflow = pia.OverflowFlags(so=bool(so),
                                      ov=False,
                                      ov32=False)
+    immediate_ok = yield dec2.e.do.imm_data.ok
+    if immediate_ok:
+        immediate = yield dec2.e.do.imm_data.data
+    else:
+        immediate = None
     rc = inp["rc"] if has_third_input else None
     return pia.InstructionInput(ra=inp.get("ra"), rb=inp.get("rb"),
+                                immediate=immediate,
                                 rc=rc, overflow=overflow)
 
 
@@ -96,33 +102,25 @@ class MulTestHelper(unittest.TestCase):
                                                    has_third_input)
 
             # set valid for one cycle, propagate through pipeline...
-            yield alu.p.valid_i.eq(1)
+            yield alu.p.i_valid.eq(1)
             yield
-            yield alu.p.valid_i.eq(0)
+            yield alu.p.i_valid.eq(0)
 
             opname = code.split(' ')[0]
             fnname = opname.replace(".", "_")
             print(f"{fnname}({pia_inputs})")
-            pia_res = None
-            try:
-                pia_res = getattr(pia, fnname)(pia_inputs)
-            except AttributeError:
-                EXPECTED_FAILURES = ["mulli"]
-                if fnname not in EXPECTED_FAILURES:
-                    raise
-                else:
-                    print("not implemented, as expected.")
+            pia_res = getattr(pia, fnname)(pia_inputs)
             print(f"-> {pia_res}")
 
             yield from isa_sim.call(opname)
             index = isa_sim.pc.CIA.value//4
 
             # ...wait for valid to pop out the end
-            vld = yield alu.n.valid_o
+            vld = yield alu.n.o_valid
             while not vld:
                 yield
                 yield Delay(0.1e-6)
-                vld = yield alu.n.valid_o
+                vld = yield alu.n.o_valid
             yield Delay(0.1e-6)
 
             # XXX sim._engine is an internal variable
@@ -148,11 +146,14 @@ class MulTestHelper(unittest.TestCase):
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
         pdecode = pdecode2.dec
 
-        pspec = MulPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = MulPipeSpec(id_wid=2, parent_pspec=pps)
         m.submodules.alu = alu = MulBasePipe(pspec)
 
-        comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
-        comb += alu.n.ready_i.eq(1)
+        comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+        comb += alu.n.i_ready.eq(1)
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
 
@@ -183,8 +184,8 @@ class MulTestHelper(unittest.TestCase):
         oe_ok = yield dec2.e.do.oe.ok
         if not oe or not oe_ok:
             # if OE not enabled, XER SO and OV must correspondingly be false
-            so_ok = yield alu.n.data_o.xer_so.ok
-            ov_ok = yield alu.n.data_o.xer_ov.ok
+            so_ok = yield alu.n.o_data.xer_so.ok
+            ov_ok = yield alu.n.o_data.xer_ov.ok
             self.assertEqual(so_ok, False, code)
             self.assertEqual(ov_ok, False, code)
 
index 75d7fce15ab622a42042984f2c63cfa751642ea3..afa4e00701b368cc6f8a1b4139d3ccb4e4923732 100644 (file)
@@ -1,16 +1,17 @@
 import unittest
 from soc.fu.mul.test.helper import MulTestHelper
 from openpower.test.mul.long_mul_cases import (MulTestCases2Arg,
-                                               MulTestCases3Arg)
+                                               MulTestCases3Arg,
+                                               MUL_3_ARG_TEST_VALUES)
 
 
 class TestPipeLong(MulTestHelper):
     def test_mul_pipe_2_arg(self):
-        self.run_all(MulTestCases2Arg().test_data, "mul_pipe_caller_long_2_arg",
-                     has_third_input=False)
+        self.run_all(MulTestCases2Arg({'soc'}).test_data,
+                     "mul_pipe_caller_long_2_arg", has_third_input=False)
 
     def helper_3_arg(self, subtest_index):
-        self.run_all(MulTestCases3Arg(subtest_index).test_data,
+        self.run_all(MulTestCases3Arg(subtest_index, {'soc'}).test_data,
                      f"mul_pipe_caller_long_3_arg_{subtest_index}",
                      has_third_input=True)
 
index 22af35ba90037441175670866306bdd1c6743c82..7411b586b7ad8c5481c41bc27b4b2cf78ab155cf 100644 (file)
@@ -6,7 +6,10 @@ from soc.fu.mul.pipeline import MulBasePipe
 
 class TestPipeIlang(unittest.TestCase):
     def write_ilang(self):
-        pspec = MulPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = MulPipeSpec(id_wid=2, parent_pspec=pps)
         alu = MulBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("mul_pipeline.il", "w") as f:
index abee2df9a9c91050f7978ebac50b9b4acbd5e94e..427f5c6a0cdc52a7532a989e432782a9e57f1cb0 100644 (file)
@@ -17,12 +17,14 @@ class FUBaseData:
     """
 
     def __init__(self, pspec, output, exc_kls=None):
-        self.ctx = PipeContext(pspec) # context for ReservationStation usage
+        self.pspec = pspec
+        self.ctx = PipeContext(pspec)  # context for ReservationStation usage
         self.muxid = self.ctx.muxid
         self.data = []
         self.is_output = output
         # take regspec and create data attributes (in or out)
         # TODO: use widspec to create reduced bit mapping.
+        print (self.regspec)
         for i, (regfile, regname, widspec) in enumerate(self.regspec):
             wid = get_regspec_bitwidth([self.regspec], 0, i)
             if output:
@@ -42,22 +44,27 @@ class FUBaseData:
         if hasattr(self, "exception"):
             yield from self.exception.ports()
 
+    # convenience function to return 0:63 if XLEN=64, 0:31 if XLEN=32 etc.
+    @property
+    def intrange(self):
+        return "0:%d" % (self.pspec.XLEN-1)
+
     def eq(self, i):
         eqs = [self.ctx.eq(i.ctx)]
         assert len(self.data) == len(i.data), \
-               "length of %s mismatch against %s: %s %s" % \
-                   (repr(self), repr(i), repr(self.data), repr(i.data))
+            "length of %s mismatch against %s: %s %s" % \
+            (repr(self), repr(i), repr(self.data), repr(i.data))
         for j in range(len(self.data)):
             assert type(self.data[j]) == type(i.data[j]), \
-                   "type mismatch in FUBaseData %s %s" % \
-                   (repr(self.data[j]), repr(i.data[j]))
+                "type mismatch in FUBaseData %s %s" % \
+                (repr(self.data[j]), repr(i.data[j]))
             eqs.append(self.data[j].eq(i.data[j]))
         if hasattr(self, "exception"):
             eqs.append(self.exception.eq(i.exception))
         return eqs
 
     def ports(self):
-        return self.ctx.ports() # TODO: include self.data
+        return self.ctx.ports()  # TODO: include self.data
 
 
 # hmmm there has to be a better way than this
@@ -74,9 +81,27 @@ class CommonPipeSpec:
     """CommonPipeSpec: base class for all pipeline specifications
     see README.md for explanation of members.
     """
-    def __init__(self, id_wid):
+
+    def __init__(self, id_wid, parent_pspec):
         self.pipekls = SimpleHandshakeRedir
         self.id_wid = id_wid
         self.opkls = lambda _: self.opsubsetkls()
-        self.op_wid = get_rec_width(self.opkls(None)) # hmm..
+        self.op_wid = get_rec_width(self.opkls(None))  # hmm..
         self.stage = None
+        self.parent_pspec = parent_pspec
+
+    # forward attributes from parent_pspec
+    def __getattr__(self, name):
+        return getattr(self.parent_pspec, name)
+
+
+def get_pspec_draft_bitmanip(pspec):
+    """ True if the draft bitmanip instructions are enabled in the provided
+    pspec. The instructions enabled by this are draft instructions -- they are
+    not official OpenPower instructions, they are intended to be eventually
+    submitted to the OpenPower ISA WG.
+
+    https://libre-soc.org/openpower/sv/bitmanip/
+    """
+    # use `is True` to account for Mock absurdities
+    return getattr(pspec, "draft_bitmanip", False) is True
index 07d071206f5e65d8ca41040cb1a59f7c96812b32..f5971aadff87b2cad67b9d6610ee929f2081f11f 100644 (file)
@@ -39,6 +39,7 @@ def get_regspec_bitwidth(regspec, srcdest, idx):
 class RegSpec:
     def __init__(self, rwid, n_src=None, n_dst=None, name=None):
         self._rwid = rwid
+        print ("RegSpec", rwid)
         if isinstance(rwid, int):
             # rwid: integer (covers all registers)
             self._n_src, self._n_dst = n_src, n_dst
@@ -65,6 +66,11 @@ class RegSpecAPI:
         """
         self.rwid = rwid
 
+    def get_io_spec(self, direction, i):
+        if direction: # input (read specs)
+            return self.get_in_spec(i)
+        return self.get_out_spec(i)
+
     def get_in_spec(self, i):
         return self.rwid[0][i]
 
@@ -92,15 +98,15 @@ class RegSpecALUAPI(RegSpecAPI):
         if isinstance(self.rwid, int):  # old - testing - API (rwid is int)
             return self.alu.out[i]
         # regspec-based API: look up variable through regspec thru row number
-        return getattr(self.alu.n.data_o, self.get_out_name(i))
+        return getattr(self.alu.n.o_data, self.get_out_name(i))
 
     def get_in(self, i):
         if isinstance(self.rwid, int):  # old - testing - API (rwid is int)
             return self.alu.i[i]
         # regspec-based API: look up variable through regspec thru row number
-        return getattr(self.alu.p.data_i, self.get_in_name(i))
+        return getattr(self.alu.p.i_data, self.get_in_name(i))
 
     def get_op(self):
         if isinstance(self.rwid, int):  # old - testing - API (rwid is int)
             return self.alu.op
-        return self.alu.p.data_i.ctx.op
+        return self.alu.p.i_data.ctx.op
index 5d8bae28fd3773f655679a9596902dc6644e2511..379211d623a01259f77c90229cae0d57f40228a7 100644 (file)
-# Proof of correctness for partitioned equal signal combiner
+# Proof of correctness for shift/rotate FU
 # Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
 """
 Links:
 * https://bugs.libre-soc.org/show_bug.cgi?id=340
+
+run tests with:
+pip install pytest
+pip install pytest-xdist
+pytest -n auto src/soc/fu/shift_rot/formal/proof_main_stage.py
+because that tells pytest to run the tests in parallel, it will take a few
+minutes instead of an hour.
 """
 
+import unittest
+import enum
 from nmigen import (Module, Signal, Elaboratable, Mux, Cat, Repl,
-                    signed)
-from nmigen.asserts import Assert, AnyConst, Assume, Cover
+                    signed, Const, unsigned)
+from nmigen.asserts import Assert, AnyConst, Assume
 from nmutil.formaltest import FHDLTestCase
-from nmigen.cli import rtlil
+from nmutil.sim_util import do_sim
+from nmigen.sim import Delay
 
 from soc.fu.shift_rot.main_stage import ShiftRotMainStage
-from soc.fu.shift_rot.rotator import right_mask, left_mask
 from soc.fu.shift_rot.pipe_data import ShiftRotPipeSpec
-from soc.fu.shift_rot.sr_input_record import CompSROpSubset
 from openpower.decoder.power_enums import MicrOp
-from openpower.consts import field
 
-import unittest
-from nmutil.extend import exts
+
+@enum.unique
+class TstOp(enum.Enum):
+    """ops we're testing, the idea is if we run a separate formal proof for
+    each instruction, we end up covering them all and each runs much faster,
+    also the formal proofs can be run in parallel."""
+    SHL = MicrOp.OP_SHL
+    SHR = MicrOp.OP_SHR
+    RLC32 = MicrOp.OP_RLC, 32
+    RLC64 = MicrOp.OP_RLC, 64
+    RLCL = MicrOp.OP_RLCL
+    RLCR = MicrOp.OP_RLCR
+    EXTSWSLI = MicrOp.OP_EXTSWSLI
+    TERNLOG = MicrOp.OP_TERNLOG
+    # grev removed -- leaving code for later use in grevlut
+    # GREV32 = MicrOp.OP_GREV, 32
+    # GREV64 = MicrOp.OP_GREV, 64
+
+    @property
+    def op(self):
+        if isinstance(self.value, tuple):
+            return self.value[0]
+        return self.value
+
+
+def eq_any_const(sig: Signal):
+    return sig.eq(AnyConst(sig.shape(), src_loc_at=1))
+
+
+class Mask(Elaboratable):
+    # copied from qemu's mask fn:
+    # https://gitlab.com/qemu-project/qemu/-/blob/477c3b934a47adf7de285863f59d6e4503dd1a6d/target/ppc/internal.h#L21
+    def __init__(self):
+        self.start = Signal(6)
+        self.end = Signal(6)
+        self.out = Signal(64)
+
+    def elaborate(self, platform):
+        m = Module()
+        max_val = Const(~0, unsigned(64))
+        max_bit = 63
+        with m.If(self.start == 0):
+            m.d.comb += self.out.eq(max_val << (max_bit - self.end))
+        with m.Elif(self.end == max_bit):
+            m.d.comb += self.out.eq(max_val >> self.start)
+        with m.Else():
+            ret = (max_val >> self.start) ^ ((max_val >> self.end) >> 1)
+            m.d.comb += self.out.eq(Mux(self.start > self.end, ~ret, ret))
+        return m
+
+
+class TstMask(unittest.TestCase):
+    def test_mask(self):
+        dut = Mask()
+
+        def case(start, end, expected):
+            with self.subTest(start=start, end=end):
+                yield dut.start.eq(start)
+                yield dut.end.eq(end)
+                yield Delay(1e-6)
+                out = yield dut.out
+                with self.subTest(out=hex(out), expected=hex(expected)):
+                    self.assertEqual(expected, out)
+
+        def process():
+            for start in range(64):
+                for end in range(64):
+                    expected = 0
+                    if start > end:
+                        for i in range(start, 64):
+                            expected |= 1 << (63 - i)
+                        for i in range(0, end + 1):
+                            expected |= 1 << (63 - i)
+                    else:
+                        for i in range(start, end + 1):
+                            expected |= 1 << (63 - i)
+                    yield from case(start, end, expected)
+        with do_sim(self, dut, [dut.start, dut.end, dut.out]) as sim:
+            sim.add_process(process)
+            sim.run()
+
+
+def rotl64(v, amt):
+    v |= Const(0, 64)  # convert to value at least 64-bits wide
+    amt |= Const(0, 6)  # convert to value at least 6-bits wide
+    return (Cat(v[:64], v[:64]) >> (64 - amt[:6]))[:64]
+
+
+def rotl32(v, amt):
+    v |= Const(0, 32)  # convert to value at least 32-bits wide
+    return rotl64(Cat(v[:32], v[:32]), amt)
 
 
 # This defines a module to drive the device under test and assert
 # properties about its outputs
 class Driver(Elaboratable):
-    def __init__(self):
-        # inputs and outputs
-        pass
+    def __init__(self, which):
+        assert isinstance(which, TstOp) or which is None
+        self.which = which
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
 
-        rec = CompSROpSubset()
-        # Setup random inputs for dut.op.  do them explicitly so that
-        # we can see which ones cause failures in the debug report
-        #for p in rec.ports():
-        #    comb += p.eq(AnyConst(p.width))
-        comb += rec.insn_type.eq(AnyConst(rec.insn_type.width))
-        comb += rec.fn_unit.eq(AnyConst(rec.fn_unit.width))
-        comb += rec.imm_data.imm.eq(AnyConst(rec.imm_data.imm.width))
-        comb += rec.imm_data.imm_ok.eq(AnyConst(rec.imm_data.imm_ok.width))
-        comb += rec.rc.rc.eq(AnyConst(rec.rc.rc.width))
-        comb += rec.rc.rc_ok.eq(AnyConst(rec.rc.rc_ok.width))
-        comb += rec.oe.oe.eq(AnyConst(rec.oe.oe.width))
-        comb += rec.oe.oe_ok.eq(AnyConst(rec.oe.oe_ok.width))
-        comb += rec.write_cr0.eq(AnyConst(rec.write_cr0.width))
-        comb += rec.input_carry.eq(AnyConst(rec.input_carry.width))
-        comb += rec.output_carry.eq(AnyConst(rec.output_carry.width))
-        comb += rec.input_cr.eq(AnyConst(rec.input_cr.width))
-        comb += rec.is_32bit.eq(AnyConst(rec.is_32bit.width))
-        comb += rec.is_signed.eq(AnyConst(rec.is_signed.width))
-        comb += rec.insn.eq(AnyConst(rec.insn.width))
-
-
-        pspec = ShiftRotPipeSpec(id_wid=2)
+        pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=None)
+        pspec.draft_bitmanip = True
         m.submodules.dut = dut = ShiftRotMainStage(pspec)
 
-        # convenience variables
-        rs = dut.i.rs  # register to shift
-        b = dut.i.rb   # register containing amount to shift by
-        ra = dut.i.a   # source register if masking is to be done
-        carry_in = dut.i.xer_ca[0]
-        carry_in32 = dut.i.xer_ca[1]
-        carry_out = dut.o.xer_ca
-        o = dut.o.o.data
-        print ("fields", rec.fields)
-        itype = rec.insn_type
-
-        # instruction fields
-        m_fields = dut.fields.FormM
-        md_fields = dut.fields.FormMD
-
-        # setup random inputs
-        comb += rs.eq(AnyConst(64))
-        comb += ra.eq(AnyConst(64))
-        comb += b.eq(AnyConst(64))
-        comb += carry_in.eq(AnyConst(1))
-        comb += carry_in32.eq(AnyConst(1))
-
-        # copy operation
-        comb += dut.i.ctx.op.eq(rec)
+        # Set inputs to formal variables
+        comb += [
+            eq_any_const(dut.i.ctx.op.insn_type),
+            eq_any_const(dut.i.ctx.op.fn_unit),
+            eq_any_const(dut.i.ctx.op.imm_data.data),
+            eq_any_const(dut.i.ctx.op.imm_data.ok),
+            eq_any_const(dut.i.ctx.op.rc.rc),
+            eq_any_const(dut.i.ctx.op.rc.ok),
+            eq_any_const(dut.i.ctx.op.oe.oe),
+            eq_any_const(dut.i.ctx.op.oe.ok),
+            eq_any_const(dut.i.ctx.op.write_cr0),
+            eq_any_const(dut.i.ctx.op.input_carry),
+            eq_any_const(dut.i.ctx.op.output_carry),
+            eq_any_const(dut.i.ctx.op.input_cr),
+            eq_any_const(dut.i.ctx.op.is_32bit),
+            eq_any_const(dut.i.ctx.op.is_signed),
+            eq_any_const(dut.i.ctx.op.insn),
+            eq_any_const(dut.i.xer_ca),
+            eq_any_const(dut.i.ra),
+            eq_any_const(dut.i.rb),
+            eq_any_const(dut.i.rc),
+        ]
 
         # check that the operation (op) is passed through (and muxid)
         comb += Assert(dut.o.ctx.op == dut.i.ctx.op)
         comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid)
 
-        # signed and signed/32 versions of input rs
-        a_signed = Signal(signed(64))
-        a_signed_32 = Signal(signed(32))
-        comb += a_signed.eq(rs)
-        comb += a_signed_32.eq(rs[0:32])
-
-        # masks: start-left
-        mb = Signal(7, reset_less=True)
-        ml = Signal(64, reset_less=True)
-
-        # clear left?
-        with m.If((itype == MicrOp.OP_RLC) | (itype == MicrOp.OP_RLCL)):
-            with m.If(rec.is_32bit):
-                comb += mb.eq(m_fields.MB)
-            with m.Else():
-                comb += mb.eq(md_fields.mb)
-        with m.Else():
-            with m.If(rec.is_32bit):
-                comb += mb.eq(b[0:6])
-            with m.Else():
-                comb += mb.eq(b+32)
-        comb += ml.eq(left_mask(m, mb))
-
-        # masks: end-right
-        me = Signal(7, reset_less=True)
-        mr = Signal(64, reset_less=True)
-
-        # clear right?
-        with m.If((itype == MicrOp.OP_RLC) | (itype == MicrOp.OP_RLCR)):
-            with m.If(rec.is_32bit):
-                comb += me.eq(m_fields.ME)
-            with m.Else():
-                comb += me.eq(md_fields.me)
-        with m.Else():
-            with m.If(rec.is_32bit):
-                comb += me.eq(b[0:6])
-            with m.Else():
-                comb += me.eq(63-b)
-        comb += mr.eq(right_mask(m, me))
-
-        # must check Data.ok
-        o_ok = Signal()
-        comb += o_ok.eq(1)
-
-        # main assertion of arithmetic operations
-        with m.Switch(itype):
-
-            # left-shift: 64/32-bit
-            with m.Case(MicrOp.OP_SHL):
-                comb += Assume(ra == 0)
-                with m.If(rec.is_32bit):
-                    comb += Assert(o[0:32] == ((rs << b[0:6]) & 0xffffffff))
-                    comb += Assert(o[32:64] == 0)
-                with m.Else():
-                    comb += Assert(o == ((rs << b[0:7]) & ((1 << 64)-1)))
-
-            # right-shift: 64/32-bit / signed
-            with m.Case(MicrOp.OP_SHR):
-                comb += Assume(ra == 0)
-                with m.If(~rec.is_signed):
-                    with m.If(rec.is_32bit):
-                        comb += Assert(o[0:32] == (rs[0:32] >> b[0:6]))
-                        comb += Assert(o[32:64] == 0)
-                    with m.Else():
-                        comb += Assert(o == (rs >> b[0:7]))
-                with m.Else():
-                    with m.If(rec.is_32bit):
-                        comb += Assert(o[0:32] == (a_signed_32 >> b[0:6]))
-                        comb += Assert(o[32:64] == Repl(rs[31], 32))
-                    with m.Else():
-                        comb += Assert(o == (a_signed >> b[0:7]))
-
-            # extswsli: 32/64-bit moded
-            with m.Case(MicrOp.OP_EXTSWSLI):
-                comb += Assume(ra == 0)
-                with m.If(rec.is_32bit):
-                    comb += Assert(o[0:32] == ((rs << b[0:6]) & 0xffffffff))
-                    comb += Assert(o[32:64] == 0)
-                with m.Else():
-                    # sign-extend to 64 bit
-                    a_s = Signal(64, reset_less=True)
-                    comb += a_s.eq(exts(rs, 32, 64))
-                    comb += Assert(o == ((a_s << b[0:7]) & ((1 << 64)-1)))
-
-            # rlwinm, rlwnm, rlwimi
-            # *CAN* these even be 64-bit capable?  I don't think they are.
-            with m.Case(MicrOp.OP_RLC):
-                comb += Assume(ra == 0)
-                comb += Assume(rec.is_32bit)
-
-                # Duplicate some signals so that they're much easier to find
-                # in gtkwave.
-                # Pro-tip: when debugging, factor out expressions into
-                # explicitly named
-                # signals, and search using a unique grep-tag (RLC in my case).
-                #   After
-                # debugging, resubstitute values to comply with surrounding
-                # code norms.
-
-                mrl = Signal(64, reset_less=True, name='MASK_FOR_RLC')
-                with m.If(mb > me):
-                    comb += mrl.eq(ml | mr)
-                with m.Else():
-                    comb += mrl.eq(ml & mr)
-
-                ainp = Signal(64, reset_less=True, name='A_INP_FOR_RLC')
-                comb += ainp.eq(field(rs, 32, 63))
-
-                sh = Signal(6, reset_less=True, name='SH_FOR_RLC')
-                comb += sh.eq(b[0:6])
-
-                exp_shl = Signal(64, reset_less=True,
-                                    name='A_SHIFTED_LEFT_BY_SH_FOR_RLC')
-                comb += exp_shl.eq((ainp << sh) & 0xFFFFFFFF)
-
-                exp_shr = Signal(64, reset_less=True,
-                                    name='A_SHIFTED_RIGHT_FOR_RLC')
-                comb += exp_shr.eq((ainp >> (32 - sh)) & 0xFFFFFFFF)
-
-                exp_rot = Signal(64, reset_less=True,
-                                    name='A_ROTATED_LEFT_FOR_RLC')
-                comb += exp_rot.eq(exp_shl | exp_shr)
-
-                exp_ol = Signal(32, reset_less=True, name='EXPECTED_OL_FOR_RLC')
-                comb += exp_ol.eq(field((exp_rot & mrl) | (ainp & ~mrl),
-                                    32, 63))
-
-                act_ol = Signal(32, reset_less=True, name='ACTUAL_OL_FOR_RLC')
-                comb += act_ol.eq(field(o, 32, 63))
-
-                # If I uncomment the following lines, I can confirm that all
-                # 32-bit rotations work.  If I uncomment only one of the
-                # following lines, I can confirm that all 32-bit rotations
-                # work.  When I remove/recomment BOTH lines, however, the
-                # assertion fails.  Why??
-
-#               comb += Assume(mr == 0xFFFFFFFF)
-#               comb += Assume(ml == 0xFFFFFFFF)
-                #with m.If(rec.is_32bit):
-                #    comb += Assert(act_ol == exp_ol)
-                #    comb += Assert(field(o, 0, 31) == 0)
-
-            #TODO
-            with m.Case(MicrOp.OP_RLCR):
-                pass
-            with m.Case(MicrOp.OP_RLCL):
-                pass
-            with m.Default():
-                comb += o_ok.eq(0)
-
-        # check that data ok was only enabled when op actioned
-        comb += Assert(dut.o.o.ok == o_ok)
+        if self.which is None:
+            for i in TstOp:
+                comb += Assume(dut.i.ctx.op.insn_type != i.op)
+            comb += Assert(~dut.o.o.ok)
+        else:
+            # we're only checking a particular operation:
+            comb += Assume(dut.i.ctx.op.insn_type == self.which.op)
+            comb += Assert(dut.o.o.ok)
+
+            # dispatch to check fn for each op
+            getattr(self, f"_check_{self.which.name.lower()}")(m, dut)
 
         return m
 
+    def _check_shl(self, m, dut):
+        m.d.comb += Assume(dut.i.ra == 0)
+        expected = Signal(64)
+        with m.If(dut.i.ctx.op.is_32bit):
+            m.d.comb += expected.eq((dut.i.rs << dut.i.rb[:6])[:32])
+        with m.Else():
+            m.d.comb += expected.eq((dut.i.rs << dut.i.rb[:7])[:64])
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_shr(self, m, dut):
+        m.d.comb += Assume(dut.i.ra == 0)
+        expected = Signal(64)
+        carry = Signal()
+        shift_in_s = Signal(signed(128))
+        shift_roundtrip = Signal(signed(128))
+        shift_in_u = Signal(128)
+        shift_amt = Signal(7)
+        with m.If(dut.i.ctx.op.is_32bit):
+            m.d.comb += [
+                shift_amt.eq(dut.i.rb[:6]),
+                shift_in_s.eq(dut.i.rs[:32].as_signed()),
+                shift_in_u.eq(dut.i.rs[:32]),
+            ]
+        with m.Else():
+            m.d.comb += [
+                shift_amt.eq(dut.i.rb[:7]),
+                shift_in_s.eq(dut.i.rs.as_signed()),
+                shift_in_u.eq(dut.i.rs),
+            ]
+
+        with m.If(dut.i.ctx.op.is_signed):
+            m.d.comb += [
+                expected.eq(shift_in_s >> shift_amt),
+                shift_roundtrip.eq((shift_in_s >> shift_amt) << shift_amt),
+                carry.eq((shift_in_s < 0) & (shift_roundtrip != shift_in_s)),
+            ]
+        with m.Else():
+            m.d.comb += [
+                expected.eq(shift_in_u >> shift_amt),
+                carry.eq(0),
+            ]
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == Repl(carry, 2))
+
+    def _check_rlc32(self, m, dut):
+        m.d.comb += Assume(dut.i.ctx.op.is_32bit)
+        # rlwimi, rlwinm, and rlwnm
+
+        m.submodules.mask = mask = Mask()
+        expected = Signal(64)
+        rot = Signal(64)
+        m.d.comb += rot.eq(rotl32(dut.i.rs[:32], dut.i.rb[:5]))
+        m.d.comb += mask.start.eq(dut.fields.FormM.MB[:] + 32)
+        m.d.comb += mask.end.eq(dut.fields.FormM.ME[:] + 32)
+
+        # for rlwinm and rlwnm, ra is guaranteed to be 0, so that part of
+        # the expression turns into a no-op
+        m.d.comb += expected.eq((rot & mask.out) | (dut.i.ra & ~mask.out))
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_rlc64(self, m, dut):
+        m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+        # rldic and rldimi
+
+        # `rb` is always a 6-bit immediate
+        m.d.comb += Assume(dut.i.rb[6:] == 0)
+
+        m.submodules.mask = mask = Mask()
+        expected = Signal(64)
+        rot = Signal(64)
+        m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+        mb = dut.fields.FormMD.mb[:]
+        m.d.comb += mask.start.eq(Cat(mb[1:6], mb[0]))
+        m.d.comb += mask.end.eq(63 - dut.i.rb[:6])
+
+        # for rldic, ra is guaranteed to be 0, so that part of
+        # the expression turns into a no-op
+        m.d.comb += expected.eq((rot & mask.out) | (dut.i.ra & ~mask.out))
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_rlcl(self, m, dut):
+        m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+        # rldicl and rldcl
+
+        m.d.comb += Assume(~dut.i.ctx.op.is_signed)
+        m.d.comb += Assume(dut.i.ra == 0)
+
+        m.submodules.mask = mask = Mask()
+        m.d.comb += mask.end.eq(63)
+        mb = dut.fields.FormMD.mb[:]
+        m.d.comb += mask.start.eq(Cat(mb[1:6], mb[0]))
+
+        rot = Signal(64)
+        m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+
+        expected = Signal(64)
+        m.d.comb += expected.eq(rot & mask.out)
+
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_rlcr(self, m, dut):
+        m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+        # rldicr and rldcr
+
+        m.d.comb += Assume(~dut.i.ctx.op.is_signed)
+        m.d.comb += Assume(dut.i.ra == 0)
+
+        m.submodules.mask = mask = Mask()
+        m.d.comb += mask.start.eq(0)
+        me = dut.fields.FormMD.me[:]
+        m.d.comb += mask.end.eq(Cat(me[1:6], me[0]))
+
+        rot = Signal(64)
+        m.d.comb += rot.eq(rotl64(dut.i.rs, dut.i.rb[:6]))
+
+        expected = Signal(64)
+        m.d.comb += expected.eq(rot & mask.out)
+
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_extswsli(self, m, dut):
+        m.d.comb += Assume(dut.i.ra == 0)
+        m.d.comb += Assume(dut.i.rb[6:] == 0)
+        m.d.comb += Assume(~dut.i.ctx.op.is_32bit)  # all instrs. are 64-bit
+        expected = Signal(64)
+        m.d.comb += expected.eq((dut.i.rs[0:32].as_signed() << dut.i.rb[:6]))
+        m.d.comb += Assert(dut.o.o.data == expected)
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    def _check_ternlog(self, m, dut):
+        lut = dut.fields.FormTLI.TLI[:]
+        for i in range(64):
+            idx = Cat(dut.i.rb[i], dut.i.ra[i], dut.i.rc[i])
+            for j in range(8):
+                with m.If(j == idx):
+                    m.d.comb += Assert(dut.o.o.data[i] == lut[j])
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    # grev removed -- leaving code for later use in grevlut
+    def _check_grev32(self, m, dut):
+        m.d.comb += Assume(dut.i.ctx.op.is_32bit)
+        # assert zero-extended
+        m.d.comb += Assert(dut.o.o.data[32:] == 0)
+        i = Signal(5)
+        m.d.comb += eq_any_const(i)
+        idx = dut.i.rb[0: 5] ^ i
+        m.d.comb += Assert((dut.o.o.data >> i)[0] == (dut.i.ra >> idx)[0])
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
+    # grev removed -- leaving code for later use in grevlut
+    def _check_grev64(self, m, dut):
+        m.d.comb += Assume(~dut.i.ctx.op.is_32bit)
+        i = Signal(6)
+        m.d.comb += eq_any_const(i)
+        idx = dut.i.rb[0: 6] ^ i
+        m.d.comb += Assert((dut.o.o.data >> i)[0] == (dut.i.ra >> idx)[0])
+        m.d.comb += Assert(dut.o.xer_ca.data == 0)
+
 
 class ALUTestCase(FHDLTestCase):
-    def test_formal(self):
-        module = Driver()
+    def run_it(self, which):
+        module = Driver(which)
         self.assertFormal(module, mode="bmc", depth=2)
         self.assertFormal(module, mode="cover", depth=2)
-    def test_ilang(self):
-        dut = Driver()
-        vl = rtlil.convert(dut, ports=[])
-        with open("main_stage.il", "w") as f:
-            f.write(vl)
+
+    def test_none(self):
+        self.run_it(None)
+
+    def test_shl(self):
+        self.run_it(TstOp.SHL)
+
+    def test_shr(self):
+        self.run_it(TstOp.SHR)
+
+    def test_rlc32(self):
+        self.run_it(TstOp.RLC32)
+
+    def test_rlc64(self):
+        self.run_it(TstOp.RLC64)
+
+    def test_rlcl(self):
+        self.run_it(TstOp.RLCL)
+
+    def test_rlcr(self):
+        self.run_it(TstOp.RLCR)
+
+    def test_extswsli(self):
+        self.run_it(TstOp.EXTSWSLI)
+
+    def test_ternlog(self):
+        self.run_it(TstOp.TERNLOG)
+
+    @unittest.skip("grev removed -- leaving code for later use in grevlut")
+    def test_grev32(self):
+        self.run_it(TstOp.GREV32)
+
+    @unittest.skip("grev removed -- leaving code for later use in grevlut")
+    def test_grev64(self):
+        self.run_it(TstOp.GREV64)
+
+
+# check that all test cases are covered
+for i in TstOp:
+    assert callable(getattr(ALUTestCase, f"test_{i.name.lower()}"))
 
 
 if __name__ == '__main__':
index 0be12d1b2fd08a9a90456fd81eac606c4f0117bc..2735927839b73d1d8f13f9713f9c9f1fe3b00192 100644 (file)
@@ -8,9 +8,10 @@
 # output stage
 from nmigen import (Module, Signal, Cat, Repl, Mux, Const)
 from nmutil.pipemodbase import PipeModBase
+from soc.fu.pipe_data import get_pspec_draft_bitmanip
 from soc.fu.shift_rot.pipe_data import (ShiftRotOutputData,
-                                       ShiftRotInputData)
-from ieee754.part.partsig import PartitionedSignal
+                                        ShiftRotInputData)
+from nmutil.lut import BitwiseLut
 from openpower.decoder.power_enums import MicrOp
 from soc.fu.shift_rot.rotator import Rotator
 
@@ -21,6 +22,7 @@ from openpower.decoder.power_fieldsn import SignalBitRange
 class ShiftRotMainStage(PipeModBase):
     def __init__(self, pspec):
         super().__init__(pspec, "main")
+        self.draft_bitmanip = get_pspec_draft_bitmanip(pspec)
         self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
         self.fields.create_specs()
 
@@ -31,11 +33,20 @@ class ShiftRotMainStage(PipeModBase):
         return ShiftRotOutputData(self.pspec)
 
     def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
         m = Module()
         comb = m.d.comb
         op = self.i.ctx.op
         o = self.o.o
 
+        bitwise_lut = None
+        if self.draft_bitmanip:
+            bitwise_lut = BitwiseLut(input_count=3, width=XLEN)
+            m.submodules.bitwise_lut = bitwise_lut
+            comb += bitwise_lut.inputs[0].eq(self.i.rb)
+            comb += bitwise_lut.inputs[1].eq(self.i.ra)
+            comb += bitwise_lut.inputs[2].eq(self.i.rc)
+
         # NOTE: the sh field immediate is read in by PowerDecode2
         # (actually DecodeRB), whereupon by way of rb "immediate" mode
         # it ends up in self.i.rb.
@@ -51,32 +62,22 @@ class ShiftRotMainStage(PipeModBase):
         comb += mb_extra.eq(md_fields['mb'][0:-1][0])
 
         # set up microwatt rotator module
-        m.submodules.rotator = rotator = Rotator()
+        m.submodules.rotator = rotator = Rotator(XLEN)
         comb += [
             rotator.me.eq(me),
             rotator.mb.eq(mb),
             rotator.mb_extra.eq(mb_extra),
             rotator.rs.eq(self.i.rs),
             rotator.ra.eq(self.i.a),
-            rotator.shift.eq(self.i.rb), # can also be sh (in immediate mode)
+            rotator.shift.eq(self.i.rb),  # can also be sh (in immediate mode)
             rotator.is_32bit.eq(op.is_32bit),
             rotator.arith.eq(op.is_signed),
         ]
 
-        comb += o.ok.eq(1) # defaults to enabled
+        comb += o.ok.eq(1)  # defaults to enabled
 
         # instruction rotate type
         mode = Signal(4, reset_less=True)
-        with m.Switch(op.insn_type):
-            with m.Case(MicrOp.OP_SHL):  comb += mode.eq(0b0000) # L-shift
-            with m.Case(MicrOp.OP_SHR):  comb += mode.eq(0b0001) # R-shift
-            with m.Case(MicrOp.OP_RLC):  comb += mode.eq(0b0110) # clear LR
-            with m.Case(MicrOp.OP_RLCL): comb += mode.eq(0b0010) # clear L
-            with m.Case(MicrOp.OP_RLCR): comb += mode.eq(0b0100) # clear R
-            with m.Case(MicrOp.OP_EXTSWSLI): comb += mode.eq(0b1000) # L-ext
-            with m.Default():
-                comb += o.ok.eq(0) # otherwise disable
-
         comb += Cat(rotator.right_shift,
                     rotator.clear_left,
                     rotator.clear_right,
@@ -86,6 +87,29 @@ class ShiftRotMainStage(PipeModBase):
         comb += [o.data.eq(rotator.result_o),
                  self.o.xer_ca.data.eq(Repl(rotator.carry_out_o, 2))]
 
+        with m.Switch(op.insn_type):
+            with m.Case(MicrOp.OP_SHL):
+                comb += mode.eq(0b0000)  # L-shift
+            with m.Case(MicrOp.OP_SHR):
+                comb += mode.eq(0b0001)  # R-shift
+            with m.Case(MicrOp.OP_RLC):
+                comb += mode.eq(0b0110)  # clear LR
+            with m.Case(MicrOp.OP_RLCL):
+                comb += mode.eq(0b0010)  # clear L
+            with m.Case(MicrOp.OP_RLCR):
+                comb += mode.eq(0b0100)  # clear R
+            with m.Case(MicrOp.OP_EXTSWSLI):
+                comb += mode.eq(0b1000)  # L-ext
+            if self.draft_bitmanip:
+                with m.Case(MicrOp.OP_TERNLOG):
+                    # TODO: this only works for ternlogi, change to get lut
+                    # value from register when we implement other variants
+                    comb += bitwise_lut.lut.eq(self.fields.FormTLI.TLI[:])
+                    comb += o.data.eq(bitwise_lut.output)
+                    comb += self.o.xer_ca.data.eq(0)
+            with m.Default():
+                comb += o.ok.eq(0)  # otherwise disable
+
         ###### sticky overflow and context, both pass-through #####
 
         comb += self.o.xer_so.data.eq(self.i.xer_so)
index fd2336dda5f50c9aed5da5558d7f2eb419dab265..d783d017ed3851ea2dbb55b2f56b2e20d2ef66eb 100644 (file)
@@ -4,43 +4,52 @@ from soc.fu.alu.pipe_data import ALUOutputData
 
 
 class ShiftRotInputData(FUBaseData):
-    regspec = [('INT', 'ra', '0:63'),      # RA
-               ('INT', 'rb', '0:63'),      # RB
-               ('INT', 'rc', '0:63'),      # RS
-               ('XER', 'xer_so', '32'), # XER bit 32: SO
-               ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
     def __init__(self, pspec):
         super().__init__(pspec, False)
         # convenience
         self.a, self.b, self.rs = self.ra, self.rb, self.rc
 
+    @property
+    def regspec(self):
+        return [('INT', 'ra', self.intrange),  # RA
+               ('INT', 'rb', self.intrange),  # RB/immediate
+               ('INT', 'rc', self.intrange),  # RB/immediate
+               ('XER', 'xer_so', '32'), # XER bit 32: SO
+               ('XER', 'xer_ca', '34,45')] # XER bit 34/45: CA/CA32
+
 
 # input to shiftrot final stage (common output)
 class ShiftRotOutputData(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),        # RT
-               ('CR', 'cr_a', '0:3'),
-               ('XER', 'xer_so', '32'),    # bit0: so
-               ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
-               ]
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ('XER', 'xer_so', '32'),    # bit0: so
+               ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
+               ]
+
 
 # output from shiftrot final stage (common output) - note that XER.so
 # is *not* included (the only reason it's in the input is because of CR0)
 class ShiftRotOutputDataFinal(FUBaseData):
-    regspec = [('INT', 'o', '0:63'),        # RT
-               ('CR', 'cr_a', '0:3'),
-               ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
-               ]
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
         self.cr0 = self.cr_a
 
+    @property
+    def regspec(self):
+        return [('INT', 'o', self.intrange),
+               ('CR', 'cr_a', '0:3'),
+               ('XER', 'xer_ca', '34,45'), # XER bit 34/45: CA/CA32
+               ]
+
 
 class ShiftRotPipeSpec(CommonPipeSpec):
-    regspec = (ShiftRotInputData.regspec, ShiftRotOutputDataFinal.regspec)
+    regspecklses = (ShiftRotInputData, ShiftRotOutputDataFinal)
     opsubsetkls = CompSROpSubset
index 80e46038166e89194147b5c6d3a45f818fa417e8..67dc034c61d07a7f37c7ef5a6ec222743abc0221 100644 (file)
@@ -4,11 +4,15 @@ from soc.fu.shift_rot.input_stage import ShiftRotInputStage
 from soc.fu.shift_rot.main_stage import ShiftRotMainStage
 from soc.fu.shift_rot.output_stage import ShiftRotOutputStage
 
-class ShiftRotStages(PipeModBaseChain):
+class ShiftRotStart(PipeModBaseChain):
     def get_chain(self):
         inp = ShiftRotInputStage(self.pspec)
+        return [inp]
+
+class ShiftRotStage(PipeModBaseChain):
+    def get_chain(self):
         main = ShiftRotMainStage(self.pspec)
-        return [inp, main]
+        return [main]
 
 
 class ShiftRotStageEnd(PipeModBaseChain):
@@ -21,13 +25,15 @@ class ShiftRotBasePipe(ControlBase):
     def __init__(self, pspec):
         ControlBase.__init__(self)
         self.pspec = pspec
-        self.pipe1 = ShiftRotStages(pspec)
-        self.pipe2 = ShiftRotStageEnd(pspec)
-        self._eqs = self.connect([self.pipe1, self.pipe2])
+        self.pipe1 = ShiftRotStart(pspec)
+        self.pipe2 = ShiftRotStage(pspec)
+        self.pipe3 = ShiftRotStageEnd(pspec)
+        self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])
 
     def elaborate(self, platform):
         m = ControlBase.elaborate(self, platform)
         m.submodules.pipe1 = self.pipe1
         m.submodules.pipe2 = self.pipe2
+        m.submodules.pipe3 = self.pipe3
         m.d.comb += self._eqs
         return m
index 7c3d811c8fa0402a70d5a3e1551e8ecb83873280..eac042fedcece092fec572ed75dd9759f852728e 100644 (file)
@@ -11,18 +11,18 @@ from nmutil.mask import Mask
 
 
 # note BE bit numbering
-def right_mask(m, mask_begin):
-    ret = Signal(64, name="right_mask", reset_less=True)
-    with m.If(mask_begin <= 64):
-        m.d.comb += ret.eq((1 << (64-mask_begin)) - 1)
+def right_mask(m, mask_begin, width):
+    ret = Signal(width, name="right_mask", reset_less=True)
+    with m.If(mask_begin <= width):
+        m.d.comb += ret.eq((1 << (width-mask_begin)) - 1)
     with m.Else():
         m.d.comb += ret.eq(0)
     return ret
 
 
-def left_mask(m, mask_end):
-    ret = Signal(64, name="left_mask", reset_less=True)
-    m.d.comb += ret.eq(~((1 << (63-mask_end)) - 1))
+def left_mask(m, mask_end, width):
+    ret = Signal(width, name="left_mask", reset_less=True)
+    m.d.comb += ret.eq(~((1 << (width-1-mask_end)) - 1))
     return ret
 
 
@@ -45,14 +45,15 @@ class Rotator(Elaboratable):
         * clear_right = 1 when insn_type is OP_RLC or OP_RLCR
     """
 
-    def __init__(self):
+    def __init__(self, width):
+        self.width = width
         # input
         self.me = Signal(5, reset_less=True)        # ME field
         self.mb = Signal(5, reset_less=True)        # MB field
         # extra bit of mb in MD-form
         self.mb_extra = Signal(1, reset_less=True)
-        self.ra = Signal(64, reset_less=True)       # RA
-        self.rs = Signal(64, reset_less=True)       # RS
+        self.ra = Signal(width, reset_less=True)       # RA
+        self.rs = Signal(width, reset_less=True)       # RS
         self.shift = Signal(7, reset_less=True)     # RB[0:7]
         self.is_32bit = Signal(reset_less=True)
         self.right_shift = Signal(reset_less=True)
@@ -61,10 +62,11 @@ class Rotator(Elaboratable):
         self.clear_right = Signal(reset_less=True)
         self.sign_ext_rs = Signal(reset_less=True)
         # output
-        self.result_o = Signal(64, reset_less=True)
+        self.result_o = Signal(width, reset_less=True)
         self.carry_out_o = Signal(reset_less=True)
 
     def elaborate(self, platform):
+        width = self.width
         m = Module()
         comb = m.d.comb
         ra, rs = self.ra, self.rs
@@ -75,11 +77,11 @@ class Rotator(Elaboratable):
         sh = Signal(7, reset_less=True)
         mb = Signal(7, reset_less=True)
         me = Signal(7, reset_less=True)
-        mr = Signal(64, reset_less=True)
-        ml = Signal(64, reset_less=True)
+        mr = Signal(width, reset_less=True)
+        ml = Signal(width, reset_less=True)
         output_mode = Signal(2, reset_less=True)
         hi32 = Signal(32, reset_less=True)
-        repl32 = Signal(64, reset_less=True)
+        repl32 = Signal(width, reset_less=True)
 
         # First replicate bottom 32 bits to both halves if 32-bit
         with m.If(self.is_32bit):
@@ -88,7 +90,8 @@ class Rotator(Elaboratable):
             # sign-extend bottom 32 bits
             comb += hi32.eq(Repl(rs[31], 32))
         with m.Else():
-            comb += hi32.eq(rs[32:64])
+            if width == 64:
+                comb += hi32.eq(rs[32:64])
         comb += repl32.eq(Cat(rs[0:32], hi32))
 
         shift_signed = Signal(signed(6))
@@ -101,7 +104,7 @@ class Rotator(Elaboratable):
             comb += rot_count.eq(self.shift[0:6])
 
         # ROTL submodule
-        m.submodules.rotl = rotl = ROTL(64)
+        m.submodules.rotl = rotl = ROTL(width)
         comb += rotl.a.eq(repl32)
         comb += rotl.b.eq(rot_count)
         comb += rot.eq(rotl.o)
@@ -139,16 +142,16 @@ class Rotator(Elaboratable):
             comb += me.eq(Cat(~sh[0:6], sh[6]))
 
         # Calculate left and right masks
-        m.submodules.right_mask = right_mask = Mask(64)
-        with m.If(mb <= 64):
-            comb += right_mask.shift.eq(64-mb)
+        m.submodules.right_mask = right_mask = Mask(width)
+        with m.If(mb <= width):
+            comb += right_mask.shift.eq(width-mb)
             comb += mr.eq(right_mask.mask)
         with m.Else():
             comb += mr.eq(0)
         #comb += mr.eq(right_mask(m, mb))
 
-        m.submodules.left_mask = left_mask = Mask(64)
-        comb += left_mask.shift.eq(63-me)
+        m.submodules.left_mask = left_mask = Mask(width)
+        comb += left_mask.shift.eq(width-1-me)
         comb += ml.eq(~left_mask.mask)
         #comb += ml.eq(left_mask(m, me))
 
@@ -159,7 +162,8 @@ class Rotator(Elaboratable):
         # 10 for rldicl, sr[wd]
         # 1z for sra[wd][i], z = 1 if rs is negative
         with m.If((self.clear_left & ~self.clear_right) | self.right_shift):
-            comb += output_mode.eq(Cat(self.arith & repl32[63], Const(1, 1)))
+            comb += output_mode.eq(Cat(self.arith &
+                                       repl32[width-1], Const(1, 1)))
         with m.Else():
             mbgt = self.clear_right & (mb[0:6] > me[0:6])
             comb += output_mode.eq(Cat(mbgt, Const(0, 1)))
@@ -186,7 +190,7 @@ if __name__ == '__main__':
     comb = m.d.comb
     mr = Signal(64)
     mb = Signal(6)
-    comb += mr.eq(left_mask(m, mb))
+    comb += mr.eq(left_mask(m, mb, 64))
 
     def loop():
         for i in range(64):
index 27a1d4c495526b22a92246aa20599c2c7d4a24a9..8898224d5a089ff3039a69f8ea3543998cbee67d 100644 (file)
@@ -3,12 +3,13 @@ from nmigen.back.pysim import Simulator, Delay, Settle
 from nmutil.formaltest import FHDLTestCase
 from nmigen.cli import rtlil
 from soc.fu.shift_rot.maskgen import MaskGen
-from openpower.decoder.helpers import MASK
+from openpower.decoder.helpers import ISACallerHelper
 import random
 import unittest
 
 class MaskGenTestCase(FHDLTestCase):
     def test_maskgen(self):
+        MASK = ISACallerHelper(64, FPSCR=None).MASK
         m = Module()
         comb = m.d.comb
         m.submodules.dut = dut = MaskGen(64)
index ce1b8401d9a25fa32c0c66b795675e661d459c75..cfa1c67492d2f0b7b7b01a0445a3c29f05cbfb66 100644 (file)
@@ -17,6 +17,7 @@ from nmigen import Module, Signal
 from nmutil.sim_tmp_alternative import Simulator, Settle
 
 from openpower.test.shift_rot.shift_rot_cases import ShiftRotTestCase
+from openpower.test.bitmanip.bitmanip_cases import BitManipTestCase
 
 
 def get_cu_inputs(dec2, sim):
@@ -38,7 +39,7 @@ def get_cu_inputs(dec2, sim):
 def set_alu_inputs(alu, dec2, sim):
     # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
     # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
-    # and place it into data_i.b
+    # and place it into i_data.b
 
     inp = yield from get_cu_inputs(dec2, sim)
     yield from ALUHelpers.set_int_ra(alu, dec2, inp)
@@ -70,7 +71,11 @@ def set_alu_inputs(alu, dec2, sim):
 class ShiftRotIlangCase(TestAccumulatorBase):
 
     def case_ilang(self):
-        pspec = ShiftRotPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=pps)
+        pspec.draft_bitmanip = True
         alu = ShiftRotBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("shift_rot_pipeline.il", "w") as f:
@@ -106,20 +111,20 @@ class TestRunner(unittest.TestCase):
             yield from set_alu_inputs(alu, pdecode2, simulator)
 
             # set valid for one cycle, propagate through pipeline...
-            yield alu.p.valid_i.eq(1)
+            yield alu.p.i_valid.eq(1)
             yield
-            yield alu.p.valid_i.eq(0)
+            yield alu.p.i_valid.eq(0)
 
             opname = code.split(' ')[0]
             yield from simulator.call(opname)
             index = simulator.pc.CIA.value//4
 
-            vld = yield alu.n.valid_o
+            vld = yield alu.n.o_valid
             while not vld:
                 yield
-                vld = yield alu.n.valid_o
+                vld = yield alu.n.o_valid
             yield
-            alu_out = yield alu.n.data_o.o.data
+            alu_out = yield alu.n.o_data.o.data
 
             yield from self.check_alu_outputs(alu, pdecode2,
                                               simulator, code)
@@ -136,11 +141,15 @@ class TestRunner(unittest.TestCase):
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(None, opkls, fn_name)
         pdecode = pdecode2.dec
 
-        pspec = ShiftRotPipeSpec(id_wid=2)
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = ShiftRotPipeSpec(id_wid=2, parent_pspec=pps)
+        pspec.draft_bitmanip = True
         m.submodules.alu = alu = ShiftRotBasePipe(pspec)
 
-        comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
-        comb += alu.n.ready_i.eq(1)
+        comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+        comb += alu.n.i_ready.eq(1)
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
 
@@ -174,13 +183,13 @@ class TestRunner(unittest.TestCase):
         yield from ALUHelpers.get_xer_ca(res, alu, dec2)
         yield from ALUHelpers.get_int_o(res, alu, dec2)
 
-        print ("hw outputs", res)
+        print("hw outputs", res)
 
         yield from ALUHelpers.get_sim_int_o(sim_o, sim, dec2)
         yield from ALUHelpers.get_wr_sim_cr_a(sim_o, sim, dec2)
         yield from ALUHelpers.get_wr_sim_xer_ca(sim_o, sim, dec2)
 
-        print ("sim outputs", sim_o)
+        print("sim outputs", sim_o)
 
         ALUHelpers.check_cr_a(self, res, sim_o, "CR%d %s" % (cridx, code))
         ALUHelpers.check_xer_ca(self, res, sim_o, code)
@@ -191,6 +200,7 @@ if __name__ == "__main__":
     unittest.main(exit=False)
     suite = unittest.TestSuite()
     suite.addTest(TestRunner(ShiftRotTestCase().test_data))
+    suite.addTest(TestRunner(BitManipTestCase().test_data))
     suite.addTest(TestRunner(ShiftRotIlangCase().test_data))
 
     runner = unittest.TextTestRunner()
index 1431a0386d595a1252c19c1254d0c050295d748e..db9f86a84ed32947c76101e3dfa270a11685a8f5 100644 (file)
@@ -24,6 +24,8 @@ from openpower.decoder.power_fields import DecodeFields
 from openpower.decoder.power_fieldsn import SignalBitRange
 
 # use POWER numbering. sigh.
+
+
 def xer_bit(name):
     return 63-XER_bits[name]
 
@@ -46,16 +48,16 @@ class Driver(Elaboratable):
             width = p.width
             comb += p.eq(AnyConst(width))
 
-        pspec = SPRPipeSpec(id_wid=2)
+        pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.dut = dut = SPRMainStage(pspec)
 
         # frequently used aliases
         a = dut.i.a
         ca_in = dut.i.xer_ca[0]   # CA carry in
-        ca32_in = dut.i.xer_ca[1] # CA32 carry in 32
+        ca32_in = dut.i.xer_ca[1]  # CA32 carry in 32
         so_in = dut.i.xer_so      # SO sticky overflow
         ov_in = dut.i.xer_ov[0]   # XER OV in
-        ov32_in = dut.i.xer_ov[1] # XER OV32 in
+        ov32_in = dut.i.xer_ov[1]  # XER OV32 in
         o = dut.o.o
 
         # setup random inputs
@@ -71,8 +73,8 @@ class Driver(Elaboratable):
         comb += dut.i.ctx.op.eq(rec)
 
         # check that the operation (op) is passed through (and muxid)
-        comb += Assert(dut.o.ctx.op == dut.i.ctx.op )
-        comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid )
+        comb += Assert(dut.o.ctx.op == dut.i.ctx.op)
+        comb += Assert(dut.o.ctx.muxid == dut.i.ctx.muxid)
 
         # MTSPR
         fields = DecodeFields(SignalBitRange, [dut.i.ctx.op.insn])
index 6d9d13a6b85985d456da76347c6ebcda69f98dd9..b3a49cb642e9509732eaa3763599180b718a41f9 100644 (file)
@@ -19,7 +19,7 @@ class SPRMainStage(PipeModBase):
         super().__init__(pspec, "spr_main")
         # test if regfiles are reduced
         self.regreduce_en = (hasattr(pspec, "regreduce") and
-                                            (pspec.regreduce == True))
+                             (pspec.regreduce == True))
 
         self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
         self.fields.create_specs()
@@ -44,6 +44,7 @@ class SPRMainStage(PipeModBase):
         so_i, ov_i, ca_i = self.i.xer_so, self.i.xer_ov, self.i.xer_ca
         so_o, ov_o, ca_o = self.o.xer_so, self.o.xer_ov, self.o.xer_ca
         o, spr1_o, fast1_o = self.o.o, self.o.spr1, self.o.fast1
+        state1_i, state1_o = self.i.state1, self.o.state1
 
         # take copy of D-Form TO field
         x_fields = self.fields.FormXFX
@@ -55,9 +56,18 @@ class SPRMainStage(PipeModBase):
             #### MTSPR ####
             with m.Case(MicrOp.OP_MTSPR):
                 with m.Switch(spr):
-                    # fast SPRs first
+                    # State SPRs first, note that this triggers a regfile write
+                    # which is monitored right the way down in TestIssuerBase.
+                    with m.Case(SPR.DEC, SPR.TB):
+                        comb += state1_o.data.eq(a_i)
+                        comb += state1_o.ok.eq(1)
+
+                    # Fast SPRs second: anything in FAST regs
                     with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0,
-                                SPR.SRR1, SPR.XER, SPR.DEC):
+                                SPR.SRR1, SPR.XER, SPR.HSRR0, SPR.HSRR1,
+                                SPR.SPRG0_priv, SPR.SPRG1_priv,
+                                SPR.SPRG2_priv, SPR.SPRG3,
+                                SPR.HSPRG0, SPR.HSPRG1, SPR.SVSRR0):
                         comb += fast1_o.data.eq(a_i)
                         comb += fast1_o.ok.eq(1)
                         # XER is constructed
@@ -83,15 +93,25 @@ class SPRMainStage(PipeModBase):
             with m.Case(MicrOp.OP_MFSPR):
                 comb += o.ok.eq(1)
                 with m.Switch(spr):
-                    # fast SPRs first
-                    with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0, SPR.SRR1,
-                                SPR.XER, SPR.DEC, SPR.TB):
+                    # state SPRs first
+                    with m.Case(SPR.DEC, SPR.TB):
+                        comb += o.data.eq(state1_i)
+                    # TBU is upper 32-bits of State Reg
+                    with m.Case(SPR.TBU):
+                        comb += o.data[0:32].eq(state1_i[32:64])
+
+                    # fast SPRs second
+                    with m.Case(SPR.CTR, SPR.LR, SPR.TAR, SPR.SRR0,
+                                SPR.SRR1, SPR.XER, SPR.HSRR0, SPR.HSRR1,
+                                SPR.SPRG0_priv, SPR.SPRG1_priv,
+                                SPR.SPRG2_priv, SPR.SPRG3,
+                                SPR.HSPRG0, SPR.HSPRG1, SPR.SVSRR0):
                         comb += o.data.eq(fast1_i)
                         with m.If(spr == SPR.XER):
                             # bits 0:31 and 35:43 are treated as reserved
                             # and return 0s when read using mfxer
                             comb += o[32:64].eq(0)       # MBS0 bits 0-31
-                            comb += o[63-43:64-35].eq(0) # MSB0 bits 35-43
+                            comb += o[63-43:64-35].eq(0)  # MSB0 bits 35-43
                             # sticky
                             comb += o[63-XER_bits['SO']].eq(so_i)
                             # overflow
@@ -100,9 +120,6 @@ class SPRMainStage(PipeModBase):
                             # carry
                             comb += o[63-XER_bits['CA']].eq(ca_i[0])
                             comb += o[63-XER_bits['CA32']].eq(ca_i[1])
-                    with m.Case(SPR.TBU):
-                        comb += o.data[0:32].eq(fast1_i[32:64])
-
                     # slow SPRs TODO
                     with m.Default():
                         comb += o.data.eq(spr1_i)
index bd0ed97e4e0a2dc4165d8b3e942d6d4575badc84..21db95827ccc53d3f5d67454ef13f5881aff7d51 100644 (file)
@@ -19,6 +19,7 @@ class SPRInputData(FUBaseData):
     regspec = [('INT', 'ra', '0:63'),        # RA
                ('SPR', 'spr1', '0:63'),      # SPR (slow)
                ('FAST', 'fast1', '0:63'),    # SPR (fast: LR, CTR etc)
+               ('STATE', 'state1', '0:63'),  # SPR (DEC/TB)
                ('XER', 'xer_so', '32'),      # XER bit 32: SO
                ('XER', 'xer_ov', '33,44'),   # XER bit 34/45: CA/CA32
                ('XER', 'xer_ca', '34,45')]   # bit0: ov, bit1: ov32
@@ -27,11 +28,16 @@ class SPRInputData(FUBaseData):
         # convenience
         self.a = self.ra
 
+# note that state1 gets a corresponding "state1" write port created
+# by core.py which is "monitored" by TestIssuerBase (hack-job, sigh).
+# when writes are spotted then the DEC/TB FSM resets and re-reads
+# DEC/TB.
 
 class SPROutputData(FUBaseData):
     regspec = [('INT', 'o', '0:63'),        # RT
                ('SPR', 'spr1', '0:63'),     # SPR (slow)
                ('FAST', 'fast1', '0:63'),   # SPR (fast: LR, CTR etc)
+               ('STATE', 'state1', '0:63'), # SPR (DEC/TB)
                ('XER', 'xer_so', '32'),     # XER bit 32: SO
                ('XER', 'xer_ov', '33,44'),  # XER bit 34/45: CA/CA32
                ('XER', 'xer_ca', '34,45')]  # bit0: ov, bit1: ov32
@@ -40,5 +46,5 @@ class SPROutputData(FUBaseData):
 
 
 class SPRPipeSpec(CommonPipeSpec):
-    regspec = (SPRInputData.regspec, SPROutputData.regspec)
+    regspecklses = (SPRInputData, SPROutputData)
     opsubsetkls = CompSPROpSubset
index 8dc13da5c78877a5a0530a6603a3f12483b40b01..894212bcb68549221fc8119fdfbc999e165e4b35 100644 (file)
@@ -46,7 +46,7 @@ def get_cu_inputs(dec2, sim):
 def set_alu_inputs(alu, dec2, sim):
     # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
     # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
-    # and place it into data_i.b
+    # and place it into i_data.b
 
     inp = yield from get_cu_inputs(dec2, sim)
     yield from ALUHelpers.set_int_ra(alu, dec2, inp)
@@ -61,7 +61,7 @@ def set_alu_inputs(alu, dec2, sim):
 
 class SPRIlangCase(TestAccumulatorBase):
     def case_ilang(self):
-        pspec = SPRPipeSpec(id_wid=2)
+        pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
         alu = SPRBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("trap_pipeline.il", "w") as f:
@@ -122,10 +122,10 @@ class TestRunner(unittest.TestCase):
             index = pc//4
             print("pc after %08x" % (pc))
 
-            vld = yield alu.n.valid_o
+            vld = yield alu.n.o_valid
             while not vld:
                 yield
-                vld = yield alu.n.valid_o
+                vld = yield alu.n.o_valid
             yield
 
             yield from self.check_alu_outputs(alu, pdecode2, sim, code)
@@ -139,12 +139,12 @@ class TestRunner(unittest.TestCase):
 
         m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
 
-        pspec = SPRPipeSpec(id_wid=2)
+        pspec = SPRPipeSpec(id_wid=2, parent_pspec=None)
         m.submodules.alu = alu = SPRBasePipe(pspec)
 
-        comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
-        comb += alu.p.valid_i.eq(1)
-        comb += alu.n.ready_i.eq(1)
+        comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+        comb += alu.p.i_valid.eq(1)
+        comb += alu.n.i_ready.eq(1)
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
 
index 235df615a896928e43c036520acd46e6049858bf..b94f7e732d255cc5aa1a063e012c9edd354a1c79 100644 (file)
@@ -37,7 +37,7 @@ class Driver(Elaboratable):
         comb = m.d.comb
 
         rec = CompTrapOpSubset()
-        pspec = TrapPipeSpec(id_wid=2)
+        pspec = TrapPipeSpec(id_wid=2, parent_pspec=None)
 
         m.submodules.dut = dut = TrapMainStage(pspec)
 
@@ -202,7 +202,7 @@ class Driver(Elaboratable):
             ###################
 
             with m.Case(MicrOp.OP_MTMSRD):
-                msr_od = msr_o.data # another "shortener"
+                msr_od = msr_o.data  # another "shortener"
 
                 with m.If(L == 0):
                     # if (MSR[29:31] != 0b010) | (SRR1[29:31] != 0b000) then
@@ -216,7 +216,7 @@ class Driver(Elaboratable):
                     # MSR[48] <- (RS)[48] | (RS)[49]
                     # MSR[58] <- (RS)[58] | (RS)[49]
                     # MSR[59] <- (RS)[59] | (RS)[49]
-                    PR = field(rs, 49) # alias/copy of SRR1 PR field
+                    PR = field(rs, 49)  # alias/copy of SRR1 PR field
                     comb += [
                         Assert(field(msr_od, 48) == field(rs, 48) | PR),
                         Assert(field(msr_od, 58) == field(rs, 58) | PR),
@@ -263,7 +263,7 @@ class Driver(Elaboratable):
             # RFID.  v3.0B p955
             ###################
             with m.Case(MicrOp.OP_RFID):
-                msr_od = msr_o.data # another "shortener"
+                msr_od = msr_o.data  # another "shortener"
                 comb += [
                     Assert(msr_o.ok),
                     Assert(nia_o.ok),
@@ -280,7 +280,7 @@ class Driver(Elaboratable):
 
                 # if (MSR[29:31] != 0b010) | (SRR1[29:31] != 0b000) then
                 #     MSR[29:31] <- SRR1[29:31]
-                with m.If((field(msr_i , 29, 31) != 0b010) |
+                with m.If((field(msr_i, 29, 31) != 0b010) |
                           (field(srr1_i, 29, 31) != 0b000)):
                     comb += Assert(F(msr_od, 29, 31) == F(srr1_i, 29, 31))
                 with m.Else():
@@ -290,7 +290,7 @@ class Driver(Elaboratable):
                 # MSR[48] <- (RS)[48] | (RS)[49]
                 # MSR[58] <- (RS)[58] | (RS)[49]
                 # MSR[59] <- (RS)[59] | (RS)[49]
-                PR = field(srr1_i, 49) # alias/copy of SRR1 PR field
+                PR = field(srr1_i, 49)  # alias/copy of SRR1 PR field
                 comb += [
                     Assert(field(msr_od, 48) == field(srr1_i, 48) | PR),
                     Assert(field(msr_od, 58) == field(srr1_i, 58) | PR),
@@ -373,4 +373,3 @@ class TrapMainStageTestCase(FHDLTestCase):
 
 if __name__ == '__main__':
     unittest.main()
-
index c597b75e7e01f57375ff20d2fd05cb1f6e8c686e..8127e226e34afaf5cf87489bb86fa04a39a544e1 100644 (file)
@@ -24,7 +24,8 @@ from openpower.consts import MSR, PI, TT, field, field_slice
 
 
 def msr_copy(msr_o, msr_i, zero_me=True):
-    """msr_copy
+    """msr_copy (also used to copy relevant bits into SRR1)
+
     ISA says this:
     Defined MSR bits are classified as either full func tion or partial
     function. Full function MSR bits are saved in SRR1 or HSRR1 when
@@ -42,11 +43,11 @@ def msr_copy(msr_o, msr_i, zero_me=True):
     return l
 
 
-def msr_check_pr(m, msr):
+def msr_check_pr(m, d_in, msr):
     """msr_check_pr: checks "problem state"
     """
     comb = m.d.comb
-    with m.If(msr[MSR.PR]):
+    with m.If(d_in[MSR.PR]):
         comb += msr[MSR.EE].eq(1) # set external interrupt bit
         comb += msr[MSR.IR].eq(1) # set instruction relocation bit
         comb += msr[MSR.DR].eq(1) # set data relocation bit
@@ -57,6 +58,8 @@ class TrapMainStage(PipeModBase):
         super().__init__(pspec, "main")
         self.fields = DecodeFields(SignalBitRange, [self.i.ctx.op.insn])
         self.fields.create_specs()
+        self.kaivb = Signal(64) # KAIVB SPR
+        self.state_reset = Signal() # raise high to reset KAIVB cache
 
     def trap(self, m, trap_addr, return_addr):
         """trap.  sets new PC, stores MSR and old PC in SRR1 and SRR0
@@ -65,19 +68,33 @@ class TrapMainStage(PipeModBase):
         op = self.i.ctx.op
         msr_i = op.msr
         svstate_i = op.svstate
+
+        exc = LDSTException("trapexc")
+        comb += exc.eq(op.ldst_exc)
+        srr1_i = exc.srr1 # new SRR1 bits come from exception
         nia_o = self.o.nia
         svsrr0_o, srr0_o, srr1_o = self.o.svsrr0, self.o.srr0, self.o.srr1
 
-        # trap address
+        # trap address, including KAIVB override
         comb += nia_o.data.eq(trap_addr)
+        comb += nia_o.data[13:].eq(self.kaivb[13:])
         comb += nia_o.ok.eq(1)
 
         # addr to begin from on return
         comb += srr0_o.data.eq(return_addr)
         comb += srr0_o.ok.eq(1)
 
-        # take a copy of the current MSR into SRR1
-        comb += msr_copy(srr1_o.data, msr_i) # old MSR
+        # take a copy of the current MSR into SRR1, but first copy old SRR1
+        # this preserves the bits of SRR1 that are not supposed to change:
+        # MSR.IR,DR,PMM,RI,LE (0-5) and MR,FP,ME,FE0 (11-14)
+        # i would suggest reading v3.0C p1063 Book III section 7.2.1 for
+        # advice but it's so obscure and indirect, that it's just easier
+        # to copy microwatt behaviour.  see writeback.vhdl
+        # IMPORTANT: PowerDecoder2 needed to actually read SRR1 for
+        # it to have the contents *of* SRR1 to copy over!
+        comb += msr_copy(srr1_o.data, msr_i, False)  # old MSR
+        comb += srr1_o.data[16:22].eq(srr1_i[0:6])   # IR,DR,PMM,RI,LE
+        comb += srr1_o.data[27:31].eq(srr1_i[11:15]) # MR,FP,ME,FE0
         comb += srr1_o.ok.eq(1)
 
         # take a copy of the current SVSTATE into SVSRR0
@@ -125,7 +142,7 @@ class TrapMainStage(PipeModBase):
 
     def elaborate(self, platform):
         m = Module()
-        comb = m.d.comb
+        comb, sync = m.d.comb, m.d.sync
         op = self.i.ctx.op
 
         # convenience variables
@@ -137,6 +154,10 @@ class TrapMainStage(PipeModBase):
         srr0_o, srr1_o, svsrr0_o = self.o.srr0, self.o.srr1, self.o.svsrr0
         traptype, trapaddr = op.traptype, op.trapaddr
 
+        # hard reset of KAIVB
+        with m.If(self.state_reset):
+            sync += self.kaivb.eq(0)
+
         # take copy of D-Form TO field
         i_fields = self.fields.FormD
         to = Signal(i_fields.TO[0:-1].shape())
@@ -187,6 +208,16 @@ class TrapMainStage(PipeModBase):
         # TODO: some #defines for the bits n stuff.
         with m.Switch(op.insn_type):
 
+            ##############
+            # KAIVB https://bugs.libre-soc.org/show_bug.cgi?id=859
+
+            with m.Case(MicrOp.OP_MTSPR):
+                sync += self.kaivb.eq(a_i)
+
+            with m.Case(MicrOp.OP_MFSPR):
+                comb += o.data.eq(self.kaivb)
+                comb += o.ok.eq(1)
+
             ###############
             # TDI/TWI/TD/TW.  v3.0B p90-91
 
@@ -204,7 +235,10 @@ class TrapMainStage(PipeModBase):
                         comb += srr1_o.data[PI.FP].eq(1)
                     with m.If(traptype & TT.ADDR):
                         comb += srr1_o.data[PI.ADR].eq(1)
-                    with m.If(traptype & TT.MEMEXC):
+                    with m.If((traptype & TT.MEMEXC).bool() &
+                              (trapaddr == 0x400)):
+                        # Instruction Storage Interrupt (ISI - 0x400)
+                        #           v3.0C Book III Chap 7.5.5 p1085
                         # decode exception bits, store in SRR1
                         exc = LDSTException("trapexc")
                         comb += exc.eq(op.ldst_exc)
@@ -233,9 +267,10 @@ class TrapMainStage(PipeModBase):
             # MTMSR/D.  v3.0B p TODO - move to MSR
 
             with m.Case(MicrOp.OP_MTMSRD, MicrOp.OP_MTMSR):
-                L = self.fields.FormX.L[0:-1] # X-Form field L
+                # L => bit 16 in LSB0, bit 15 in MSB0 order
+                L = self.fields.FormX.L1[0:1] # X-Form field L1
                 # start with copy of msr
-                comb += msr_o.eq(msr_i)
+                comb += msr_o.data.eq(msr_i)
                 with m.If(L):
                     # just update RI..EE
                     comb += msr_o.data[MSR.RI].eq(a_i[MSR.RI])
@@ -257,7 +292,8 @@ class TrapMainStage(PipeModBase):
                         # mtmsr - 32-bit, only room for bottom 32 LSB flags
                         for stt, end in [(1,12), (13, 32)]:
                             comb += msr_o.data[stt:end].eq(a_i[stt:end])
-                    msr_check_pr(m, msr_o.data)
+                    # check problem state: if set, not permitted to set EE,IR,DR
+                    msr_check_pr(m, a_i, msr_o.data)
 
                 # Per https://bugs.libre-soc.org/show_bug.cgi?id=325#c123,
                 # this actually *is* in the microwatt code now.
@@ -265,9 +301,13 @@ class TrapMainStage(PipeModBase):
                 # hypervisor stuff.  here: bits 3 (HV) and 51 (ME) were
                 # copied over by msr_copy but if HV was not set we need
                 # the *original* (msr_i) bits
-                with m.If(~msr_i[MSR.HV]):
-                    comb += msr_o.data[MSR.HV].eq(msr_i[MSR.HV])
-                    comb += msr_o.data[MSR.ME].eq(msr_i[MSR.ME])
+                # XXX taking this out to see what happens when running
+                # linux-5.7 microwatt buildroot.  microwatt does not
+                # implement HV, so this is unlikely to work.  0x900
+                # linux kernel exception handling tends to support this
+                # with m.If(~msr_i[MSR.HV]):
+                #     comb += msr_o.data[MSR.HV].eq(msr_i[MSR.HV])
+                #     comb += msr_o.data[MSR.ME].eq(msr_i[MSR.ME])
 
                 comb += msr_o.ok.eq(1)
 
@@ -295,14 +335,18 @@ class TrapMainStage(PipeModBase):
                 # MSR was in srr1: copy it over, however *caveats below*
                 comb += msr_copy(msr_o.data, srr1_i, zero_me=False) # don't zero
 
-                with m.If(~self.i.ctx.op.insn[9]): # XXX BAD HACK! (hrfid)
-                    with m.If(field(msr_i, 3)): # HV
-                        comb += field(msr_o, 51).eq(field(srr1_i, 51)) # ME
-                    with m.Else():
-                        comb += field(msr_o, 51).eq(field(msr_i, 51)) # ME
-
-                # check problem state
-                msr_check_pr(m, msr_o.data)
+                if False: # XXX no - not doing hypervisor yet
+                    with m.If(~self.i.ctx.op.insn[9]): # XXX BAD HACK! (hrfid)
+                        with m.If(field(msr_i, 3)): # HV
+                            comb += field(msr_o.data, 51).eq(field(srr1_i, 51)) # ME
+                        with m.Else():
+                            comb += field(msr_o.data, 51).eq(field(msr_i, 51)) # ME
+                else:
+                    # same as microwatt: treat MSR.ME rfid same as hrfid
+                    comb += field(msr_o.data, 51).eq(field(srr1_i, 51)) # ME
+
+                # check problem state: if set, not permitted to set EE,IR,DR
+                msr_check_pr(m, srr1_i, msr_o.data)
 
                 # don't understand but it's in the spec.  again: bits 32-34
                 # are copied from srr1_i and need *restoring* to msr_i
index 44c63654f6b01fdb080189f30bfdbe2ba460808d..b9c829bccc1811a1e7e334aba22fc3b09e7a907d 100644 (file)
@@ -27,7 +27,7 @@ class TrapOutputData(FUBaseData):
                # ... however we *do* need to *write* MSR, NIA, SVSTATE (RFID)
                ('STATE', 'nia', '0:63'),  # NIA (Next PC)
                ('STATE', 'msr', '0:63'),  # MSR
-               ('STATE', 'svstate', '0:31')]  # SVSTATE
+               ('STATE', 'svstate', '0:63')]  # SVSTATE
     def __init__(self, pspec):
         super().__init__(pspec, True)
         # convenience
@@ -36,5 +36,5 @@ class TrapOutputData(FUBaseData):
 
 
 class TrapPipeSpec(CommonPipeSpec):
-    regspec = (TrapInputData.regspec, TrapOutputData.regspec)
+    regspecklses = (TrapInputData, TrapOutputData)
     opsubsetkls = CompTrapOpSubset
index 26b12ff68b11f51b2a876ae2d17f7c286722424e..dff1f4139db5b9c78fdaeb962dfffb75b8ade168 100644 (file)
@@ -49,7 +49,7 @@ def get_cu_inputs(dec2, sim):
 def set_alu_inputs(alu, dec2, sim):
     # TODO: see https://bugs.libre-soc.org/show_bug.cgi?id=305#c43
     # detect the immediate here (with m.If(self.i.ctx.op.imm_data.imm_ok))
-    # and place it into data_i.b
+    # and place it into i_data.b
 
     inp = yield from get_cu_inputs(dec2, sim)
     yield from ALUHelpers.set_int_ra(alu, dec2, inp)
@@ -66,7 +66,7 @@ def set_alu_inputs(alu, dec2, sim):
 class TrapIlangCase(TestAccumulatorBase):
 
     def case_ilang(self):
-        pspec = TrapPipeSpec(id_wid=2)
+        pspec = TrapPipeSpec(id_wid=2, parent_pspec=None)
         alu = TrapBasePipe(pspec)
         vl = rtlil.convert(alu, ports=alu.ports())
         with open("trap_pipeline.il", "w") as f:
@@ -74,82 +74,98 @@ class TrapIlangCase(TestAccumulatorBase):
 
 
 class TestRunner(unittest.TestCase):
-    def __init__(self, test_data):
-        super().__init__("run_all")
-        self.test_data = test_data
 
-    def run_all(self):
+    def execute(self, alu, instruction, pdecode2, test):
+        program = test.program
+        sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
+                  test.mem, test.msr,
+                  bigendian=bigendian)
+        gen = program.generate_instructions()
+        instructions = list(zip(gen, program.assembly.splitlines()))
+
+        msr = sim.msr.value
+        pc = sim.pc.CIA.value
+        print("starting msr, pc %08x, %08x" % (msr, pc))
+        index = pc//4
+        while index < len(instructions):
+            ins, code = instructions[index]
+
+            print("pc %08x msr %08x instr: %08x" % (pc, msr, ins))
+            print(code)
+            if 'XER' in sim.spr:
+                so = 1 if sim.spr['XER'][XER_bits['SO']] else 0
+                ov = 1 if sim.spr['XER'][XER_bits['OV']] else 0
+                ov32 = 1 if sim.spr['XER'][XER_bits['OV32']] else 0
+                print("before: so/ov/32", so, ov, ov32)
+
+            # ask the decoder to decode this binary data (endian'd)
+            yield pdecode2.dec.bigendian.eq(bigendian)  # l/big?
+            yield pdecode2.state.msr.eq(msr)  # set MSR in pdecode2
+            yield pdecode2.state.pc.eq(pc)  # set CIA in pdecode2
+            yield instruction.eq(ins)          # raw binary instr.
+            yield Settle()
+            fn_unit = yield pdecode2.e.do.fn_unit
+            asmcode = yield pdecode2.e.asmcode
+            dec_asmcode = yield pdecode2.dec.op.asmcode
+            print("asmcode", asmcode, dec_asmcode)
+            self.assertEqual(fn_unit, Function.TRAP.value)
+            alu_o = yield from set_alu_inputs(alu, pdecode2, sim)
+
+            # set valid for one cycle, propagate through pipeline...
+            yield alu.p.i_valid.eq(1)
+            yield
+            yield alu.p.i_valid.eq(0)
+
+            opname = code.split(' ')[0]
+            yield from sim.call(opname)
+            pc = sim.pc.CIA.value
+            index = pc//4
+            print("pc after %08x" % (pc))
+            msr = sim.msr.value
+            print("msr after %08x" % (msr))
+
+            vld = yield alu.n.o_valid
+            while not vld:
+                yield
+                vld = yield alu.n.o_valid
+            yield
+
+            yield from self.check_alu_outputs(alu, pdecode2, sim, code)
+            yield Settle()
+
+    def test_it(self):
+        test_data = TrapTestCase().test_data
         m = Module()
         comb = m.d.comb
         instruction = Signal(32)
 
-        pdecode = create_pdecode()
-
-        m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
+        fn_name = "TRAP"
+        opkls = TrapPipeSpec.opsubsetkls
 
-        pspec = TrapPipeSpec(id_wid=2)
+        pdecode = create_pdecode()
+        m.submodules.pdecode2 = pdecode2 = PowerDecode2(
+            pdecode, opkls, fn_name)
+        pdecode = pdecode2.dec
+
+        class PPspec:
+            XLEN = 64
+        pps = PPspec()
+        pspec = TrapPipeSpec(id_wid=2, parent_pspec=pps)
         m.submodules.alu = alu = TrapBasePipe(pspec)
 
-        comb += alu.p.data_i.ctx.op.eq_from_execute1(pdecode2.do)
-        comb += alu.p.valid_i.eq(1)
-        comb += alu.n.ready_i.eq(1)
+        comb += alu.p.i_data.ctx.op.eq_from_execute1(pdecode2.do)
+        comb += alu.n.i_ready.eq(1)
         comb += pdecode2.dec.raw_opcode_in.eq(instruction)
         sim = Simulator(m)
 
         sim.add_clock(1e-6)
 
         def process():
-            for test in self.test_data:
+            for test in test_data:
                 print(test.name)
                 program = test.program
                 with self.subTest(test.name):
-                    sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
-                              test.mem, test.msr,
-                              bigendian=bigendian)
-                    gen = program.generate_instructions()
-                    instructions = list(zip(gen, program.assembly.splitlines()))
-
-                    msr = sim.msr.value
-                    pc = sim.pc.CIA.value
-                    print("starting msr, pc %08x, %08x" % (msr, pc))
-                    index = pc//4
-                    while index < len(instructions):
-                        ins, code = instructions[index]
-
-                        print("pc %08x msr %08x instr: %08x" % (pc, msr, ins))
-                        print(code)
-                        if 'XER' in sim.spr:
-                            so = 1 if sim.spr['XER'][XER_bits['SO']] else 0
-                            ov = 1 if sim.spr['XER'][XER_bits['OV']] else 0
-                            ov32 = 1 if sim.spr['XER'][XER_bits['OV32']] else 0
-                            print("before: so/ov/32", so, ov, ov32)
-
-                        # ask the decoder to decode this binary data (endian'd)
-                        yield pdecode2.dec.bigendian.eq(bigendian)  # l/big?
-                        yield pdecode2.state.msr.eq(msr)  # set MSR in pdecode2
-                        yield pdecode2.state.pc.eq(pc)  # set CIA in pdecode2
-                        yield instruction.eq(ins)          # raw binary instr.
-                        yield Settle()
-                        fn_unit = yield pdecode2.e.do.fn_unit
-                        self.assertEqual(fn_unit, Function.TRAP.value)
-                        alu_o = yield from set_alu_inputs(alu, pdecode2, sim)
-                        yield
-                        opname = code.split(' ')[0]
-                        yield from sim.call(opname)
-                        pc = sim.pc.CIA.value
-                        index = pc//4
-                        print("pc after %08x" % (pc))
-                        msr = sim.msr.value
-                        print("msr after %08x" % (msr))
-
-                        vld = yield alu.n.valid_o
-                        while not vld:
-                            yield
-                            vld = yield alu.n.valid_o
-                        yield
-
-                        yield from self.check_alu_outputs(alu, pdecode2,
-                                                          sim, code)
+                    yield from self.execute(alu, instruction, pdecode2, test)
 
         sim.add_sync_process(process)
         with sim.write_vcd("alu_simulator.vcd", "simulator.gtkw",
@@ -158,14 +174,6 @@ class TestRunner(unittest.TestCase):
 
     def check_alu_outputs(self, alu, dec2, sim, code):
 
-        rc = yield dec2.e.do.rc.data
-        cridx_ok = yield dec2.e.write_cr.ok
-        cridx = yield dec2.e.write_cr.data
-
-        print("check extra output", repr(code), cridx_ok, cridx)
-        if rc:
-            self.assertEqual(cridx, 0, code)
-
         sim_o = {}
         res = {}
 
@@ -196,10 +204,4 @@ class TestRunner(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(exit=False)
-    suite = unittest.TestSuite()
-    suite.addTest(TestRunner(TrapTestCase().test_data))
-    suite.addTest(TestRunner(TrapIlangCase().test_data))
-
-    runner = unittest.TextTestRunner()
-    runner.run(suite)
+    unittest.main()
index 4d3d66e8d8980af2ea55b6a32ae7b80d90ba0aba..107bc0f4c7e8d5f5f0275f7062c0681cf531eb2c 100644 (file)
@@ -16,11 +16,11 @@ class CompTrapOpSubset(CompOpSubsetBase):
                   ('insn', 32),
                   ('msr', 64),     # from core.state
                   ('cia', 64),     # likewise
-                  ('svstate', 32), # likewise
+                  ('svstate', 64), # likewise
                   ('is_32bit', 1),
                   ('traptype', TT.size), # see trap main_stage.py, PowerDecoder2
                   ('trapaddr', 13),
-                  ('ldst_exc', len(LDSTException._exc_types)),
+                  ('ldst_exc', LDSTException.length), # blech
                   ]
 
         super().__init__(layout, name=name)
index ede33a1b03913307f33ce8080652d315c0403ce9..a5ed8f0d7338a0fcbb70cceb7eac417dd235c804 100644 (file)
@@ -16,6 +16,9 @@
 # highest priority interrupt currently presented (which is allowed
 # via XICS)
 #
+# Bugreports:
+#
+# * https://bugs.libre-soc.org/show_bug.cgi?id=407
 """
 from nmigen import Elaboratable, Module, Signal, Cat, Const, Record, Array, Mux
 from nmutil.iocontrol import RecordObject
@@ -72,9 +75,10 @@ def bswap(v):
 
 class XICS_ICP(Elaboratable):
 
-    def __init__(self):
-        class Spec: pass
-        spec = Spec()
+    def __init__(self, spec=None):
+        if spec is None:
+            class Spec: pass
+            spec = Spec()
         spec.addr_wid = 30
         spec.mask_wid = 4
         spec.reg_wid = 32
@@ -223,12 +227,13 @@ class Xive(RecordObject):
 
 
 class XICS_ICS(Elaboratable):
-    def __init__(self, SRC_NUM=16, PRIO_BITS=8):
+    def __init__(self, spec=None, SRC_NUM=16, PRIO_BITS=8):
         self.SRC_NUM = SRC_NUM
         self.PRIO_BITS = PRIO_BITS
         self.pri_masked = (1<<self.PRIO_BITS)-1
-        class Spec: pass
-        spec = Spec()
+        if spec is None:
+            class Spec: pass
+            spec = Spec()
         spec.addr_wid = 30
         spec.mask_wid = 4
         spec.reg_wid = 32
index 6efd2e59703f6f0747435f97030e8a463233457f..0f03df1546c8cf6ab91ef63b04713dca768a84c4 160000 (submodule)
@@ -1 +1 @@
-Subproject commit 6efd2e59703f6f0747435f97030e8a463233457f
+Subproject commit 0f03df1546c8cf6ab91ef63b04713dca768a84c4
index a1f14b3d5dfbeafcee830b86a0fc4192920b0b41..1f2629a255816397e6f510b866d3118e99625ac7 100644 (file)
@@ -23,9 +23,9 @@ class FetchUnitInterface:
         # inputs: address to fetch PC, and valid/stall signalling
         self.a_pc_i = Signal(self.addr_wid)
         self.a_stall_i = Signal()
-        self.a_valid_i = Signal()
+        self.a_i_valid = Signal()
         self.f_stall_i = Signal()
-        self.f_valid_i = Signal()
+        self.f_i_valid = Signal()
 
         # outputs: instruction (or error), and busy indicators
         self.a_busy_o = Signal()
@@ -45,9 +45,9 @@ class FetchUnitInterface:
     def __iter__(self):
         yield self.a_pc_i
         yield self.a_stall_i
-        yield self.a_valid_i
+        yield self.a_i_valid
         yield self.f_stall_i
-        yield self.f_valid_i
+        yield self.f_i_valid
         yield self.a_busy_o
         yield self.f_busy_o
         yield self.f_instr_o
@@ -68,14 +68,14 @@ class BareFetchUnit(FetchUnitInterface, Elaboratable):
 
             ibus_rdata = Signal.like(self.ibus.dat_r)
             with m.If(self.ibus.cyc):
-                with m.If(self.ibus.ack | self.ibus.err | ~self.f_valid_i):
+                with m.If(self.ibus.ack | self.ibus.err | ~self.f_i_valid):
                     m.d.sync += [
                         self.ibus.cyc.eq(0),
                         self.ibus.stb.eq(0),
                         self.ibus.sel.eq(0),
                         ibus_rdata.eq(self.ibus.dat_r)
                     ]
-            with m.Elif(self.a_valid_i & ~self.a_stall_i):
+            with m.Elif(self.a_i_valid & ~self.a_stall_i):
                 m.d.sync += [
                     self.ibus.adr.eq(self.a_pc_i[self.adr_lsbs:]),
                     self.ibus.cyc.eq(1),
@@ -153,11 +153,11 @@ class CachedFetchUnit(FetchUnitInterface, Elaboratable):
             icache.s1_addr.eq(self.a_pc_i[self.adr_lsbs:]),
             icache.s1_flush.eq(self.a_flush),
             icache.s1_stall.eq(self.a_stall_i),
-            icache.s1_valid.eq(self.a_valid_i & a_icache_select),
+            icache.s1_valid.eq(self.a_i_valid & a_icache_select),
             icache.s2_addr.eq(self.f_pc[self.adr_lsbs:]),
             icache.s2_re.eq(Const(1)),
             icache.s2_evict.eq(Const(0)),
-            icache.s2_valid.eq(self.f_valid_i & f_icache_select)
+            icache.s2_valid.eq(self.f_i_valid & f_icache_select)
         ]
 
         iba = WishboneArbiter(self.pspec)
@@ -180,14 +180,14 @@ class CachedFetchUnit(FetchUnitInterface, Elaboratable):
         bare_port = iba.port(priority=1)
         bare_rdata = Signal.like(bare_port.dat_r)
         with m.If(bare_port.cyc):
-            with m.If(bare_port.ack | bare_port.err | ~self.f_valid_i):
+            with m.If(bare_port.ack | bare_port.err | ~self.f_i_valid):
                 m.d.sync += [
                     bare_port.cyc.eq(0),
                     bare_port.stb.eq(0),
                     bare_port.sel.eq(0),
                     bare_rdata.eq(bare_port.dat_r)
                 ]
-        with m.Elif(~a_icache_select & self.a_valid_i & ~self.a_stall_i):
+        with m.Elif(~a_icache_select & self.a_i_valid & ~self.a_stall_i):
             m.d.sync += [
                 bare_port.cyc.eq(1),
                 bare_port.stb.eq(1),
index e9830c7a643e2364fbea48a6842bc1e1e28059c2..363c44ff3dcf394e392f5829c1b23a9cc612ad51 100644 (file)
@@ -54,11 +54,11 @@ class LoadStoreUnitInterface:
         self.x_st_data_i = Signal(data_wid)  # The data to write when storing
 
         self.x_stall_i = Signal()           # do nothing until low
-        self.x_valid_i = Signal()           # Whether x pipeline stage is
+        self.x_i_valid = Signal()           # Whether x pipeline stage is
         # currently enabled (I
         # think?). Set to 1 for #now
         self.m_stall_i = Signal()           # do nothing until low
-        self.m_valid_i = Signal()           # Whether m pipeline stage is
+        self.m_i_valid = Signal()           # Whether m pipeline stage is
         # currently enabled. Set
         # to 1 for now
 
@@ -67,7 +67,7 @@ class LoadStoreUnitInterface:
         self.m_busy_o = Signal()            # set when the memory is busy
 
         self.m_ld_data_o = Signal(data_wid)  # Data returned from memory read
-        # Data validity is NOT indicated by m_valid_i or x_valid_i as
+        # Data validity is NOT indicated by m_i_valid or x_i_valid as
         # those are inputs. I believe it is valid on the next cycle
         # after raising m_load where busy is low
 
@@ -84,9 +84,9 @@ class LoadStoreUnitInterface:
         yield self.x_st_data_i
 
         yield self.x_stall_i
-        yield self.x_valid_i
+        yield self.x_i_valid
         yield self.m_stall_i
-        yield self.m_valid_i
+        yield self.m_i_valid
         yield self.x_busy_o
         yield self.m_busy_o
         yield self.m_ld_data_o
@@ -111,7 +111,7 @@ class BareLoadStoreUnit(LoadStoreUnitInterface, Elaboratable):
         with m.If(self.jtag_en): # for safety, JTAG can completely disable WB
 
             with m.If(self.dbus.cyc):
-                with m.If(self.dbus.ack | self.dbus.err | ~self.m_valid_i):
+                with m.If(self.dbus.ack | self.dbus.err | ~self.m_i_valid):
                     m.d.sync += [
                         self.dbus.cyc.eq(0),
                         self.dbus.stb.eq(0),
@@ -119,7 +119,7 @@ class BareLoadStoreUnit(LoadStoreUnitInterface, Elaboratable):
                         self.m_ld_data_o.eq(self.dbus.dat_r)
                     ]
             with m.Elif((self.x_ld_i | self.x_st_i) &
-                        self.x_valid_i & ~self.x_stall_i):
+                        self.x_i_valid & ~self.x_stall_i):
                 m.d.sync += [
                     self.dbus.cyc.eq(1),
                     self.dbus.stb.eq(1),
@@ -207,11 +207,11 @@ class CachedLoadStoreUnit(LoadStoreUnitInterface, Elaboratable):
             dcache.s1_addr.eq(self.x_addr_i[self.adr_lsbs:]),
             dcache.s1_flush.eq(self.x_flush),
             dcache.s1_stall.eq(self.x_stall_i),
-            dcache.s1_valid.eq(self.x_valid_i & x_dcache_select),
+            dcache.s1_valid.eq(self.x_i_valid & x_dcache_select),
             dcache.s2_addr.eq(m_addr[self.adr_lsbs:]),
             dcache.s2_re.eq(self.m_load),
             dcache.s2_evict.eq(self.m_store),
-            dcache.s2_valid.eq(self.m_valid_i & m_dcache_select)
+            dcache.s2_valid.eq(self.m_i_valid & m_dcache_select)
         ]
 
         wrbuf_w_data = Record([("addr", self.addr_wid-self.adr_lsbs),
@@ -225,7 +225,7 @@ class CachedLoadStoreUnit(LoadStoreUnitInterface, Elaboratable):
             wrbuf_w_data.addr.eq(self.x_addr_i[self.adr_lsbs:]),
             wrbuf_w_data.mask.eq(self.x_mask_i),
             wrbuf_w_data.data.eq(self.x_st_data_i),
-            wrbuf.w_en.eq(self.x_st_i & self.x_valid_i &
+            wrbuf.w_en.eq(self.x_st_i & self.x_i_valid &
                           x_dcache_select & ~self.x_stall_i),
             wrbuf_r_data.eq(wrbuf.r_data),
         ]
@@ -267,14 +267,14 @@ class CachedLoadStoreUnit(LoadStoreUnitInterface, Elaboratable):
         bare_port = dba.port(priority=2)
         bare_rdata = Signal.like(bare_port.dat_r)
         with m.If(bare_port.cyc):
-            with m.If(bare_port.ack | bare_port.err | ~self.m_valid_i):
+            with m.If(bare_port.ack | bare_port.err | ~self.m_i_valid):
                 m.d.sync += [
                     bare_port.cyc.eq(0),
                     bare_port.stb.eq(0),
                     bare_rdata.eq(bare_port.dat_r)
                 ]
         with m.Elif((self.x_ld_i | self.x_st_i) &
-                    ~x_dcache_select & self.x_valid_i & ~self.x_stall_i):
+                    ~x_dcache_select & self.x_i_valid & ~self.x_stall_i):
             m.d.sync += [
                 bare_port.cyc.eq(1),
                 bare_port.stb.eq(1),
index f84a01ccc2f3b98f57ffe16c37cb5a2c206cf24b..f249d5547330b632d0499f7917edca5dc15e259b 100644 (file)
@@ -18,6 +18,12 @@ def make_wb_layout(spec, cti=True):
     addr_wid, mask_wid, data_wid = spec.addr_wid, spec.mask_wid, spec.reg_wid
     adr_lsbs = log2_int(mask_wid) # LSBs of addr covered by mask
     badwid = spec.addr_wid-adr_lsbs    # MSBs (not covered by mask)
+    # test if microwatt compatibility is to be enabled
+    microwatt_compat = (hasattr(spec, "microwatt_compat") and
+                               (spec.microwatt_compat == True))
+    # test if fabric compatibility is to be enabled
+    fabric_compat = (hasattr(spec, "fabric_compat") and
+                               (spec.fabric_compat == True))
 
     res = [
     ("adr",   badwid  , DIR_FANOUT),
@@ -30,6 +36,9 @@ def make_wb_layout(spec, cti=True):
     ("we",            1, DIR_FANOUT),
     ("err",           1, DIR_FANIN)
     ]
+    # microwatt needs a stall signal (operates in pipeline mode)
+    if microwatt_compat or fabric_compat:
+        res.append(("stall", 1, DIR_FANIN))
     if not cti:
         return res
     return res + [
index 710fc268d7e578e4cd3fbdb7731c5c4dc61aa515..2427a680a94ad5f7dac71b013579dba05bfea27c 100644 (file)
@@ -42,21 +42,22 @@ class Register(Elaboratable):
 
     def read_port(self, name=None):
         port = RecordObject([("ren", 1),
-                             ("data_o", self.width)],
+                             ("o_data", self.width)],
                             name=name)
         self._rdports.append(port)
         return port
 
     def write_port(self, name=None):
         port = RecordObject([("wen", 1),
-                             ("data_i", self.width)],
+                             ("i_data", self.width)],
                             name=name)
         self._wrports.append(port)
         return port
 
     def elaborate(self, platform):
         m = Module()
-        self.reg = reg = Signal(self.width, name="reg", reset=self.reset)
+        self.reg = reg = Signal(self.width, name="reg", reset=self.reset,
+                                attrs={'syn_ramstyle': "block_ram"})
 
         if self.synced:
             domain = m.d.sync
@@ -65,24 +66,24 @@ class Register(Elaboratable):
 
         # read ports. has write-through detection (returns data written)
         for rp in self._rdports:
-            domain += rp.data_o.eq(0)
+            domain += rp.o_data.eq(0)
             with m.If(rp.ren):
                 if self.writethru:
                     wr_detect = Signal(reset_less=False)
                     m.d.comb += wr_detect.eq(0)
                     for wp in self._wrports:
                         with m.If(wp.wen):
-                            domain += rp.data_o.eq(wp.data_i)
+                            domain += rp.o_data.eq(wp.i_data)
                             m.d.comb += wr_detect.eq(1)
                     with m.If(~wr_detect):
-                        domain += rp.data_o.eq(reg)
+                        domain += rp.o_data.eq(reg)
                 else:
-                    domain += rp.data_o.eq(reg)
+                    domain += rp.o_data.eq(reg)
 
         # write ports, delayed by 1 cycle
         for wp in self._wrports:
             with m.If(wp.wen):
-                m.d.sync += reg.eq(wp.data_i)
+                m.d.sync += reg.eq(wp.i_data)
 
         return m
 
@@ -96,7 +97,7 @@ class Register(Elaboratable):
         res = list(self)
 
 
-def ortreereduce(tree, attr="data_o"):
+def ortreereduce(tree, attr="o_data"):
     return treereduce(tree, operator.or_, lambda x: getattr(x, attr))
 
 
@@ -107,13 +108,17 @@ class RegFileArray(Elaboratable):
         and read-en signals (per port).
     """
 
-    def __init__(self, width, depth, synced=True, fwd_bus_mode=True):
+    def __init__(self, width, depth, synced=True, fwd_bus_mode=True,
+                                     resets=None):
+        if resets is None:
+            resets = [0] * depth
         self.synced = synced
         self.width = width
         self.depth = depth
         self.regs = Array(Register(width, synced=synced,
-                                   writethru=fwd_bus_mode) \
-                          for _ in range(self.depth))
+                                   writethru=fwd_bus_mode,
+                                   resetval=rst) \
+                          for rst in resets)
         self._rdports = []
         self._wrports = []
 
@@ -135,7 +140,7 @@ class RegFileArray(Elaboratable):
         regs = self.read_reg_port(name)
         regs = Array(regs)
         port = RecordObject([("ren", self.depth),
-                             ("data_o", self.width)], name)
+                             ("o_data", self.width)], name)
         self._rdports.append((regs, port))
         return port
 
@@ -143,7 +148,7 @@ class RegFileArray(Elaboratable):
         regs = self.write_reg_port(name)
         regs = Array(regs)
         port = RecordObject([("wen", self.depth),
-                             ("data_i", self.width)])
+                             ("i_data", self.width)])
         self._wrports.append((regs, port))
         return port
 
@@ -171,13 +176,13 @@ class RegFileArray(Elaboratable):
                 ren_delay = Signal.like(p.ren)
                 m.d.sync += ren_delay.eq(p.ren)
                 with m.If(ren_delay):
-                    m.d.comb += p.data_o.eq(ror)
+                    m.d.comb += p.o_data.eq(ror)
             else:
-                m.d.comb += p.data_o.eq(ror)
+                m.d.comb += p.o_data.eq(ror)
         for (regs, p) in self._wrports:
             m.d.comb += self._get_en_sig(regs, 'wen').eq(p.wen)
             for r in regs:
-                m.d.comb += r.data_i.eq(p.data_i)
+                m.d.comb += r.i_data.eq(p.i_data)
 
         return m
 
@@ -195,7 +200,8 @@ class RegFileMem(Elaboratable):
         self.fwd_bus_mode = fwd_bus_mode
         self.synced = synced
         self.width, self.depth = width, depth
-        self.memory = Memory(width=width, depth=depth)
+        self.memory = Memory(width=width, depth=depth,
+                             attrs={'syn_ramstyle': "block_ram"})
         self._rdports = {}
         self._wrports = {}
 
@@ -203,7 +209,7 @@ class RegFileMem(Elaboratable):
         bsz = log2_int(self.depth, False)
         port = RecordObject([("addr", bsz),
                              ("ren", 1),
-                             ("data_o", self.width)], name=name)
+                             ("o_data", self.width)], name=name)
         if self.synced:
             domain = "sync"
         else:
@@ -215,7 +221,7 @@ class RegFileMem(Elaboratable):
         bsz = log2_int(self.depth, False)
         port = RecordObject([("addr", bsz),
                              ("wen", 1),
-                             ("data_i", self.width)], name=name)
+                             ("i_data", self.width)], name=name)
         self._wrports[name] = (port, self.memory.write_port())
         return port
 
@@ -235,25 +241,25 @@ class RegFileMem(Elaboratable):
                         addrmatch = Signal(reset_less=False)
                         m.d.comb += addrmatch.eq(wp.addr == rp.addr)
                         with m.If(wp.wen & addrmatch):
-                            m.d.comb += rp.data_o.eq(wp.data_i)
+                            m.d.comb += rp.o_data.eq(wp.i_data)
                             m.d.comb += wr_detect.eq(1)
                     with m.If(~wr_detect):
-                        m.d.comb += rp.data_o.eq(rport.data)
+                        m.d.comb += rp.o_data.eq(rport.data)
             else:
                 if self.synced:
                     ren_delay = Signal.like(rp.ren)
                     m.d.sync += ren_delay.eq(rp.ren)
                     with m.If(ren_delay):
-                        m.d.comb += rp.data_o.eq(rport.data)
+                        m.d.comb += rp.o_data.eq(rport.data)
                 else:
-                    m.d.comb += rp.data_o.eq(rport.data)
+                    m.d.comb += rp.o_data.eq(rport.data)
 
         # write ports, delayed by one cycle (in the memory itself)
         for name, (port, wp) in self._wrports.items():
             setattr(m.submodules, "wp_"+name, wp)
             comb += wp.addr.eq(port.addr)
             comb += wp.en.eq(port.wen)
-            comb += wp.data.eq(port.data_i)
+            comb += wp.data.eq(port.i_data)
 
         return m
 
@@ -270,7 +276,7 @@ class RegFile(Elaboratable):
         bsz = int(log(self.width) / log(2))
         port = RecordObject([("addr", bsz),
                              ("ren", 1),
-                             ("data_o", self.width)], name=name)
+                             ("o_data", self.width)], name=name)
         self._rdports.append(port)
         return port
 
@@ -278,14 +284,16 @@ class RegFile(Elaboratable):
         bsz = int(log(self.width) / log(2))
         port = RecordObject([("addr", bsz),
                              ("wen", 1),
-                             ("data_i", self.width)], name=name)
+                             ("i_data", self.width)], name=name)
         self._wrports.append(port)
         return port
 
     def elaborate(self, platform):
         m = Module()
         bsz = int(log(self.width) / log(2))
-        regs = Array(Signal(self.width, name="reg") for _ in range(self.depth))
+        regs = Array(Signal(self.width, name="reg",
+                            attrs={'syn_ramstyle': "block_ram"}) \
+                    for _ in range(self.depth))
 
         # read ports. has write-through detection (returns data written)
         for rp in self._rdports:
@@ -296,15 +304,15 @@ class RegFile(Elaboratable):
                     addrmatch = Signal(reset_less=False)
                     m.d.comb += addrmatch.eq(wp.addr == rp.addr)
                     with m.If(wp.wen & addrmatch):
-                        m.d.comb += rp.data_o.eq(wp.data_i)
+                        m.d.comb += rp.o_data.eq(wp.i_data)
                         m.d.comb += wr_detect.eq(1)
                 with m.If(~wr_detect):
-                    m.d.comb += rp.data_o.eq(regs[rp.addr])
+                    m.d.comb += rp.o_data.eq(regs[rp.addr])
 
         # write ports, delayed by one cycle
         for wp in self._wrports:
             with m.If(wp.wen):
-                m.d.sync += regs[wp.addr].eq(wp.data_i)
+                m.d.sync += regs[wp.addr].eq(wp.i_data)
 
         return m
 
@@ -323,7 +331,7 @@ class RegFile(Elaboratable):
 
 def regfile_sim(dut, rp, wp):
     yield wp.addr.eq(1)
-    yield wp.data_i.eq(2)
+    yield wp.i_data.eq(2)
     yield wp.wen.eq(1)
     yield
     yield wp.wen.eq(0)
@@ -333,13 +341,13 @@ def regfile_sim(dut, rp, wp):
     yield rp.ren.eq(1)
     yield rp.addr.eq(1)
     yield Settle()
-    data = yield rp.data_o
+    data = yield rp.o_data
     print(data)
     yield
-    data = yield rp.data_o
+    data = yield rp.o_data
     print(data)
     yield
-    data2 = yield rp.data_o
+    data2 = yield rp.o_data
     print(data2)
     assert data == 2
     yield
@@ -348,32 +356,32 @@ def regfile_sim(dut, rp, wp):
     yield rp.addr.eq(5)
     yield rp.ren.eq(1)
     yield wp.wen.eq(1)
-    yield wp.data_i.eq(6)
+    yield wp.i_data.eq(6)
     yield
-    data = yield rp.data_o
+    data = yield rp.o_data
     print(data)
     assert data == 6
     yield
     yield wp.wen.eq(0)
     yield rp.ren.eq(0)
     yield
-    data = yield rp.data_o
+    data = yield rp.o_data
     print(data)
     assert data == 0
     yield
-    data = yield rp.data_o
+    data = yield rp.o_data
     print(data)
 
 
 def regfile_array_sim(dut, rp1, rp2, wp, wp2):
     print("regfile_array_sim")
-    yield wp.data_i.eq(2)
+    yield wp.i_data.eq(2)
     yield wp.wen.eq(1 << 1)
     yield
     yield wp.wen.eq(0)
     yield rp1.ren.eq(1 << 1)
     yield Settle()
-    data = yield rp1.data_o
+    data = yield rp1.o_data
     print(data)
     assert data == 2
     yield
@@ -381,9 +389,9 @@ def regfile_array_sim(dut, rp1, rp2, wp, wp2):
     yield rp1.ren.eq(1 << 5)
     yield rp2.ren.eq(1 << 1)
     yield wp.wen.eq(1 << 5)
-    yield wp.data_i.eq(6)
+    yield wp.i_data.eq(6)
     yield Settle()
-    data = yield rp1.data_o
+    data = yield rp1.o_data
     assert data == 6
     print(data)
     yield
@@ -391,15 +399,15 @@ def regfile_array_sim(dut, rp1, rp2, wp, wp2):
     yield rp1.ren.eq(0)
     yield rp2.ren.eq(0)
     yield Settle()
-    data1 = yield rp1.data_o
+    data1 = yield rp1.o_data
     print(data1)
     assert data1 == 0
-    data2 = yield rp2.data_o
+    data2 = yield rp2.o_data
     print(data2)
     assert data2 == 0
 
     yield
-    data = yield rp1.data_o
+    data = yield rp1.o_data
     print(data)
     assert data == 0
 
index 8f881423e4aedfc38b4f35d78c842aec908cf990..5ef301a8bf8bdbda55ad86131fd5f54ea66b5fe8 100644 (file)
@@ -31,6 +31,28 @@ from openpower.decoder.power_enums import SPRfull, SPRreduced
 # XXX MAKE DAMN SURE TO KEEP THESE UP-TO-DATE if changing/adding regs
 from openpower.consts import StateRegsEnum, XERRegsEnum, FastRegsEnum
 
+from nmigen import Module
+from nmigen.cli import rtlil
+from nmutil.latch import SRLatch
+
+
+def create_ports(rf, wr_spec, rd_spec):
+    """create_ports: creates register file ports based on requested specs
+    """
+    rf.r_ports, rf.w_ports = {}, {}
+    # create read ports based on read specs
+    for key, name in rd_spec.items():
+        if hasattr(rf, name): # some regfiles already have a port
+            rf.r_ports[key] = getattr(rf, name)
+        else:
+            rf.r_ports[key] = rf.read_port(name)
+    # create write ports based on write specs
+    for key, name in wr_spec.items():
+        if hasattr(rf, name): # some regfiles already have a port
+            rf.w_ports[key] = getattr(rf, name)
+        else:
+            rf.w_ports[key] = rf.write_port(name)
+
 
 # "State" Regfile
 class StateRegs(RegFileArray, StateRegsEnum):
@@ -48,17 +70,38 @@ class StateRegs(RegFileArray, StateRegsEnum):
     (d_rd2)
 
     """
-    def __init__(self, svp64_en=False, regreduce_en=False):
-        super().__init__(64, StateRegsEnum.N_REGS)
-        self.w_ports = {'nia': self.write_port("nia"),
-                        'msr': self.write_port("msr"),
-                        'svstate': self.write_port("svstate"),
-                        'sv': self.write_port("sv"), # writing SVSTATE (issuer)
-                        'd_wr1': self.write_port("d_wr1")} # writing PC (issuer)
-        self.r_ports = {'cia': self.read_port("cia"), # reading PC (issuer)
-                        'msr': self.read_port("msr"), # reading MSR (issuer)
-                        'sv': self.read_port("sv"), # reading SV (issuer)
+    def __init__(self, svp64_en=False, regreduce_en=False, resets=None):
+        super().__init__(64, StateRegsEnum.N_REGS, resets=resets)
+        wr_spec, rd_spec = self.get_port_specs()
+        create_ports(self, wr_spec, rd_spec)
+
+    def get_port_specs(self):
+        w_port_spec = { # these 3 allow writing state by Function Units
+                        # strictly speaking this should not be allowed,
+                        # the information should be passed back to Issuer
+                        # to work out what to do
+                        'nia': "nia",
+                        'msr': "msr",
+                        'svstate': "svstate",
+                        'issue': "issue", # writing DEC/TB
+                        'state1': "state1", # SPR pipeline
+                        # these 3 allow writing state by Issuer
+                        'sv': "sv", # writing SVSTATE
+                        'd_wr1': "d_wr1", # writing PC
+                        'd_wr2': "d_wr2"} # writing MSR
+        r_port_spec = { # these are for reading state by Issuer but
+                        # the FUs do not read them: they are passed in
+                        # because of multi-issue / pipelining / etc.
+                        # the state could be totally different and is
+                        # only known *at* issue time, *by* the issuer
+                        'cia': "cia", # reading PC (issuer)
+                        'msr': "msr", # reading MSR (issuer)
+                        'sv': "sv", # reading SV (issuer)
+                        # SPR and DEC/TB FSM
+                        'issue': "issue", # reading DEC/TB
+                        'state1': "state1", # SPR pipeline
                         }
+        return w_port_spec, r_port_spec
 
 
 # Integer Regfile
@@ -70,28 +113,35 @@ class IntRegs(RegFileMem): #class IntRegs(RegFileArray):
     * Array-based unary-indexed (not binary-indexed)
     * write-through capability (read on same cycle as write)
     """
-    def __init__(self, svp64_en=False, regreduce_en=False):
-        super().__init__(64, 32, fwd_bus_mode=not regreduce_en)
-        self.w_ports = {'o': self.write_port("dest1"),
+    def __init__(self, svp64_en=False, regreduce_en=False, reg_wid=64):
+        super().__init__(reg_wid, 32, fwd_bus_mode=False)
+        self.svp64_en = svp64_en
+        self.regreduce_en = regreduce_en
+        wr_spec, rd_spec = self.get_port_specs()
+        create_ports(self, wr_spec, rd_spec)
+
+    def get_port_specs(self):
+        w_port_spec = {'o': "dest1",
                         }
-        self.r_ports = {
-                        'dmi': self.read_port("dmi")} # needed for Debug (DMI)
-        if svp64_en:
-            self.r_ports['pred'] = self.read_port("pred") # for predicate mask
-        if not regreduce_en:
-            self.w_ports['o1'] = self.write_port("dest2") # (LD/ST update)
-            self.r_ports['ra'] = self.read_port("src1")
-            self.r_ports['rb'] = self.read_port("src2")
-            self.r_ports['rc'] = self.read_port("src3")
+        r_port_spec = { 'dmi': "dmi" # needed for Debug (DMI)
+                      }
+        if self.svp64_en:
+            r_port_spec['pred'] = "pred" # for predicate mask
+        if not self.regreduce_en:
+            w_port_spec['o1'] = "dest2" # (LD/ST update)
+            r_port_spec['ra'] = "src1"
+            r_port_spec['rb'] = "src2"
+            r_port_spec['rc'] = "src3"
         else:
-            self.r_ports['rabc'] = self.read_port("src1")
+            r_port_spec['rabc'] = "src1"
+        return w_port_spec, r_port_spec
 
 
 # Fast SPRs Regfile
 class FastRegs(RegFileMem, FastRegsEnum): #RegFileArray):
     """FastRegs
 
-    FAST regfile  - CTR, LR, TAR, SRR1, SRR2, XER, TB, DEC, SVSRR0
+    FAST regfile  - CTR, LR, TAR, SRR1, SRR2, XER, SVSRR0
 
     * QTY 6of 64-bit registers
     * 3R2W
@@ -101,15 +151,25 @@ class FastRegs(RegFileMem, FastRegsEnum): #RegFileArray):
     Note: r/w issue are used by issuer to increment/decrement TB/DEC.
     """
     def __init__(self, svp64_en=False, regreduce_en=False):
-        super().__init__(64, FastRegsEnum.N_REGS, fwd_bus_mode=not regreduce_en)
-        self.w_ports = {'fast1': self.write_port("dest1"),
-                        'issue': self.write_port("issue"), # writing DEC/TB
+        super().__init__(64, FastRegsEnum.N_REGS, fwd_bus_mode=False)
+        self.svp64_en = svp64_en
+        self.regreduce_en = regreduce_en
+        wr_spec, rd_spec = self.get_port_specs()
+        create_ports(self, wr_spec, rd_spec)
+
+    def get_port_specs(self):
+        w_port_spec = {'fast1': "dest1",
                        }
-        self.r_ports = {'fast1': self.read_port("src1"),
-                        'issue': self.read_port("issue"), # reading DEC/TB
+        r_port_spec = {'fast1': "src1",
+                        'dmi': "dmi" # needed for Debug (DMI)
                         }
-        if not regreduce_en:
-            self.r_ports['fast2'] = self.read_port("src2")
+        if not self.regreduce_en:
+            r_port_spec['fast2'] = "src2"
+            r_port_spec['fast3'] = "src3"
+            w_port_spec['fast2'] = "dest2"
+            w_port_spec['fast3'] = "dest3"
+
+        return w_port_spec, r_port_spec
 
 
 # CR Regfile
@@ -123,16 +183,24 @@ class CRRegs(VirtualRegPort):
     """
     def __init__(self, svp64_en=False, regreduce_en=False):
         super().__init__(32, 8, rd2=True)
-        self.w_ports = {'full_cr': self.full_wr, # 32-bit (masked, 8-en lines)
-                        'cr_a': self.write_port("dest1"), # 4-bit, unary-indexed
-                        'cr_b': self.write_port("dest2")} # 4-bit, unary-indexed
-        self.r_ports = {'full_cr': self.full_rd, # 32-bit (masked, 8-en lines)
-                        'full_cr_dbg': self.full_rd2, # for DMI
-                        'cr_a': self.read_port("src1"),
-                        'cr_b': self.read_port("src2"),
-                        'cr_c': self.read_port("src3")}
-        if svp64_en:
-            self.r_ports['cr_pred'] = self.read_port("cr_pred") # for predicate
+        self.svp64_en = svp64_en
+        self.regreduce_en = regreduce_en
+        wr_spec, rd_spec = self.get_port_specs()
+        create_ports(self, wr_spec, rd_spec)
+
+    def get_port_specs(self):
+        w_port_spec = {'full_cr': "full_wr", # 32-bit (masked, 8-en lines)
+                        'cr_a': "dest1", # 4-bit, unary-indexed
+                        'cr_b': "dest2"} # 4-bit, unary-indexed
+        r_port_spec = {'full_cr': "full_rd", # 32-bit (masked, 8-en lines)
+                        'full_cr_dbg': "full_rd2", # for DMI
+                        'cr_a': "src1",
+                        'cr_b': "src2",
+                        'cr_c': "src3"}
+        if self.svp64_en:
+            r_port_spec['cr_pred'] = "cr_pred" # for predicate
+
+        return w_port_spec, r_port_spec
 
 
 # XER Regfile
@@ -149,14 +217,21 @@ class XERRegs(VirtualRegPort, XERRegsEnum):
     OV=2 # OV and OV32
     def __init__(self, svp64_en=False, regreduce_en=False):
         super().__init__(6, XERRegsEnum.N_REGS)
-        self.w_ports = {'full_xer': self.full_wr, # 6-bit (masked, 3-en lines)
-                        'xer_so': self.write_port("dest1"),
-                        'xer_ca': self.write_port("dest2"),
-                        'xer_ov': self.write_port("dest3")}
-        self.r_ports = {'full_xer': self.full_rd, # 6-bit (masked, 3-en lines)
-                        'xer_so': self.read_port("src1"),
-                        'xer_ca': self.read_port("src2"),
-                        'xer_ov': self.read_port("src3")}
+        self.svp64_en = svp64_en
+        self.regreduce_en = regreduce_en
+        wr_spec, rd_spec = self.get_port_specs()
+        create_ports(self, wr_spec, rd_spec)
+
+    def get_port_specs(self):
+        w_port_spec = {'full_xer': "full_wr", # 6-bit (masked, 3-en lines)
+                        'xer_so': "dest1",
+                        'xer_ca': "dest2",
+                        'xer_ov': "dest3"}
+        r_port_spec = {'full_xer': "full_rd", # 6-bit (masked, 3-en lines)
+                        'xer_so': "src1",
+                        'xer_ca': "src2",
+                        'xer_ov': "src3"}
+        return w_port_spec, r_port_spec
 
 
 # SPR Regfile
@@ -174,14 +249,29 @@ class SPRRegs(RegFileMem):
         else:
             n_sprs = len(SPRfull)
         super().__init__(width=64, depth=n_sprs,
-                         fwd_bus_mode=not regreduce_en)
-        self.w_ports = {'spr1': self.write_port("spr1")}
-        self.r_ports = {'spr1': self.read_port("spr1")}
+                         fwd_bus_mode=False)
+        self.svp64_en = svp64_en
+        self.regreduce_en = regreduce_en
+        wr_spec, rd_spec = self.get_port_specs()
+        create_ports(self, wr_spec, rd_spec)
+
+    def get_port_specs(self):
+        w_port_spec = {'spr1': "spr1"}
+        r_port_spec = {'spr1': "spr1"}
+        return w_port_spec, r_port_spec
 
 
 # class containing all regfiles: int, cr, xer, fast, spr
 class RegFiles:
-    def __init__(self, pspec):
+    # Factory style classes
+    regkls = [('int', IntRegs),
+              ('cr', CRRegs),
+              ('xer', XERRegs),
+              ('fast', FastRegs),
+              ('state', StateRegs),
+              ('spr', SPRRegs),]
+    def __init__(self, pspec, make_hazard_vecs=False,
+                      state_resets=None): # state file reset values
         # test is SVP64 is to be enabled
         svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)
 
@@ -189,20 +279,61 @@ class RegFiles:
         regreduce_en = hasattr(pspec, "regreduce") and \
                       (pspec.regreduce == True)
 
-        self.rf = {}
+        # get Integer File register width
+        reg_wid = 64
+        if isinstance(pspec.XLEN, int):
+            reg_wid = pspec.XLEN
+
+        self.rf = {} # register file dict
         # create regfiles here, Factory style
-        for (name, kls) in [('int', IntRegs),
-                            ('cr', CRRegs),
-                            ('xer', XERRegs),
-                            ('fast', FastRegs),
-                            ('state', StateRegs),
-                            ('spr', SPRRegs),]:
-            rf = self.rf[name] = kls(svp64_en, regreduce_en)
+        for (name, kls) in RegFiles.regkls:
+            kwargs = {'svp64_en': svp64_en, 'regreduce_en': regreduce_en}
+            if name == 'state':
+                kwargs['resets'] = state_resets
+            if name == 'int':
+                kwargs['reg_wid'] = reg_wid
+            rf = self.rf[name] = kls(**kwargs)
             # also add these as instances, self.state, self.fast, self.cr etc.
             setattr(self, name, rf)
 
+        self.rv, self.wv = {}, {}
+        if make_hazard_vecs:
+            # create a read-hazard and write-hazard vectors for this regfile
+            self.wv = self.make_vecs("wr") # global write vectors
+            self.rv = self.make_vecs("rd") # global read vectors
+
+    def make_vecs(self, name):
+        vec = {}
+        # create regfiles here, Factory style
+        for (name, kls) in RegFiles.regkls:
+            rf = self.rf[name]
+            vec[name] = self.make_hazard_vec(rf, name)
+        return vec
+
+    def make_hazard_vec(self, rf, name):
+        if isinstance(rf, VirtualRegPort):
+            vec = SRLatch(sync=False, llen=rf.nregs, name=name)
+        else:
+            vec = SRLatch(sync=False, llen=rf.depth, name=name)
+        return vec
+
     def elaborate_into(self, m, platform):
         for (name, rf) in self.rf.items():
             setattr(m.submodules, name, rf)
+        for (name, rv) in self.rv.items():
+            setattr(m.submodules, "rv_"+name, rv)
+        for (name, wv) in self.wv.items():
+            setattr(m.submodules, "wv_"+name, wv)
         return m
 
+if __name__ == '__main__':
+    m = Module()
+    from soc.config.test.test_loadstore import TestMemPspec
+    pspec = TestMemPspec(regreduce_en=True,
+                         XLEN=32) # integer reg width = 32
+    rf = RegFiles(pspec, make_hazard_vecs=True)
+    rf.elaborate_into(m, None)
+    vl = rtlil.convert(m)
+    with open("test_regfiles.il", "w") as f:
+        f.write(vl)
+
diff --git a/src/soc/regfile/sram_wrapper.py b/src/soc/regfile/sram_wrapper.py
new file mode 100644 (file)
index 0000000..e4223f5
--- /dev/null
@@ -0,0 +1,1472 @@
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2022 Cesar Strauss <cestrauss@gmail.com>
+# Sponsored by NLnet and NGI POINTER under EU Grants 871528 and 957073
+# Part of the Libre-SOC Project.
+
+"""
+Wrapper around a single port (1R or 1W) SRAM, to make a multi-port regfile.
+
+This SRAM primitive has one cycle delay for reads, and, after a write,
+it reads the value just written. The goal is to use it to make at least an
+1W2R regfile.
+
+See https://bugs.libre-soc.org/show_bug.cgi?id=781 and
+https://bugs.libre-soc.org/show_bug.cgi?id=502
+"""
+
+import unittest
+
+from nmigen import Elaboratable, Module, Memory, Signal, Repl, Mux
+from nmigen.back import rtlil
+from nmigen.sim import Simulator
+from nmigen.asserts import Assert, Assume, Past, AnyConst
+
+from nmutil.formaltest import FHDLTestCase
+from nmutil.gtkw import write_gtkw
+
+
+class SinglePortSRAM(Elaboratable):
+    """
+    Model of a single port SRAM, which can be simulated, verified and/or
+    synthesized to an FPGA.
+
+    :param addr_width: width of the address bus
+    :param data_width: width of the data bus
+    :param we_width: number of write enable lines
+
+    .. note:: The debug read port is meant only to assist in formal proofs!
+    """
+    def __init__(self, addr_width, data_width, we_width):
+        self.addr_width = addr_width
+        self.data_width = data_width
+        self.we_width = we_width
+        # interface signals
+        self.d = Signal(data_width); """ write data"""
+        self.q = Signal(data_width); """read data"""
+        self.a = Signal(addr_width); """ read/write address"""
+        self.we = Signal(we_width); """write enable"""
+        # debug signals, only used in formal proofs
+        self.dbg_addr = Signal(addr_width); """debug: address under test"""
+        lanes = range(we_width)
+        self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+        gran = self.data_width // self.we_width
+        self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+        self.dbg_wrote = Signal(); """debug: data is valid"""
+
+    def elaborate(self, platform):
+        m = Module()
+        # backing memory
+        depth = 1 << self.addr_width
+        gran = self.data_width // self.we_width
+        mem = Memory(width=self.data_width, depth=depth)
+        # create read and write ports
+        # By connecting the same address to both ports, they behave, in fact,
+        # as a single, "half-duplex" port.
+        # The transparent attribute means that, on a write, we read the new
+        # value, on the next cycle
+        # Note that nmigen memories have a one cycle delay, for reads,
+        # by default
+        m.submodules.rdport = rdport = mem.read_port(transparent=True)
+        m.submodules.wrport = wrport = mem.write_port(granularity=gran)
+        # duplicate the address to both ports
+        m.d.comb += wrport.addr.eq(self.a)
+        m.d.comb += rdport.addr.eq(self.a)
+        # write enable
+        m.d.comb += wrport.en.eq(self.we)
+        # read and write data
+        m.d.comb += wrport.data.eq(self.d)
+        m.d.comb += self.q.eq(rdport.data)
+
+        # the following is needed for induction, where an unreachable state
+        # (memory and holding register differ) is turned into an illegal one
+        if platform == "formal":
+            # the debug port is an asynchronous read port, allowing direct
+            # access to a given memory location by the formal engine
+            m.submodules.dbgport = dbgport = mem.read_port(domain="comb")
+            # first, get the value stored in our memory location,
+            # using its debug port
+            stored = Signal(self.data_width)
+            m.d.comb += dbgport.addr.eq(self.dbg_addr)
+            m.d.comb += stored.eq(dbgport.data)
+            # now, ensure that the value stored in memory is always in sync
+            # with the holding register
+            with m.If(self.dbg_wrote):
+                m.d.sync += Assert(self.dbg_data ==
+                                   stored.word_select(self.dbg_lane, gran))
+
+        return m
+
+    def ports(self):
+        return [
+            self.d,
+            self.a,
+            self.we,
+            self.q
+        ]
+
+
+def create_ilang(dut, ports, test_name):
+    vl = rtlil.convert(dut, name=test_name, ports=ports)
+    with open("%s.il" % test_name, "w") as f:
+        f.write(vl)
+
+
+class SinglePortSRAMTestCase(FHDLTestCase):
+    @staticmethod
+    def test_simple_rtlil():
+        """
+        Generate a simple SRAM. Try ``read_rtlil mem_simple.il; proc; show``
+        from a yosys prompt, to see the memory primitives, and
+        ``read_rtlil mem_simple.il; synth; show`` to see it implemented as
+        flip-flop RAM
+        """
+        dut = SinglePortSRAM(2, 4, 2)
+        create_ilang(dut, dut.ports(), "mem_simple")
+
+    @staticmethod
+    def test_blkram_rtlil():
+        """
+        Generates a bigger SRAM.
+        Try ``read_rtlil mem_blkram.il; synth_ecp5; show`` from a yosys
+        prompt, to see it implemented as block RAM
+        """
+        dut = SinglePortSRAM(10, 16, 2)
+        create_ilang(dut, dut.ports(), "mem_blkram")
+
+    def test_sram_model(self):
+        """
+        Simulate some read/write/modify operations on the SRAM model
+        """
+        dut = SinglePortSRAM(7, 32, 4)
+        sim = Simulator(dut)
+        sim.add_clock(1e-6)
+
+        def process():
+            # 1) write 0x12_34_56_78 to address 0
+            yield dut.a.eq(0)
+            yield dut.d.eq(0x12_34_56_78)
+            yield dut.we.eq(0b1111)
+            yield
+            # 2) write 0x9A_BC_DE_F0 to address 1
+            yield dut.a.eq(1)
+            yield dut.d.eq(0x9A_BC_DE_F0)
+            yield dut.we.eq(0b1111)
+            yield
+            # ... and read value just written to address 0
+            self.assertEqual((yield dut.q), 0x12_34_56_78)
+            # 3) prepare to read from address 0
+            yield dut.d.eq(0)
+            yield dut.we.eq(0b0000)
+            yield dut.a.eq(0)
+            yield
+            # ... and read value just written to address 1
+            self.assertEqual((yield dut.q), 0x9A_BC_DE_F0)
+            # 4) prepare to read from address 1
+            yield dut.a.eq(1)
+            yield
+            # ... and read value from address 0
+            self.assertEqual((yield dut.q), 0x12_34_56_78)
+            # 5) write 0x9A and 0xDE to bytes 1 and 3, leaving
+            # bytes 0 and 2 unchanged
+            yield dut.a.eq(0)
+            yield dut.d.eq(0x9A_FF_DE_FF)
+            yield dut.we.eq(0b1010)
+            yield
+            # ... and read value from address 1
+            self.assertEqual((yield dut.q), 0x9A_BC_DE_F0)
+            # 6) nothing more to do
+            yield dut.d.eq(0)
+            yield dut.we.eq(0)
+            yield
+            # ... other than confirm that bytes 1 and 3 were modified
+            # correctly
+            self.assertEqual((yield dut.q), 0x9A_34_DE_78)
+
+        sim.add_sync_process(process)
+        traces = ['rdport.clk', 'a[6:0]', 'we[3:0]', 'd[31:0]', 'q[31:0]']
+        write_gtkw('test_sram_model.gtkw', 'test_sram_model.vcd',
+                   traces, module='top')
+        sim_writer = sim.write_vcd('test_sram_model.vcd')
+        with sim_writer:
+            sim.run()
+
+    def test_model_sram_proof(self):
+        """
+        Formal proof of the single port SRAM model
+        """
+        m = Module()
+        # 128 x 32-bit, 8-bit granularity
+        m.submodules.dut = dut = SinglePortSRAM(7, 32, 4)
+        gran = len(dut.d) // len(dut.we)  # granularity
+        # choose a single random memory location to test
+        a_const = AnyConst(dut.a.shape())
+        # choose a single byte lane to test
+        lane = AnyConst(range(dut.we_width))
+        # holding data register
+        d_reg = Signal(gran)
+        # for some reason, simulated formal memory is not zeroed at reset
+        # ... so, remember whether we wrote it, at least once.
+        wrote = Signal()
+        # if our memory location and byte lane is being written
+        # ... capture the data in our holding register
+        with m.If((dut.a == a_const) & dut.we.bit_select(lane, 1)):
+            m.d.sync += d_reg.eq(dut.d.word_select(lane, gran))
+            m.d.sync += wrote.eq(1)
+        # if our memory location is being read
+        # ... and the holding register has valid data
+        # ... then its value must match the memory output, on the given lane
+        with m.If((Past(dut.a) == a_const) & wrote):
+            m.d.sync += Assert(d_reg == dut.q.word_select(lane, gran))
+
+        # pass our state to the device under test, so it can ensure that
+        # its state is in sync with ours, for induction
+        m.d.comb += [
+            dut.dbg_addr.eq(a_const),
+            dut.dbg_lane.eq(lane),
+            dut.dbg_data.eq(d_reg),
+            dut.dbg_wrote.eq(wrote),
+        ]
+
+        self.assertFormal(m, mode="prove", depth=2)
+
+
+class PhasedDualPortRegfile(Elaboratable):
+    """
+    Builds, from a pair of 1RW blocks, a pseudo 1W/1R RAM, where the
+    read port works every cycle, but the write port is only available on
+    either even (1eW/1R) or odd (1oW/1R) cycles.
+
+    :param addr_width: width of the address bus
+    :param data_width: width of the data bus
+    :param we_width: number of write enable lines
+    :param write_phase: indicates on which phase the write port will
+                        accept data
+    :param transparent: whether a simultaneous read and write returns the
+                        new value (True) or the old value (False)
+
+    .. note:: The debug read port is meant only to assist in formal proofs!
+    """
+
+    def __init__(self, addr_width, data_width, we_width, write_phase,
+                 transparent=False):
+        self.addr_width = addr_width
+        self.data_width = data_width
+        self.we_width = we_width
+        self.write_phase = write_phase
+        self.transparent = transparent
+        # interface signals
+        self.wr_addr_i = Signal(addr_width); """write port address"""
+        self.wr_data_i = Signal(data_width); """write port data"""
+        self.wr_we_i = Signal(we_width); """write port enable"""
+        self.rd_addr_i = Signal(addr_width); """read port address"""
+        self.rd_data_o = Signal(data_width); """read port data"""
+        self.phase = Signal(); """even/odd cycle indicator"""
+        # debug signals, only used in formal proofs
+        self.dbg_addr = Signal(addr_width); """debug: address under test"""
+        lanes = range(we_width)
+        self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+        gran = self.data_width // self.we_width
+        self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+        self.dbg_wrote = Signal(); """debug: data is valid"""
+
+    def elaborate(self, platform):
+        m = Module()
+        # granularity
+        # instantiate the two 1RW memory blocks
+        mem1 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+        mem2 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+        m.submodules.mem1 = mem1
+        m.submodules.mem2 = mem2
+        # wire write port to first memory, and its output to the second
+        m.d.comb += mem1.d.eq(self.wr_data_i)
+        m.d.comb += mem2.d.eq(mem1.q)
+        # holding registers for the write port of the second memory
+        last_wr_addr = Signal(self.addr_width)
+        last_wr_we = Signal(self.we_width)
+        # do the read and write address coincide?
+        same_read_write = Signal()
+        with m.If(self.phase == self.write_phase):
+            # write phase, start a write on the first memory
+            m.d.comb += mem1.a.eq(self.wr_addr_i)
+            m.d.comb += mem1.we.eq(self.wr_we_i)
+            # save write address and write select for repeating the write
+            # on the second memory, later
+            m.d.sync += last_wr_we.eq(self.wr_we_i)
+            m.d.sync += last_wr_addr.eq(self.wr_addr_i)
+            # start a read on the second memory
+            m.d.comb += mem2.a.eq(self.rd_addr_i)
+            # output previously read data from the first memory
+            m.d.comb += self.rd_data_o.eq(mem1.q)
+            if self.transparent:
+                # remember whether we are reading from the same location we are
+                # writing
+                m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+        with m.Else():
+            # read phase, write last written data on second memory
+            m.d.comb += mem2.a.eq(last_wr_addr)
+            m.d.comb += mem2.we.eq(last_wr_we)
+            # start a read on the first memory
+            m.d.comb += mem1.a.eq(self.rd_addr_i)
+            if self.transparent:
+                with m.If(same_read_write):
+                    # when transparent, and read and write addresses coincide,
+                    # output the data just written
+                    m.d.comb += self.rd_data_o.eq(mem1.q)
+                with m.Else():
+                    # otherwise, output previously read data
+                    # from the second memory
+                    m.d.comb += self.rd_data_o.eq(mem2.q)
+            else:
+                # always output the read data from the second memory,
+                # if not transparent
+                m.d.comb += self.rd_data_o.eq(mem2.q)
+
+        if platform == "formal":
+            # pass our state to the device under test, so it can ensure that
+            # its state is in sync with ours, for induction
+            m.d.comb += [
+                # pass the address and write lane under test to both memories
+                mem1.dbg_addr.eq(self.dbg_addr),
+                mem2.dbg_addr.eq(self.dbg_addr),
+                mem1.dbg_lane.eq(self.dbg_lane),
+                mem2.dbg_lane.eq(self.dbg_lane),
+                # the second memory copies its state from the first memory,
+                # after a cycle, so it has a one cycle delay
+                mem1.dbg_data.eq(self.dbg_data),
+                mem2.dbg_data.eq(Past(self.dbg_data)),
+                mem1.dbg_wrote.eq(self.dbg_wrote),
+                mem2.dbg_wrote.eq(Past(self.dbg_wrote)),
+            ]
+
+        return m
+
+    def ports(self):
+        return [
+            self.wr_addr_i,
+            self.wr_data_i,
+            self.wr_we_i,
+            self.rd_addr_i,
+            self.rd_data_o,
+            self.phase
+        ]
+
+
+class PhasedDualPortRegfileTestCase(FHDLTestCase):
+
+    def do_test_phased_dual_port_regfile(self, write_phase, transparent):
+        """
+        Simulate some read/write/modify operations on the phased write memory
+        """
+        dut = PhasedDualPortRegfile(7, 32, 4, write_phase, transparent)
+        sim = Simulator(dut)
+        sim.add_clock(1e-6)
+
+        # compare read data with previously written data
+        # and start a new read
+        def read(rd_addr_i, expected=None):
+            if expected is not None:
+                self.assertEqual((yield dut.rd_data_o), expected)
+            yield dut.rd_addr_i.eq(rd_addr_i)
+
+        # start a write, and set write phase
+        def write(wr_addr_i, wr_we_i, wr_data_i):
+            yield dut.wr_addr_i.eq(wr_addr_i)
+            yield dut.wr_we_i.eq(wr_we_i)
+            yield dut.wr_data_i.eq(wr_data_i)
+            yield dut.phase.eq(write_phase)
+
+        # disable writes, and start read phase
+        def skip_write():
+            yield dut.wr_addr_i.eq(0)
+            yield dut.wr_we_i.eq(0)
+            yield dut.wr_data_i.eq(0)
+            yield dut.phase.eq(~write_phase)
+
+        # writes a few values on the write port, and read them back
+        # ... reads can happen every cycle
+        # ... writes, only every two cycles.
+        # since reads have a one cycle delay, the expected value on
+        # each read refers to the last read performed, not the
+        # current one, which is in progress.
+        def process():
+            yield from read(0)
+            yield from write(0x42, 0b1111, 0x12345678)
+            yield
+            yield from read(0x42)
+            yield from skip_write()
+            yield
+            yield from read(0x42)
+            yield from write(0x43, 0b1111, 0x9ABCDEF0)
+            yield
+            yield from read(0x43, 0x12345678)
+            yield from skip_write()
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from write(0x43, 0b1001, 0xF0FFFF9A)
+            yield
+            yield from read(0x43, 0x9ABCDEF0)
+            yield from skip_write()
+            yield
+            yield from read(0x43, 0x12345678)
+            yield from write(0x42, 0b0110, 0xFF5634FF)
+            yield
+            yield from read(0x42, 0xF0BCDE9A)
+            yield from skip_write()
+            yield
+            yield from read(0, 0xF0BCDE9A)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0, 0x12563478)
+            yield from skip_write()
+            yield
+            # try reading and writing to the same location, simultaneously
+            yield from read(0x42)
+            yield from write(0x42, 0b0101, 0x55AA9966)
+            yield
+            # ... and read again
+            yield from read(0x42)
+            yield from skip_write()
+            yield
+            if transparent:
+                # returns the value just written
+                yield from read(0, 0x12AA3466)
+            else:
+                # returns the old value
+                yield from read(0, 0x12563478)
+            yield from write(0, 0, 0)
+            yield
+            # after a cycle, always returns the new value
+            yield from read(0, 0x12AA3466)
+            yield from skip_write()
+
+        sim.add_sync_process(process)
+        debug_file = f'test_phased_dual_port_{write_phase}'
+        if transparent:
+            debug_file += '_transparent'
+        traces = ['clk', 'phase',
+                  'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+                  'rd_addr_i[6:0]', 'rd_data_o[31:0]']
+        write_gtkw(debug_file + '.gtkw',
+                   debug_file + '.vcd',
+                   traces, module='top', zoom=-22)
+        sim_writer = sim.write_vcd(debug_file + '.vcd')
+        with sim_writer:
+            sim.run()
+
+    def test_phased_dual_port_regfile(self):
+        """test both types (odd and even write ports) of phased write memory"""
+        with self.subTest("writes happen on phase 0"):
+            self.do_test_phased_dual_port_regfile(0, False)
+        with self.subTest("writes happen on phase 1"):
+            self.do_test_phased_dual_port_regfile(1, False)
+        """test again, with a transparent read port"""
+        with self.subTest("writes happen on phase 0 (transparent reads)"):
+            self.do_test_phased_dual_port_regfile(0, True)
+        with self.subTest("writes happen on phase 1 (transparent reads)"):
+            self.do_test_phased_dual_port_regfile(1, True)
+
+    def do_test_phased_dual_port_regfile_proof(self, write_phase, transparent):
+        """
+        Formal proof of the pseudo 1W/1R regfile
+        """
+        m = Module()
+        # 128 x 32-bit, 8-bit granularity
+        dut = PhasedDualPortRegfile(7, 32, 4, write_phase, transparent)
+        m.submodules.dut = dut
+        gran = dut.data_width // dut.we_width  # granularity
+        # choose a single random memory location to test
+        a_const = AnyConst(dut.addr_width)
+        # choose a single byte lane to test
+        lane = AnyConst(range(dut.we_width))
+        # drive alternating phases
+        m.d.comb += Assume(dut.phase != Past(dut.phase))
+        # holding data register
+        d_reg = Signal(gran)
+        # for some reason, simulated formal memory is not zeroed at reset
+        # ... so, remember whether we wrote it, at least once.
+        wrote = Signal()
+        # if our memory location and byte lane is being written,
+        # capture the data in our holding register
+        with m.If((dut.wr_addr_i == a_const)
+                  & dut.wr_we_i.bit_select(lane, 1)
+                  & (dut.phase == dut.write_phase)):
+            m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+            m.d.sync += wrote.eq(1)
+        # if our memory location is being read,
+        # and the holding register has valid data,
+        # then its value must match the memory output, on the given lane
+        with m.If(Past(dut.rd_addr_i) == a_const):
+            if transparent:
+                with m.If(wrote):
+                    rd_lane = dut.rd_data_o.word_select(lane, gran)
+                    m.d.sync += Assert(d_reg == rd_lane)
+            else:
+                # with a non-transparent read port, the read value depends
+                # on whether there is a simultaneous write, or not
+                with m.If((Past(dut.wr_addr_i) == a_const)
+                          & Past(dut.phase) == dut.write_phase):
+                    # simultaneous write -> check against last written value
+                    with m.If(Past(wrote)):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(Past(d_reg) == rd_lane)
+                with m.Else():
+                    # otherwise, check against current written value
+                    with m.If(wrote):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(d_reg == rd_lane)
+
+        # pass our state to the device under test, so it can ensure that
+        # its state is in sync with ours, for induction
+        m.d.comb += [
+            # address and mask under test
+            dut.dbg_addr.eq(a_const),
+            dut.dbg_lane.eq(lane),
+            # state of our holding register
+            dut.dbg_data.eq(d_reg),
+            dut.dbg_wrote.eq(wrote),
+        ]
+
+        self.assertFormal(m, mode="prove", depth=3)
+
+    def test_phased_dual_port_regfile_proof(self):
+        """test both types (odd and even write ports) of phased write memory"""
+        with self.subTest("writes happen on phase 0"):
+            self.do_test_phased_dual_port_regfile_proof(0, False)
+        with self.subTest("writes happen on phase 1"):
+            self.do_test_phased_dual_port_regfile_proof(1, False)
+        # test again, with transparent read ports
+        with self.subTest("writes happen on phase 0 (transparent reads)"):
+            self.do_test_phased_dual_port_regfile_proof(0, True)
+        with self.subTest("writes happen on phase 1 (transparent reads)"):
+            self.do_test_phased_dual_port_regfile_proof(1, True)
+
+
+class DualPortRegfile(Elaboratable):
+    """
+    Builds, from a pair of phased 1W/1R blocks, a true 1W/1R RAM, where both
+    read and write ports work every cycle.
+    It employs a Last Value Table, that tracks to which memory each address was
+    last written.
+
+    :param addr_width: width of the address bus
+    :param data_width: width of the data bus
+    :param we_width: number of write enable lines
+    :param transparent: whether a simultaneous read and write returns the
+                        new value (True) or the old value (False)
+    """
+
+    def __init__(self, addr_width, data_width, we_width, transparent=True):
+        self.addr_width = addr_width
+        self.data_width = data_width
+        self.we_width = we_width
+        self.transparent = transparent
+        # interface signals
+        self.wr_addr_i = Signal(addr_width); """write port address"""
+        self.wr_data_i = Signal(data_width); """write port data"""
+        self.wr_we_i = Signal(we_width); """write port enable"""
+        self.rd_addr_i = Signal(addr_width); """read port address"""
+        self.rd_data_o = Signal(data_width); """read port data"""
+        # debug signals, only used in formal proofs
+        # address and write lane under test
+        self.dbg_addr = Signal(addr_width); """debug: address under test"""
+        lanes = range(we_width)
+        self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+        # upstream state, to keep in sync with ours
+        gran = self.data_width // self.we_width
+        self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+        self.dbg_wrote = Signal(); """debug: data is valid"""
+        self.dbg_wrote_phase = Signal(); """debug: the phase data was written"""
+        self.dbg_phase = Signal(); """debug: current phase"""
+
+    def elaborate(self, platform):
+        m = Module()
+        # depth and granularity
+        depth = 1 << self.addr_width
+        gran = self.data_width // self.we_width
+        # instantiate the two phased 1R/1W memory blocks
+        mem0 = PhasedDualPortRegfile(
+            self.addr_width, self.data_width, self.we_width, 0,
+            self.transparent)
+        mem1 = PhasedDualPortRegfile(
+            self.addr_width, self.data_width, self.we_width, 1,
+            self.transparent)
+        m.submodules.mem0 = mem0
+        m.submodules.mem1 = mem1
+        # instantiate the backing memory (FFRAM or LUTRAM)
+        # for the Last Value Table
+        # it should have the same number and port types of the desired
+        # memory, but just one bit per write lane
+        lvt_mem = Memory(width=self.we_width, depth=depth)
+        lvt_wr = lvt_mem.write_port(granularity=1)
+        lvt_rd = lvt_mem.read_port(transparent=self.transparent)
+        if not self.transparent:
+            # for some reason, formal proofs don't recognize the default
+            # reset value for this signal
+            m.d.comb += lvt_rd.en.eq(1)
+        m.submodules.lvt_wr = lvt_wr
+        m.submodules.lvt_rd = lvt_rd
+        # generate and wire the phases for the phased memories
+        phase = Signal()
+        m.d.sync += phase.eq(~phase)
+        m.d.comb += [
+            mem0.phase.eq(phase),
+            mem1.phase.eq(phase),
+        ]
+        m.d.comb += [
+            # wire the write ports, directly
+            mem0.wr_addr_i.eq(self.wr_addr_i),
+            mem1.wr_addr_i.eq(self.wr_addr_i),
+            mem0.wr_we_i.eq(self.wr_we_i),
+            mem1.wr_we_i.eq(self.wr_we_i),
+            mem0.wr_data_i.eq(self.wr_data_i),
+            mem1.wr_data_i.eq(self.wr_data_i),
+            # also wire the read addresses
+            mem0.rd_addr_i.eq(self.rd_addr_i),
+            mem1.rd_addr_i.eq(self.rd_addr_i),
+            # wire read and write ports to the LVT
+            lvt_wr.addr.eq(self.wr_addr_i),
+            lvt_wr.en.eq(self.wr_we_i),
+            lvt_rd.addr.eq(self.rd_addr_i),
+            # the data for the LVT is the phase on which the value was
+            # written
+            lvt_wr.data.eq(Repl(phase, self.we_width)),
+        ]
+        for i in range(self.we_width):
+            # select the right memory to assign to the output read port,
+            # in this byte lane, according to the LVT contents
+            m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+                Mux(
+                    lvt_rd.data[i],
+                    mem1.rd_data_o.word_select(i, gran),
+                    mem0.rd_data_o.word_select(i, gran)))
+
+        if platform == "formal":
+            # pass upstream state to the memories, so they can ensure that
+            # their state are in sync with upstream, for induction
+            m.d.comb += [
+                # address and write lane under test
+                mem0.dbg_addr.eq(self.dbg_addr),
+                mem1.dbg_addr.eq(self.dbg_addr),
+                mem0.dbg_lane.eq(self.dbg_lane),
+                mem1.dbg_lane.eq(self.dbg_lane),
+                # upstream state
+                mem0.dbg_data.eq(self.dbg_data),
+                mem1.dbg_data.eq(self.dbg_data),
+                # the memory, on which the write ends up, depends on which
+                # phase it was written
+                mem0.dbg_wrote.eq(self.dbg_wrote & ~self.dbg_wrote_phase),
+                mem1.dbg_wrote.eq(self.dbg_wrote & self.dbg_wrote_phase),
+            ]
+            # sync phase to upstream
+            m.d.comb += Assert(self.dbg_phase == phase)
+            # this debug port for the LVT is an asynchronous read port,
+            # allowing direct access to a given memory location
+            # by the formal engine
+            m.submodules.dbgport = dbgport = lvt_mem.read_port(domain='comb')
+            # first, get the value stored in our memory location,
+            stored = Signal(self.we_width)
+            m.d.comb += dbgport.addr.eq(self.dbg_addr)
+            m.d.comb += stored.eq(dbgport.data)
+            # now, ensure that the value stored in memory is always in sync
+            # with the expected value (which memory the value was written to)
+            with m.If(self.dbg_wrote):
+                m.d.comb += Assert(stored.bit_select(self.dbg_lane, 1)
+                                   == self.dbg_wrote_phase)
+        return m
+
+    def ports(self):
+        return [
+            self.wr_addr_i,
+            self.wr_data_i,
+            self.wr_we_i,
+            self.rd_addr_i,
+            self.rd_data_o
+        ]
+
+
+class DualPortRegfileTestCase(FHDLTestCase):
+
+    def do_test_dual_port_regfile(self, transparent):
+        """
+        Simulate some read/write/modify operations on the dual port register
+        file
+        """
+        dut = DualPortRegfile(7, 32, 4, transparent)
+        sim = Simulator(dut)
+        sim.add_clock(1e-6)
+
+        expected = None
+        last_expected = None
+
+        # compare read data with previously written data
+        # and start a new read
+        def read(rd_addr_i, next_expected=None):
+            nonlocal expected, last_expected
+            if expected is not None:
+                self.assertEqual((yield dut.rd_data_o), expected)
+            yield dut.rd_addr_i.eq(rd_addr_i)
+            # account for the read latency
+            expected = last_expected
+            last_expected = next_expected
+
+        # start a write
+        def write(wr_addr_i, wr_we_i, wr_data_i):
+            yield dut.wr_addr_i.eq(wr_addr_i)
+            yield dut.wr_we_i.eq(wr_we_i)
+            yield dut.wr_data_i.eq(wr_data_i)
+
+        def process():
+            # write a pair of values, one for each memory
+            yield from read(0)
+            yield from write(0x42, 0b1111, 0x87654321)
+            yield
+            yield from read(0x42, 0x87654321)
+            yield from write(0x43, 0b1111, 0x0FEDCBA9)
+            yield
+            # skip a beat
+            yield from read(0x43, 0x0FEDCBA9)
+            yield from write(0, 0, 0)
+            yield
+            # write again, but now they switch memories
+            yield from read(0)
+            yield from write(0x42, 0b1111, 0x12345678)
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from write(0x43, 0b1111, 0x9ABCDEF0)
+            yield
+            yield from read(0x43, 0x9ABCDEF0)
+            yield from write(0, 0, 0)
+            yield
+            # test partial writes
+            yield from read(0)
+            yield from write(0x42, 0b1001, 0x78FFFF12)
+            yield
+            yield from read(0)
+            yield from write(0x43, 0b0110, 0xFFDEABFF)
+            yield
+            yield from read(0x42, 0x78345612)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0x43, 0x9ADEABF0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from write(0, 0, 0)
+            yield
+            if transparent:
+                # returns the value just written
+                yield from read(0x42, 0x78AA5666)
+            else:
+                # returns the old value
+                yield from read(0x42, 0x78345612)
+            yield from write(0x42, 0b0101, 0x55AA9966)
+            yield
+            # after a cycle, always returns the new value
+            yield from read(0x42, 0x78AA5666)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from write(0, 0, 0)
+
+        sim.add_sync_process(process)
+        debug_file = 'test_dual_port_regfile'
+        if transparent:
+            debug_file += '_transparent'
+        traces = ['clk', 'phase',
+                  {'comment': 'write port'},
+                  'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+                  {'comment': 'read port'},
+                  'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+                  {'comment': 'LVT write port'},
+                  'phase', 'lvt_mem_w_addr[6:0]', 'lvt_mem_w_en[3:0]',
+                  'lvt_mem_w_data[3:0]',
+                  {'comment': 'LVT read port'},
+                  'lvt_mem_r_addr[6:0]', 'lvt_mem_r_data[3:0]',
+                  {'comment': 'backing memory'},
+                  'mem0.rd_data_o[31:0]',
+                  'mem1.rd_data_o[31:0]',
+                  ]
+        write_gtkw(debug_file + '.gtkw',
+                   debug_file + '.vcd',
+                   traces, module='top', zoom=-22)
+        sim_writer = sim.write_vcd(debug_file + '.vcd')
+        with sim_writer:
+            sim.run()
+
+    def test_dual_port_regfile(self):
+        with self.subTest("non-transparent reads"):
+            self.do_test_dual_port_regfile(False)
+        with self.subTest("transparent reads"):
+            self.do_test_dual_port_regfile(True)
+
+    def do_test_dual_port_regfile_proof(self, transparent=True):
+        """
+        Formal proof of the 1W/1R regfile
+        """
+        m = Module()
+        # 128 x 32-bit, 8-bit granularity
+        dut = DualPortRegfile(7, 32, 4, transparent)
+        m.submodules.dut = dut
+        gran = dut.data_width // dut.we_width  # granularity
+        # choose a single random memory location to test
+        a_const = AnyConst(dut.addr_width)
+        # choose a single byte lane to test
+        lane = AnyConst(range(dut.we_width))
+        # holding data register
+        d_reg = Signal(gran)
+        # keep track of the phase, so we can remember which memory
+        # we wrote to
+        phase = Signal()
+        m.d.sync += phase.eq(~phase)
+        # for some reason, simulated formal memory is not zeroed at reset
+        # ... so, remember whether we wrote it, at least once.
+        wrote = Signal()
+        # ... and on which phase it was written
+        wrote_phase = Signal()
+        # if our memory location and byte lane is being written,
+        # capture the data in our holding register
+        with m.If((dut.wr_addr_i == a_const)
+                  & dut.wr_we_i.bit_select(lane, 1)):
+            m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+            m.d.sync += wrote.eq(1)
+            m.d.sync += wrote_phase.eq(phase)
+        # if our memory location is being read,
+        # and the holding register has valid data,
+        # then its value must match the memory output, on the given lane
+        with m.If(Past(dut.rd_addr_i) == a_const):
+            if transparent:
+                with m.If(wrote):
+                    rd_lane = dut.rd_data_o.word_select(lane, gran)
+                    m.d.sync += Assert(d_reg == rd_lane)
+            else:
+                # with a non-transparent read port, the read value depends
+                # on whether there is a simultaneous write, or not
+                with m.If(Past(dut.wr_addr_i) == a_const):
+                    # simultaneous write -> check against last written value
+                    with m.If(wrote & Past(wrote)):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(Past(d_reg) == rd_lane)
+                with m.Else():
+                    # otherwise, check against current written value
+                    with m.If(wrote):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(d_reg == rd_lane)
+
+        m.d.comb += [
+            dut.dbg_addr.eq(a_const),
+            dut.dbg_lane.eq(lane),
+            dut.dbg_data.eq(d_reg),
+            dut.dbg_wrote.eq(wrote),
+            dut.dbg_wrote_phase.eq(wrote_phase),
+            dut.dbg_phase.eq(phase),
+        ]
+
+        self.assertFormal(m, mode="prove", depth=3)
+
+    def test_dual_port_regfile_proof(self):
+        """
+        Formal check of 1W/1R regfile (transparent and not)
+        """
+        with self.subTest("transparent reads"):
+            self.do_test_dual_port_regfile_proof(True)
+        with self.subTest("non-transparent reads"):
+            self.do_test_dual_port_regfile_proof(False)
+
+
+class PhasedReadPhasedWriteFullReadSRAM(Elaboratable):
+    """
+    Builds, from three 1RW blocks, a pseudo 1W/2R SRAM, with:
+
+    * one full read port, which works every cycle,
+    * one write port, which is only available on either even or odd cycles,
+    * an extra transparent read port, available only on the same cycles as the
+      write port
+
+    This type of SRAM is useful for a XOR-based 6x1RW implementation of
+    a 1R/1W register file.
+
+    :param addr_width: width of the address bus
+    :param data_width: width of the data bus
+    :param we_width: number of write enable lines
+    :param write_phase: indicates on which phase the write port will
+                        accept data
+    :param transparent: whether a simultaneous read and write returns the
+                        new value (True) or the old value (False) on the full
+                        read port
+
+    .. note:: The debug read port is meant only to assist in formal proofs!
+    """
+
+    def __init__(self, addr_width, data_width, we_width, write_phase,
+                 transparent=True):
+        self.addr_width = addr_width
+        self.data_width = data_width
+        self.we_width = we_width
+        self.write_phase = write_phase
+        self.transparent = transparent
+        # interface signals
+        self.wr_addr_i = Signal(addr_width); """phased write port address"""
+        self.wr_data_i = Signal(data_width); """phased write port data"""
+        self.wr_we_i = Signal(we_width); """phased write port enable"""
+        self.rd_addr_i = Signal(addr_width); """full read port address"""
+        self.rd_data_o = Signal(data_width); """full read port data"""
+        self.rdp_addr_i = Signal(addr_width); """phased read port address"""
+        self.rdp_data_o = Signal(data_width); """phased read port data"""
+        self.phase = Signal(); """even/odd cycle indicator"""
+        # debug signals, only used in formal proofs
+        self.dbg_addr = Signal(addr_width); """debug: address under test"""
+        lanes = range(we_width)
+        self.dbg_lane = Signal(lanes); """debug: write lane under test"""
+        gran = self.data_width // self.we_width
+        self.dbg_data = Signal(gran); """debug: data to keep in sync"""
+        self.dbg_wrote = Signal(); """debug: data is valid"""
+
+    def elaborate(self, platform):
+        m = Module()
+        # instantiate the 1RW memory blocks
+        mem1 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+        mem2 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+        mem3 = SinglePortSRAM(self.addr_width, self.data_width, self.we_width)
+        m.submodules.mem1 = mem1
+        m.submodules.mem2 = mem2
+        m.submodules.mem3 = mem3
+        # wire input write data to first memory, and its output to the others
+        m.d.comb += [
+            mem1.d.eq(self.wr_data_i),
+            mem2.d.eq(mem1.q),
+            mem3.d.eq(mem1.q)
+        ]
+        # holding registers for the write port of the other memories
+        last_wr_addr = Signal(self.addr_width)
+        last_wr_we = Signal(self.we_width)
+        # do read and write addresses coincide?
+        same_read_write = Signal()
+        same_phased_read_write = Signal()
+        with m.If(self.phase == self.write_phase):
+            # write phase, start a write on the first memory
+            m.d.comb += mem1.a.eq(self.wr_addr_i)
+            m.d.comb += mem1.we.eq(self.wr_we_i)
+            # save write address and write select for repeating the write
+            # on the other memories, one cycle later
+            m.d.sync += last_wr_we.eq(self.wr_we_i)
+            m.d.sync += last_wr_addr.eq(self.wr_addr_i)
+            # start a read on the other memories
+            m.d.comb += mem2.a.eq(self.rd_addr_i)
+            m.d.comb += mem3.a.eq(self.rdp_addr_i)
+            # output previously read data from the first memory
+            m.d.comb += self.rd_data_o.eq(mem1.q)
+            # remember whether we are reading from the same location as we
+            # are writing
+            m.d.sync += same_phased_read_write.eq(
+                self.rdp_addr_i == self.wr_addr_i)
+            if self.transparent:
+                m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+        with m.Else():
+            # read phase, write last written data on the other memories
+            m.d.comb += [
+                mem2.a.eq(last_wr_addr),
+                mem2.we.eq(last_wr_we),
+                mem3.a.eq(last_wr_addr),
+                mem3.we.eq(last_wr_we),
+            ]
+            # start a read on the first memory
+            m.d.comb += mem1.a.eq(self.rd_addr_i)
+            # output the read data from the second memory
+            if self.transparent:
+                with m.If(same_read_write):
+                    # when transparent, and read and write addresses coincide,
+                    # output the data just written
+                    m.d.comb += self.rd_data_o.eq(mem1.q)
+                with m.Else():
+                    # otherwise, output previously read data
+                    # from the second memory
+                    m.d.comb += self.rd_data_o.eq(mem2.q)
+            else:
+                # always output the read data from the second memory,
+                # if not transparent
+                m.d.comb += self.rd_data_o.eq(mem2.q)
+            with m.If(same_phased_read_write):
+                # if read and write addresses coincide,
+                # output the data just written
+                m.d.comb += self.rdp_data_o.eq(mem1.q)
+            with m.Else():
+                # otherwise, output previously read data
+                # from the third memory
+                m.d.comb += self.rdp_data_o.eq(mem3.q)
+
+        if platform == "formal":
+            # pass our state to the device under test, so it can ensure that
+            # its state is in sync with ours, for induction
+            m.d.comb += [
+                # pass the address and write lane under test to both memories
+                mem1.dbg_addr.eq(self.dbg_addr),
+                mem2.dbg_addr.eq(self.dbg_addr),
+                mem3.dbg_addr.eq(self.dbg_addr),
+                mem1.dbg_lane.eq(self.dbg_lane),
+                mem2.dbg_lane.eq(self.dbg_lane),
+                mem3.dbg_lane.eq(self.dbg_lane),
+                # the other memories copy their state from the first memory,
+                # after a cycle, so they have a one cycle delay
+                mem1.dbg_data.eq(self.dbg_data),
+                mem2.dbg_data.eq(Past(self.dbg_data)),
+                mem3.dbg_data.eq(Past(self.dbg_data)),
+                mem1.dbg_wrote.eq(self.dbg_wrote),
+                mem2.dbg_wrote.eq(Past(self.dbg_wrote)),
+                mem3.dbg_wrote.eq(Past(self.dbg_wrote)),
+            ]
+
+        return m
+
+
+class PhasedReadPhasedWriteFullReadSRAMTestCase(FHDLTestCase):
+
+    def do_test_case(self, write_phase, transparent):
+        """
+        Simulate some read/write/modify operations
+        """
+        dut = PhasedReadPhasedWriteFullReadSRAM(7, 32, 4, write_phase,
+                                                transparent)
+        sim = Simulator(dut)
+        sim.add_clock(1e-6)
+
+        expected = None
+        last_expected = None
+
+        # compare read data with previously written data
+        # and start a new read
+        def read(rd_addr_i, next_expected=None):
+            nonlocal expected, last_expected
+            if expected is not None:
+                self.assertEqual((yield dut.rd_data_o), expected)
+            yield dut.rd_addr_i.eq(rd_addr_i)
+            # account for the read latency
+            expected = last_expected
+            last_expected = next_expected
+
+        expected2 = None
+
+        # same as above, but for the phased read port
+        def phased_read(rdp_addr_i, next_expected2=None):
+            nonlocal expected2
+            if expected2 is not None:
+                self.assertEqual((yield dut.rdp_data_o), expected2)
+            yield dut.rdp_addr_i.eq(rdp_addr_i)
+            # account for the read latency
+            expected2 = next_expected2
+
+        # start a write
+        def write(wr_addr_i, wr_we_i, wr_data_i):
+            yield dut.wr_addr_i.eq(wr_addr_i)
+            yield dut.wr_we_i.eq(wr_we_i)
+            yield dut.wr_data_i.eq(wr_data_i)
+            yield dut.phase.eq(write_phase)
+
+        # disable writes, and start read phase
+        def skip_write():
+            yield dut.wr_addr_i.eq(0)
+            yield dut.wr_we_i.eq(0)
+            yield dut.wr_data_i.eq(0)
+            yield dut.phase.eq(~write_phase)
+            # also skip reading from the phased read port
+            yield dut.rdp_addr_i.eq(0)
+
+        # writes a few values on the write port, and read them back
+        def process():
+            yield from read(0)
+            yield from phased_read(0)
+            yield from write(0x42, 0b1111, 0x12345678)
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from skip_write()
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from phased_read(0x42, 0x12345678)
+            yield from write(0x43, 0b1111, 0x9ABCDEF0)
+            yield
+            yield from read(0x43, 0x9ABCDEF0)
+            yield from skip_write()
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from phased_read(0x42, 0x12345678)
+            yield from write(0x43, 0b1001, 0xF0FFFF9A)
+            yield
+            yield from read(0x43, 0xF0BCDE9A)
+            yield from skip_write()
+            yield
+            yield from read(0x43, 0xF0BCDE9A)
+            yield from phased_read(0x43, 0xF0BCDE9A)
+            yield from write(0x42, 0b0110, 0xFF5634FF)
+            yield
+            yield from read(0x42, 0x12563478)
+            yield from skip_write()
+            yield
+            yield from read(0)
+            yield from phased_read(0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from skip_write()
+            yield
+            # try reading and writing at the same time
+            if transparent:
+                # transparent port, return the value just written
+                yield from read(0x42, 0x12AA3466)
+            else:
+                # ... otherwise, return the old value
+                yield from read(0x42, 0x12563478)
+            # transparent port, always return the value just written
+            yield from phased_read(0x42, 0x12AA3466)
+            yield from write(0x42, 0b0101, 0x55AA9966)
+            yield
+            # after a cycle, always returns the new value
+            yield from read(0x42, 0x12AA3466)
+            yield from skip_write()
+            yield
+            yield from read(0)
+            yield from phased_read(0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from skip_write()
+
+        sim.add_sync_process(process)
+        debug_file = 'test_phased_read_write_sram_' + str(write_phase)
+        if transparent:
+            debug_file += '_transparent'
+        traces = ['clk', 'phase',
+                  {'comment': 'phased write port'},
+                  'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+                  {'comment': 'full read port'},
+                  'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+                  {'comment': 'phased read port'},
+                  'rdp_addr_i[6:0]', 'rdp_data_o[31:0]']
+        write_gtkw(debug_file + '.gtkw',
+                   debug_file + '.vcd',
+                   traces, module='top', zoom=-22)
+        sim_writer = sim.write_vcd(debug_file + '.vcd')
+        with sim_writer:
+            sim.run()
+
+    def test_case(self):
+        """test both types (odd and even write ports) of phased memory"""
+        with self.subTest("writes happen on phase 0"):
+            self.do_test_case(0, True)
+        with self.subTest("writes happen on phase 1"):
+            self.do_test_case(1, True)
+        with self.subTest("writes happen on phase 0 (non-transparent reads)"):
+            self.do_test_case(0, False)
+        with self.subTest("writes happen on phase 1 (non-transparent reads)"):
+            self.do_test_case(1, False)
+
+    def do_test_formal(self, write_phase, transparent):
+        """
+        Formal proof of the pseudo 1W/2R regfile
+        """
+        m = Module()
+        # 128 x 32-bit, 8-bit granularity
+        dut = PhasedReadPhasedWriteFullReadSRAM(7, 32, 4, write_phase,
+                                                transparent)
+        m.submodules.dut = dut
+        gran = dut.data_width // dut.we_width  # granularity
+        # choose a single random memory location to test
+        a_const = AnyConst(dut.addr_width)
+        # choose a single byte lane to test
+        lane = AnyConst(range(dut.we_width))
+        # drive alternating phases
+        m.d.comb += Assume(dut.phase != Past(dut.phase))
+        # holding data register
+        d_reg = Signal(gran)
+        # for some reason, simulated formal memory is not zeroed at reset
+        # ... so, remember whether we wrote it, at least once.
+        wrote = Signal()
+        # if our memory location and byte lane is being written,
+        # capture the data in our holding register
+        with m.If((dut.wr_addr_i == a_const)
+                  & dut.wr_we_i.bit_select(lane, 1)
+                  & (dut.phase == dut.write_phase)):
+            m.d.sync += d_reg.eq(dut.wr_data_i.word_select(lane, gran))
+            m.d.sync += wrote.eq(1)
+        # if our memory location is being read,
+        # and the holding register has valid data,
+        # then its value must match the memory output, on the given lane
+        with m.If(Past(dut.rd_addr_i) == a_const):
+            if transparent:
+                with m.If(wrote):
+                    rd_lane = dut.rd_data_o.word_select(lane, gran)
+                    m.d.sync += Assert(d_reg == rd_lane)
+            else:
+                # with a non-transparent read port, the read value depends
+                # on whether there is a simultaneous write, or not
+                with m.If((Past(dut.wr_addr_i) == a_const)
+                          & Past(dut.phase) == dut.write_phase):
+                    # simultaneous write -> check against last written value
+                    with m.If(Past(wrote)):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(Past(d_reg) == rd_lane)
+                with m.Else():
+                    # otherwise, check against current written value
+                    with m.If(wrote):
+                        rd_lane = dut.rd_data_o.word_select(lane, gran)
+                        m.d.sync += Assert(d_reg == rd_lane)
+        # same for the phased read port, except it's always transparent
+        # and the port works only on the write phase
+        with m.If((Past(dut.rdp_addr_i) == a_const) & wrote
+                  & (Past(dut.phase) == dut.write_phase)):
+            rdp_lane = dut.rdp_data_o.word_select(lane, gran)
+            m.d.sync += Assert(d_reg == rdp_lane)
+
+        # pass our state to the device under test, so it can ensure that
+        # its state is in sync with ours, for induction
+        m.d.comb += [
+            # address and mask under test
+            dut.dbg_addr.eq(a_const),
+            dut.dbg_lane.eq(lane),
+            # state of our holding register
+            dut.dbg_data.eq(d_reg),
+            dut.dbg_wrote.eq(wrote),
+        ]
+
+        self.assertFormal(m, mode="prove", depth=3)
+
+    def test_formal(self):
+        """test both types (odd and even write ports) of phased write memory"""
+        with self.subTest("writes happen on phase 0"):
+            self.do_test_formal(0, False)
+        with self.subTest("writes happen on phase 1"):
+            self.do_test_formal(1, False)
+        # test again, with transparent read ports
+        with self.subTest("writes happen on phase 0 (transparent reads)"):
+            self.do_test_formal(0, True)
+        with self.subTest("writes happen on phase 1 (transparent reads)"):
+            self.do_test_formal(1, True)
+
+
+class DualPortXorRegfile(Elaboratable):
+    """
+    Builds, from a pair of phased 1W/2R blocks, a true 1W/1R RAM, where both
+    write and (non-transparent) read ports work every cycle.
+
+    It employs a XOR trick, as follows:
+
+    1) Like before, there are two memories, each reading on every cycle, and
+       writing on alternate cycles
+    2) Instead of a MUX, the read port is a direct XOR of the two memories.
+    3) Writes happens in two cycles:
+
+        First, read the current value of the *other* memory, at the write
+        location.
+
+        Then, on *this* memory, write that read value, XORed with the desired
+        value.
+
+    This recovers the desired value when read:
+    (other XOR desired) XOR other = desired
+
+    :param addr_width: width of the address bus
+    :param data_width: width of the data bus
+    :param we_width: number of write enable lines
+    :param transparent: whether a simultaneous read and write returns the
+                        new value (True) or the old value (False) on the full
+                        read port
+    """
+
+    def __init__(self, addr_width, data_width, we_width, transparent):
+        self.addr_width = addr_width
+        self.data_width = data_width
+        self.we_width = we_width
+        self.transparent = transparent
+        # interface signals
+        self.wr_addr_i = Signal(addr_width); """write port address"""
+        self.wr_data_i = Signal(data_width); """write port data"""
+        self.wr_we_i = Signal(we_width); """write port enable"""
+        self.rd_addr_i = Signal(addr_width); """read port address"""
+        self.rd_data_o = Signal(data_width); """read port data"""
+
+    def elaborate(self, platform):
+        m = Module()
+        # instantiate the two phased 1W/2R memory blocks
+        mem0 = PhasedReadPhasedWriteFullReadSRAM(
+            self.addr_width, self.data_width, self.we_width, 0, True)
+        mem1 = PhasedReadPhasedWriteFullReadSRAM(
+            self.addr_width, self.data_width, self.we_width, 1, True)
+        m.submodules.mem0 = mem0
+        m.submodules.mem1 = mem1
+        # generate and wire the phases for the phased memories
+        phase = Signal()
+        m.d.sync += phase.eq(~phase)
+        m.d.comb += [
+            mem0.phase.eq(phase),
+            mem1.phase.eq(phase),
+        ]
+        # store the write information for the next cycle
+        last_addr = Signal(self.addr_width)
+        last_we = Signal(self.we_width)
+        last_data = Signal(self.data_width)
+        m.d.sync += [
+            last_addr.eq(self.wr_addr_i),
+            last_we.eq(self.wr_we_i),
+            last_data.eq(self.wr_data_i),
+        ]
+        # read path
+        # wire read address to memories, and XOR their output
+        xor_data = Signal(self.data_width)
+        m.d.comb += [
+            mem0.rd_addr_i.eq(self.rd_addr_i),
+            mem1.rd_addr_i.eq(self.rd_addr_i),
+            xor_data.eq(mem0.rd_data_o ^ mem1.rd_data_o),
+        ]
+        if self.transparent:
+            # do the read and write addresses coincide?
+            same_read_write = Signal()
+            m.d.sync += same_read_write.eq(self.rd_addr_i == self.wr_addr_i)
+            gran = self.data_width // self.we_width
+            for i in range(self.we_width):
+                # when simultaneously reading and writing to the same location
+                # and write lane, bypass the memory, and output the write
+                # holding register instead
+                with m.If(same_read_write & last_we[i]):
+                    m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+                        last_data.word_select(i, gran))
+                # otherwise, output the xor data
+                with m.Else():
+                    m.d.comb += self.rd_data_o.word_select(i, gran).eq(
+                        xor_data.word_select(i, gran))
+        # when not transparent, just output the memory contents (xor data)
+        else:
+            m.d.comb += self.rd_data_o.eq(xor_data)
+        # write path
+        # 1) read the memory location which is about to be written
+        m.d.comb += [
+            mem0.rdp_addr_i.eq(self.wr_addr_i),
+            mem1.rdp_addr_i.eq(self.wr_addr_i),
+        ]
+        # 2) write the XOR of the other memory data, and the desired value
+        m.d.comb += [
+            mem0.wr_addr_i.eq(last_addr),
+            mem1.wr_addr_i.eq(last_addr),
+            mem0.wr_we_i.eq(last_we),
+            mem1.wr_we_i.eq(last_we),
+            mem0.wr_data_i.eq(last_data ^ mem1.rdp_data_o),
+            mem1.wr_data_i.eq(last_data ^ mem0.rdp_data_o),
+        ]
+        return m
+
+
+class DualPortXorRegfileTestCase(FHDLTestCase):
+
+    def do_test_case(self, transparent):
+        """
+        Simulate some read/write/modify operations on the dual port register
+        file
+        """
+        dut = DualPortXorRegfile(7, 32, 4, transparent)
+        sim = Simulator(dut)
+        sim.add_clock(1e-6)
+
+        expected = None
+        last_expected = None
+
+        # compare read data with previously written data
+        # and start a new read
+        def read(rd_addr_i, next_expected=None):
+            nonlocal expected, last_expected
+            if expected is not None:
+                self.assertEqual((yield dut.rd_data_o), expected)
+            yield dut.rd_addr_i.eq(rd_addr_i)
+            # account for the read latency
+            expected = last_expected
+            last_expected = next_expected
+
+        # start a write
+        def write(wr_addr_i, wr_we_i, wr_data_i):
+            yield dut.wr_addr_i.eq(wr_addr_i)
+            yield dut.wr_we_i.eq(wr_we_i)
+            yield dut.wr_data_i.eq(wr_data_i)
+
+        def process():
+            # write a pair of values, one for each memory
+            yield from read(0)
+            yield from write(0x42, 0b1111, 0x87654321)
+            yield
+            yield from read(0x42, 0x87654321)
+            yield from write(0x43, 0b1111, 0x0FEDCBA9)
+            yield
+            # skip a beat
+            yield from read(0x43, 0x0FEDCBA9)
+            yield from write(0, 0, 0)
+            yield
+            # write again, but now they switch memories
+            yield from read(0)
+            yield from write(0x42, 0b1111, 0x12345678)
+            yield
+            yield from read(0x42, 0x12345678)
+            yield from write(0x43, 0b1111, 0x9ABCDEF0)
+            yield
+            yield from read(0x43, 0x9ABCDEF0)
+            yield from write(0, 0, 0)
+            yield
+            # test partial writes
+            yield from read(0)
+            yield from write(0x42, 0b1001, 0x78FFFF12)
+            yield
+            yield from read(0)
+            yield from write(0x43, 0b0110, 0xFFDEABFF)
+            yield
+            yield from read(0x42, 0x78345612)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0x43, 0x9ADEABF0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from write(0, 0, 0)
+            yield
+            # test simultaneous read and write
+            if transparent:
+                # transparent reads, returns the new value
+                yield from read(0x42, 0x78AA5666)
+            else:
+                # non-transparent read: returns the old value
+                yield from read(0x42, 0x78345612)
+            yield from write(0x42, 0b0101, 0x55AA9966)
+            yield
+            # after a cycle, returns the new value
+            yield from read(0x42, 0x78AA5666)
+            yield from write(0, 0, 0)
+            yield
+            # settle down
+            yield from read(0)
+            yield from write(0, 0, 0)
+            yield
+            yield from read(0)
+            yield from write(0, 0, 0)
+
+        sim.add_sync_process(process)
+        debug_file = 'test_dual_port_xor_regfile'
+        if transparent:
+            debug_file += '_transparent'
+        traces = ['clk', 'phase',
+                  {'comment': 'write port'},
+                  'wr_addr_i[6:0]', 'wr_we_i[3:0]', 'wr_data_i[31:0]',
+                  {'comment': 'read port'},
+                  'rd_addr_i[6:0]', 'rd_data_o[31:0]',
+                  ]
+        write_gtkw(debug_file + '.gtkw',
+                   debug_file + '.vcd',
+                   traces, module='top', zoom=-22)
+        sim_writer = sim.write_vcd(debug_file + '.vcd')
+        with sim_writer:
+            sim.run()
+
+    def test_case(self):
+        with self.subTest("non-transparent reads"):
+            self.do_test_case(False)
+        with self.subTest("transparent reads"):
+            self.do_test_case(True)
+
+
+if __name__ == "__main__":
+    unittest.main()
index f4393e152d2030de442cc124deca0e707adc7bfb..78a6124020641edca4ceae3dd89a76b6b40fb683 100644 (file)
@@ -18,60 +18,67 @@ from soc.regfile.regfile import RegFileArray
 
 
 class VirtualRegPort(RegFileArray):
-    def __init__(self, bitwidth, n_regs, rd2=False):
+    def __init__(self, bitwidth, n_regs, rd2=False, wr2=False, synced=True):
         self.bitwidth = bitwidth
         self.nregs = n_regs
         self.rd2 = rd2 # eurgh hack
+        self.wr2 = wr2 # eurgh hack
         self.regwidth = regwidth = bitwidth // n_regs
-        super().__init__(self.regwidth, n_regs)
+        super().__init__(self.regwidth, n_regs, synced=synced)
 
         # "full" depth variant of the "external" port
         self.full_wr = RecordObject([("wen", n_regs),
-                                     ("data_i", bitwidth)],  # *full* wid
+                                     ("i_data", bitwidth)],  # *full* wid
                                     name="full_wr")
         self.full_rd = RecordObject([("ren", n_regs),
-                                     ("data_o", bitwidth)],  # *full* wid
+                                     ("o_data", bitwidth)],  # *full* wid
                                     name="full_rd")
-        if not rd2:
-            return
-        self.full_rd2 = RecordObject([("ren", n_regs),
-                                     ("data_o", bitwidth)],  # *full* wid
+        if wr2:
+            self.full_wr2 = RecordObject([("wen", n_regs),
+                                     ("i_data", bitwidth)],  # *full* wid
+                                    name="full_wr2")
+        if rd2:
+            self.full_rd2 = RecordObject([("ren", n_regs),
+                                     ("o_data", bitwidth)],  # *full* wid
                                     name="full_rd2")
 
+    def connect_full_wr(self, m, wfull, name):
+        comb = m.d.comb
+        wr_regs = self.write_reg_port(name)
+
+        # wire up the enable signals from the large (full) port
+        l = map(lambda port: port.i_data, wr_regs)
+        le = map(lambda port: port.wen, wr_regs)  # get port wen(s)
+
+        # get list of all i_data (and wens) and assign to them via Cat
+        comb += Cat(*l).eq(wfull.i_data)
+        comb += Cat(*le).eq(wfull.wen)
+
     def connect_full_rd(self, m, rfull, name):
         comb = m.d.comb
         rd_regs = self.read_reg_port(name)
 
         # wire up the enable signals and chain-accumulate the data
-        l = map(lambda port: port.data_o, rd_regs)  # get port data(s)
+        l = map(lambda port: port.o_data, rd_regs)  # get port data(s)
         le = map(lambda port: port.ren, rd_regs)  # get port ren(s)
 
-        comb += rfull.data_o.eq(Cat(*l))  # we like Cat on lists
+        comb += rfull.o_data.eq(Cat(*l))  # we like Cat on lists
         comb += Cat(*le).eq(rfull.ren)
 
     def elaborate(self, platform):
         m = super().elaborate(platform)
         comb = m.d.comb
 
-        # for internal use only.
-        wr_regs = self.write_reg_port(f"w")
+        # connect up full write port
+        self.connect_full_wr(m, self.full_wr, "w")
+        if self.wr2:
+            self.connect_full_wr(m, self.full_wr2, "w2")
 
         # connect up full read port
         self.connect_full_rd(m, self.full_rd, "r")
         if self.rd2: # hack!
             self.connect_full_rd(m, self.full_rd2, "r2")
 
-        # connect up full write port
-        wfull = self.full_wr
-
-        # wire up the enable signals from the large (full) port
-        l = map(lambda port: port.data_i, wr_regs)
-        le = map(lambda port: port.wen, wr_regs)  # get port wen(s)
-
-        # get list of all data_i (and wens) and assign to them via Cat
-        comb += Cat(*l).eq(wfull.data_i)
-        comb += Cat(*le).eq(wfull.wen)
-
         return m
 
     def __iter__(self):
@@ -82,14 +89,14 @@ class VirtualRegPort(RegFileArray):
 
 def regfile_array_sim(dut, rp1, rp2, rp3, wp):
     # part-port write
-    yield wp.data_i.eq(2)
+    yield wp.i_data.eq(2)
     yield wp.wen.eq(1 << 1)
     yield
     yield wp.wen.eq(0)
     # part-port read
     yield rp1.ren.eq(1 << 1)
     yield
-    data = yield rp1.data_o
+    data = yield rp1.o_data
     print(data)
     assert data == 2
 
@@ -97,38 +104,38 @@ def regfile_array_sim(dut, rp1, rp2, rp3, wp):
     yield rp1.ren.eq(1 << 5)
     yield rp2.ren.eq(1 << 1)
     yield wp.wen.eq(1 << 5)
-    yield wp.data_i.eq(6)
+    yield wp.i_data.eq(6)
     yield
     yield wp.wen.eq(0)
     yield rp1.ren.eq(0)
     yield rp2.ren.eq(0)
-    data1 = yield rp1.data_o
+    data1 = yield rp1.o_data
     print(data1)
     assert data1 == 6, data1
-    data2 = yield rp2.data_o
+    data2 = yield rp2.o_data
     print(data2)
     assert data2 == 2, data2
     yield
-    data = yield rp1.data_o
+    data = yield rp1.o_data
     print(data)
 
     # full port read (whole reg)
     yield dut.full_rd.ren.eq(0xff)
     yield
     yield dut.full_rd.ren.eq(0)
-    data = yield dut.full_rd.data_o
+    data = yield dut.full_rd.o_data
     print(hex(data))
 
     # full port read (part reg)
     yield dut.full_rd.ren.eq(0x1 << 5)
     yield
     yield dut.full_rd.ren.eq(0)
-    data = yield dut.full_rd.data_o
+    data = yield dut.full_rd.o_data
     print(hex(data))
 
     # full port part-write (part masked reg)
     yield dut.full_wr.wen.eq(0x1 << 1)
-    yield dut.full_wr.data_i.eq(0xe0)
+    yield dut.full_wr.i_data.eq(0xe0)
     yield
     yield dut.full_wr.wen.eq(0x0)
 
@@ -136,12 +143,12 @@ def regfile_array_sim(dut, rp1, rp2, rp3, wp):
     yield dut.full_rd.ren.eq(0xff)
     yield
     yield dut.full_rd.ren.eq(0)
-    data = yield dut.full_rd.data_o
+    data = yield dut.full_rd.o_data
     print(hex(data))
 
     # full port write
     yield dut.full_wr.wen.eq(0xff)
-    yield dut.full_wr.data_i.eq(0xcafeface)
+    yield dut.full_wr.i_data.eq(0xcafeface)
     yield
     yield dut.full_wr.wen.eq(0x0)
 
@@ -149,17 +156,17 @@ def regfile_array_sim(dut, rp1, rp2, rp3, wp):
     yield dut.full_rd.ren.eq(0xff)
     yield
     yield dut.full_rd.ren.eq(0)
-    data = yield dut.full_rd.data_o
+    data = yield dut.full_rd.o_data
     print(hex(data))
 
     # part write
-    yield wp.data_i.eq(2)
+    yield wp.i_data.eq(2)
     yield wp.wen.eq(1 << 1)
     yield
     yield wp.wen.eq(0)
     yield rp1.ren.eq(1 << 1)
     yield
-    data = yield rp1.data_o
+    data = yield rp1.o_data
     print(hex(data))
     assert data == 2
 
@@ -167,7 +174,7 @@ def regfile_array_sim(dut, rp1, rp2, rp3, wp):
     yield dut.full_rd.ren.eq(0xff)
     yield
     yield dut.full_rd.ren.eq(0)
-    data = yield dut.full_rd.data_o
+    data = yield dut.full_rd.o_data
     print(hex(data))
 
     # simultaneous read/write: full-write, part-write, 3x part-read
@@ -175,22 +182,22 @@ def regfile_array_sim(dut, rp1, rp2, rp3, wp):
     yield rp2.ren.eq(1 << 1)
     yield rp3.ren.eq(1 << 3)
     yield wp.wen.eq(1 << 3)
-    yield wp.data_i.eq(6)
+    yield wp.i_data.eq(6)
     yield dut.full_wr.wen.eq((1 << 1) | (1 << 5))
-    yield dut.full_wr.data_i.eq((0xa << (1*4)) | (0x3 << (5*4)))
+    yield dut.full_wr.i_data.eq((0xa << (1*4)) | (0x3 << (5*4)))
     yield
     yield dut.full_wr.wen.eq(0)
     yield wp.wen.eq(0)
     yield rp1.ren.eq(0)
     yield rp2.ren.eq(0)
     yield rp3.ren.eq(0)
-    data1 = yield rp1.data_o
+    data1 = yield rp1.o_data
     print(hex(data1))
     assert data1 == 0x3
-    data2 = yield rp2.data_o
+    data2 = yield rp2.o_data
     print(hex(data2))
     assert data2 == 0xa
-    data3 = yield rp3.data_o
+    data3 = yield rp3.o_data
     print(hex(data3))
     assert data3 == 0x6
 
index eee2839806008a0d30667f3ee475144433f5f496..66adafa45d922372a34203eb07bd257a835f7175 100644 (file)
@@ -33,7 +33,7 @@ Notes:
 
 from nmigen.compat.sim import run_simulation, Settle
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Const, Array, Cat, Elaboratable, Repl
+from nmigen import Module, Signal, Const, Cat, Elaboratable, Repl
 from nmigen.lib.coding import Decoder
 from nmigen.utils import log2_int
 
@@ -48,14 +48,14 @@ class PartialAddrMatch(Elaboratable):
         self.n_adr = n_adr
         self.bitwid = bitwid
         # inputs
-        self.addrs_i = Array(Signal(bitwid, name="addr") for i in range(n_adr))
+        self.addrs_i = tuple(Signal(bitwid, name="addr") for i in range(n_adr))
         # self.addr_we_i = Signal(n_adr, reset_less=True) # write-enable
         self.addr_en_i = Signal(n_adr, reset_less=True)  # address latched in
         self.addr_rs_i = Signal(n_adr, reset_less=True)  # address deactivated
 
         # output: a nomatch for each address plus individual nomatch signals
         self.addr_nomatch_o = Signal(n_adr, name="nomatch_o", reset_less=True)
-        self.addr_nomatch_a_o = Array(Signal(n_adr, reset_less=True,
+        self.addr_nomatch_a_o = tuple(Signal(n_adr, reset_less=True,
                                              name="nomatch_array_o")
                                       for i in range(n_adr))
 
@@ -69,7 +69,7 @@ class PartialAddrMatch(Elaboratable):
 
         # array of address-latches
         m.submodules.l = self.l = l = SRLatch(llen=self.n_adr, sync=False)
-        self.adrs_r = adrs_r = Array(Signal(self.bitwid, reset_less=True,
+        self.adrs_r = adrs_r = tuple(Signal(self.bitwid, reset_less=True,
                                             name="a_r")
                                      for i in range(self.n_adr))
 
@@ -183,14 +183,14 @@ class TwinPartialAddrBitmap(PartialAddrMatch):
 
         # input: length of the LOAD/STORE
         expwid = 1+self.lsbwid  # XXX assume LD/ST no greater than 8
-        self.lexp_i = Array(Signal(1 << expwid, reset_less=True,
+        self.lexp_i = tuple(Signal(1 << expwid, reset_less=True,
                                    name="len") for i in range(n_adr))
         # input: full address
-        self.faddrs_i = Array(Signal(bitlen, reset_less=True,
+        self.faddrs_i = tuple(Signal(bitlen, reset_less=True,
                                      name="fadr") for i in range(n_adr))
 
         # registers for expanded len
-        self.len_r = Array(Signal(expwid, reset_less=True, name="l_r")
+        self.len_r = tuple(Signal(expwid, reset_less=True, name="l_r")
                            for i in range(self.n_adr))
 
     def elaborate(self, platform):
@@ -268,20 +268,20 @@ class PartialAddrBitmap(PartialAddrMatch):
         PartialAddrMatch.__init__(self, n_adr, self.midlen)
 
         # input: length of the LOAD/STORE
-        self.len_i = Array(Signal(lsbwid, reset_less=True,
+        self.len_i = tuple(Signal(lsbwid, reset_less=True,
                                   name="len") for i in range(n_adr))
         # input: full address
-        self.faddrs_i = Array(Signal(bitlen, reset_less=True,
+        self.faddrs_i = tuple(Signal(bitlen, reset_less=True,
                                      name="fadr") for i in range(n_adr))
 
         # intermediary: address + 1
-        self.addr1s = Array(Signal(self.midlen, reset_less=True,
+        self.addr1s = tuple(Signal(self.midlen, reset_less=True,
                                    name="adr1")
                             for i in range(n_adr))
 
         # expanded lengths, needed in match
         expwid = 1+self.lsbwid  # XXX assume LD/ST no greater than 8
-        self.lexp = Array(Signal(1 << expwid, reset_less=True,
+        self.lexp = tuple(Signal(1 << expwid, reset_less=True,
                                  name="a_l")
                           for i in range(self.n_adr))
 
@@ -291,7 +291,7 @@ class PartialAddrBitmap(PartialAddrMatch):
 
         # intermediaries
         adrs_r, l = self.adrs_r, self.l
-        len_r = Array(Signal(self.lsbwid, reset_less=True,
+        len_r = tuple(Signal(self.lsbwid, reset_less=True,
                              name="l_r")
                       for i in range(self.n_adr))
 
index 3d197f2e9daffabccce04d9b136d0d86b01e4b90..dd050b3bb1c9f321264147f4d26286039d3d3105 100644 (file)
@@ -8,7 +8,7 @@ Links:
 
 #from soc.experiment.pimem import PortInterface
 
-from nmigen import Elaboratable, Module, Signal, Record, Array, Const, Cat
+from nmigen import Elaboratable, Module, Signal, Record, Const, Cat
 from nmutil.latch import SRLatch, latchregister
 from nmigen.back.pysim import Simulator, Delay
 from nmigen.cli import verilog, rtlil
@@ -27,19 +27,19 @@ class LDLatch(Elaboratable):
     def __init__(self, dwidth, awidth, mlen):
         self.addr_i = Signal(awidth, reset_less=True)
         self.mask_i = Signal(mlen, reset_less=True)
-        self.valid_i = Signal(reset_less=True)
+        self.i_valid = Signal(reset_less=True)
         self.ld_i = LDData(dwidth, "ld_i")
         self.ld_o = LDData(dwidth, "ld_o")
-        self.valid_o = Signal(reset_less=True)
+        self.o_valid = Signal(reset_less=True)
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
         m.submodules.in_l = in_l = SRLatch(sync=False, name="in_l")
 
-        comb += in_l.s.eq(self.valid_i)
-        comb += self.valid_o.eq(in_l.q & self.valid_i)
-        latchregister(m, self.ld_i, self.ld_o, in_l.q & self.valid_o, "ld_i_r")
+        comb += in_l.s.eq(self.i_valid)
+        comb += self.o_valid.eq(in_l.q & self.i_valid)
+        latchregister(m, self.ld_i, self.ld_o, in_l.q & self.o_valid, "ld_i_r")
 
         return m
 
@@ -50,8 +50,8 @@ class LDLatch(Elaboratable):
         yield self.ld_i.data
         yield self.ld_o.err
         yield self.ld_o.data
-        yield self.valid_i
-        yield self.valid_o
+        yield self.i_valid
+        yield self.o_valid
 
     def ports(self):
         return list(self)
@@ -83,8 +83,8 @@ class LDSTSplitter(Elaboratable):
         self.addr_i = Signal(awidth, reset_less=True)
         # no match in PortInterface
         self.len_i = Signal(dlen, reset_less=True)
-        self.valid_i = Signal(reset_less=True)
-        self.valid_o = Signal(reset_less=True)
+        self.i_valid = Signal(reset_less=True)
+        self.o_valid = Signal(reset_less=True)
 
         self.is_ld_i = Signal(reset_less=True)
         self.is_st_i = Signal(reset_less=True)
@@ -95,14 +95,14 @@ class LDSTSplitter(Elaboratable):
         self.exc = Signal(reset_less=True) # pi.exc TODO
         # TODO : create/connect two outgoing port interfaces
 
-        self.sld_valid_o = Signal(2, reset_less=True)
-        self.sld_valid_i = Signal(2, reset_less=True)
-        self.sld_data_i = Array((LDData(cline_wid, "ld_data_i1"),
+        self.sld_o_valid = Signal(2, reset_less=True)
+        self.sld_i_valid = Signal(2, reset_less=True)
+        self.sld_data_i = tuple((LDData(cline_wid, "ld_data_i1"),
                                  LDData(cline_wid, "ld_data_i2")))
 
-        self.sst_valid_o = Signal(2, reset_less=True)
-        self.sst_valid_i = Signal(2, reset_less=True)
-        self.sst_data_o = Array((LDData(cline_wid, "st_data_i1"),
+        self.sst_o_valid = Signal(2, reset_less=True)
+        self.sst_i_valid = Signal(2, reset_less=True)
+        self.sst_data_o = tuple((LDData(cline_wid, "st_data_i1"),
                                  LDData(cline_wid, "st_data_i2")))
 
     def elaborate(self, platform):
@@ -151,22 +151,22 @@ class LDSTSplitter(Elaboratable):
             # set up connections to LD-split.  note: not active if mask is zero
             for i, (ld, mask) in enumerate(((ld1, mask1),
                                             (ld2, mask2))):
-                ld_valid = Signal(name="ldvalid_i%d" % i, reset_less=True)
-                comb += ld_valid.eq(self.valid_i & self.sld_valid_i[i])
-                comb += ld.valid_i.eq(ld_valid & (mask != mzero))
+                ld_valid = Signal(name="ldi_valid%d" % i, reset_less=True)
+                comb += ld_valid.eq(self.i_valid & self.sld_i_valid[i])
+                comb += ld.i_valid.eq(ld_valid & (mask != mzero))
                 comb += ld.ld_i.eq(self.sld_data_i[i])
-                comb += self.sld_valid_o[i].eq(ld.valid_o)
+                comb += self.sld_o_valid[i].eq(ld.o_valid)
 
             # sort out valid: mask2 zero we ignore 2nd LD
             with m.If(mask2 == mzero):
-                comb += self.valid_o.eq(self.sld_valid_o[0])
+                comb += self.o_valid.eq(self.sld_o_valid[0])
             with m.Else():
-                comb += self.valid_o.eq(self.sld_valid_o.all())
+                comb += self.o_valid.eq(self.sld_o_valid.all())
             ## debug output -- output mask2 and mzero
             ## guess second port is invalid
 
             # all bits valid (including when data error occurs!) decode ld1/ld2
-            with m.If(self.valid_o):
+            with m.If(self.o_valid):
                 # errors cause error condition
                 comb += self.ld_data_o.err.eq(ld1.ld_o.err | ld2.ld_o.err)
 
@@ -179,10 +179,10 @@ class LDSTSplitter(Elaboratable):
             # set busy flag -- required for unit test
             for i, (ld, mask) in enumerate(((ld1, mask1),
                                             (ld2, mask2))):
-                valid = Signal(name="stvalid_i%d" % i, reset_less=True)
-                comb += valid.eq(self.valid_i & self.sst_valid_i[i])
-                comb += ld.valid_i.eq(valid & (mask != mzero))
-                comb += self.sld_valid_o[i].eq(ld.valid_o)
+                valid = Signal(name="sti_valid%d" % i, reset_less=True)
+                comb += valid.eq(self.i_valid & self.sst_i_valid[i])
+                comb += ld.i_valid.eq(valid & (mask != mzero))
+                comb += self.sld_o_valid[i].eq(ld.o_valid)
                 comb += self.sst_data_o[i].data.eq(ld.ld_o.data)
 
             comb += ld1.ld_i.eq((self.st_data_i << (ashift1*8)) & mask1)
@@ -190,12 +190,12 @@ class LDSTSplitter(Elaboratable):
 
             # sort out valid: mask2 zero we ignore 2nd LD
             with m.If(mask2 == mzero):
-                comb += self.valid_o.eq(self.sst_valid_o[0])
+                comb += self.o_valid.eq(self.sst_o_valid[0])
             with m.Else():
-                comb += self.valid_o.eq(self.sst_valid_o.all())
+                comb += self.o_valid.eq(self.sst_o_valid.all())
 
             # all bits valid (including when data error occurs!) decode ld1/ld2
-            with m.If(self.valid_o):
+            with m.If(self.o_valid):
                 # errors cause error condition
                 comb += self.st_data_i.err.eq(ld1.ld_o.err | ld2.ld_o.err)
 
@@ -207,9 +207,9 @@ class LDSTSplitter(Elaboratable):
         yield self.is_ld_i
         yield self.ld_data_o.err
         yield self.ld_data_o.data
-        yield self.valid_i
-        yield self.valid_o
-        yield self.sld_valid_i
+        yield self.i_valid
+        yield self.o_valid
+        yield self.sld_i_valid
         for i in range(2):
             yield self.sld_data_i[i].err
             yield self.sld_data_i[i].data
@@ -247,11 +247,11 @@ def sim(dut):
         yield dut.is_ld_i.eq(1)
         yield dut.len_i.eq(ld_len)
         yield dut.addr_i.eq(addr)
-        yield dut.valid_i.eq(1)
+        yield dut.i_valid.eq(1)
         print("waiting")
         while True:
-            valid_o = yield dut.valid_o
-            if valid_o:
+            o_valid = yield dut.o_valid
+            if o_valid:
                 break
             yield
         exc = yield dut.exc
@@ -267,8 +267,8 @@ def sim(dut):
     def lds():
         print("lds")
         while True:
-            valid_i = yield dut.valid_i
-            if valid_i:
+            i_valid = yield dut.i_valid
+            if i_valid:
                 break
             yield
 
@@ -281,10 +281,10 @@ def sim(dut):
         data2 = (shfdata >> 128) & dmask1
         print("ld data2", 1 << dlen, hex(data >> (1 << dlen)), hex(data2))
         yield dut.sld_data_i[0].data.eq(data1)
-        yield dut.sld_valid_i[0].eq(1)
+        yield dut.sld_i_valid[0].eq(1)
         yield
         yield dut.sld_data_i[1].data.eq(data2)
-        yield dut.sld_valid_i[1].eq(1)
+        yield dut.sld_i_valid[1].eq(1)
         yield
 
     sim.add_sync_process(lds)
index c6cf4259b6f23c3eac88b81553894c20d973efdb..a105d1717bcff381ad34a5dc17d3626d70928463 100644 (file)
@@ -1,9 +1,14 @@
+# (DO NOT REMOVE THESE NOTICES)
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2019, 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Part of the Libre-SOC Project.
+# Sponsored by NLnet       EU Grant No: 825310 and 825322
+# Sponsored by NGI POINTER EU Grant No: 871528
+
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
+from nmigen import Module, Signal, Elaboratable, Cat, Repl
 from nmutil.latch import SRLatch
-from functools import reduce
-from operator import or_
 
 
 class DependencyRow(Elaboratable):
@@ -27,10 +32,11 @@ class DependencyRow(Elaboratable):
         asynchronous) would be reset at the exact moment that GO was requested,
         and the RSEL would be garbage.
     """
-    def __init__(self, n_reg, n_src, cancel_mode=False):
+    def __init__(self, n_reg, n_src, n_dst, cancel_mode=False):
         self.cancel_mode = cancel_mode
         self.n_reg = n_reg
         self.n_src = n_src
+        self.n_dst = n_dst
         # arrays
         src = []
         rsel = []
@@ -40,11 +46,19 @@ class DependencyRow(Elaboratable):
             src.append(Signal(n_reg, name="src%d" % j, reset_less=True))
             rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True))
             fwd.append(Signal(n_reg, name="src%d_fwd_o" % j, reset_less=True))
+        dst = []
+        dsel = []
+        dfwd = []
+        for i in range(n_dst):
+            j = i + 1 # name numbering to match src1/src2
+            dst.append(Signal(n_reg, name="dst%d" % j, reset_less=True))
+            dsel.append(Signal(n_reg, name="dst%d_rsel_o" % j, reset_less=True))
+            dfwd.append(Signal(n_reg, name="dst%d_fwd_o" % j, reset_less=True))
 
         # inputs
-        self.dest_i = Signal(n_reg, reset_less=True)     # Dest in (top)
-        self.src_i = Array(src)     # operands in (top)
-        self.issue_i = Signal(reset_less=True)    # Issue in (top)
+        self.dst_i = tuple(dst)                # Dest in (top)
+        self.src_i = tuple(src)                # operands in (top)
+        self.issue_i = Signal(reset_less=True) # Issue in (top)
 
         self.rd_pend_i = Signal(n_reg, reset_less=True) # Read pend in (top)
         self.wr_pend_i = Signal(n_reg, reset_less=True) # Write pend in (top)
@@ -59,20 +73,32 @@ class DependencyRow(Elaboratable):
             self.go_die_i = Signal(reset_less=True) # Go Die in (left)
 
         # for Register File Select Lines (vertical)
-        self.dest_rsel_o = Signal(n_reg, reset_less=True)  # dest reg sel (bot)
-        self.src_rsel_o = Array(rsel)   # src reg sel (bot)
+        self.dst_rsel_o = tuple(dsel)         # dest reg sel (bot)
+        self.src_rsel_o = tuple(rsel)         # src reg sel (bot)
 
         # for Function Unit "forward progress" (horizontal)
-        self.dest_fwd_o = Signal(n_reg, reset_less=True)   # dest FU fw (right)
-        self.src_fwd_o = Array(fwd)    # src FU fw (right)
+        self.dst_fwd_o = tuple(dfwd)        # dest FU fw (right)
+        self.src_fwd_o = tuple(fwd)         # src FU fw (right)
+
+        # for temporary (transitional) compatibility with old API
+        # number of dests used to be 1 (one) - increasing to n_dst
+        self.dest_i = self.dst_i[0]
+        self.dest_rsel_o = self.dst_rsel_o[0]
+        self.dest_fwd_o = self.dst_fwd_o[0]
 
     def elaborate(self, platform):
         m = Module()
-        m.submodules.dest_c = dest_c = SRLatch(sync=False, llen=self.n_reg)
+        # create source and dest SRLatches
+        dst_c = []
+        for i in range(self.n_dst):
+            dst_l = SRLatch(sync=False, llen=self.n_reg)
+            m.submodules["dst%d_c" % (i+1)] = dst_l
+            dst_c.append(dst_l)
+
         src_c = []
         for i in range(self.n_src):
             src_l = SRLatch(sync=False, llen=self.n_reg)
-            setattr(m.submodules, "src%d_c" % (i+1), src_l)
+            m.submodules["src%d_c" % (i+1)] = src_l
             src_c.append(src_l)
 
         # connect go_rd / go_wr (dest->wr, src->rd)
@@ -84,25 +110,29 @@ class DependencyRow(Elaboratable):
             go_die = Repl(self.go_die_i, self.n_reg)
         m.d.comb += wr_die.eq(Repl(self.go_wr_i, self.n_reg) | go_die)
         m.d.comb += rd_die.eq(Repl(self.go_rd_i, self.n_reg) | go_die)
-        m.d.comb += dest_c.r.eq(wr_die)
+        for i in range(self.n_dst):
+            m.d.comb += dst_c[i].r.eq(wr_die)
         for i in range(self.n_src):
             m.d.comb += src_c[i].r.eq(rd_die)
 
         # connect input reg bit (unary)
         i_ext = Repl(self.issue_i, self.n_reg)
-        m.d.comb += dest_c.s.eq(i_ext & self.dest_i)
+        for i in range(self.n_dst):
+            m.d.comb += dst_c[i].s.eq(i_ext & self.dst_i[i])
         for i in range(self.n_src):
             m.d.comb += src_c[i].s.eq(i_ext & self.src_i[i])
 
         # connect up hazard checks: read-after-write and write-after-read
-        m.d.comb += self.dest_fwd_o.eq(dest_c.q & self.rd_pend_i)
+        for i in range(self.n_dst):
+            m.d.comb += self.dst_fwd_o[i].eq(dst_c[i].q & self.rd_pend_i)
         for i in range(self.n_src):
             m.d.comb += self.src_fwd_o[i].eq(src_c[i].q & self.wr_pend_i)
 
         # connect reg-sel outputs
         rd_ext = Repl(self.go_rd_i, self.n_reg)
         wr_ext = Repl(self.go_wr_i, self.n_reg)
-        m.d.comb += self.dest_rsel_o.eq(dest_c.qlq & wr_ext)
+        for i in range(self.n_dst):
+            m.d.comb += self.dst_rsel_o[i].eq(dst_c[i].qlq & wr_ext)
         for i in range(self.n_src):
             m.d.comb += self.src_rsel_o[i].eq(src_c[i].qlq & rd_ext)
 
@@ -111,13 +141,16 @@ class DependencyRow(Elaboratable):
         src_q = []
         for i in range(self.n_src):
             src_q.append(src_c[i].qlq)
-        m.d.comb += self.v_rd_rsel_o.eq(reduce(or_, src_q))
-        m.d.comb += self.v_wr_rsel_o.eq(dest_c.qlq)
+        m.d.comb += self.v_rd_rsel_o.eq(Cat(*src_q).bool())
+        dst_q = []
+        for i in range(self.n_dst):
+            dst_q.append(dst_c[i].qlq)
+        m.d.comb += self.v_wr_rsel_o.eq(Cat(*dst_q).bool())
 
         return m
 
     def __iter__(self):
-        yield self.dest_i
+        yield from self.dst_i
         yield from self.src_i
         yield self.rd_pend_i
         yield self.wr_pend_i
@@ -125,22 +158,23 @@ class DependencyRow(Elaboratable):
         yield self.go_wr_i
         yield self.go_rd_i
         yield self.go_die_i
-        yield self.dest_rsel_o
+        yield from self.dst_rsel_o
         yield from self.src_rsel_o
-        yield self.dest_fwd_o
+        yield from self.dst_fwd_o
         yield from self.src_fwd_o
 
     def ports(self):
         return list(self)
 
 
+# XXX not up-to-date but hey
 def dcell_sim(dut):
     yield dut.dest_i.eq(1)
     yield dut.issue_i.eq(1)
     yield
     yield dut.issue_i.eq(0)
     yield
-    yield dut.src1_i.eq(1)
+    yield dut.src_i[0].eq(1)
     yield dut.issue_i.eq(1)
     yield
     yield
@@ -157,7 +191,7 @@ def dcell_sim(dut):
     yield
 
 def test_dcell():
-    dut = DependencyRow(4, 2, True)
+    dut = DependencyRow(4, 2, 2, True)
     vl = rtlil.convert(dut, ports=dut.ports())
     with open("test_drow.il", "w") as f:
         f.write(vl)
index 8b55eed1640df84e1dc5a50c4e9e3e906a551032..e6d4c341fbffaef7cfe1b4df0f0e616780840979 100644 (file)
@@ -1,6 +1,6 @@
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Cat, Array, Const, Elaboratable
+from nmigen import Module, Signal, Cat, Const, Elaboratable
 from nmigen.lib.coding import Decoder
 
 from nmutil.latch import SRLatch, latchregister
@@ -29,7 +29,7 @@ class FnUnit(Elaboratable):
         * dest_i / src1_i / src2_i are in *binary*, whereas...
         * ...g_rd_pend_i / g_wr_pend_i and rd_pend_o / wr_pend_o are UNARY
         * req_rel_i (request release) is the direct equivalent of pipeline
-                    "output valid" (valid_o)
+                    "output valid" (o_valid)
         * recover is a local python variable (actually go_die_o)
         * when shadow_wid = 0, recover and shadown are Consts (i.e. do nothing)
         * wr_pend is set False for the majority of uses: however for
@@ -46,7 +46,7 @@ class FnUnit(Elaboratable):
         if n_dests > 1:
             self.rfile_sel_i = Signal(range(n_dests), reset_less=True)
         else:
-            self.rfile_sel_i = Const(0)  # no selection.  gets Array[0]
+            self.rfile_sel_i = Const(0)  # no selection.  gets 0
         self.dest_i = Signal(range(wid), reset_less=True)  # Dest R# in (top)
         self.src1_i = Signal(range(wid), reset_less=True)  # oper1 R# in (top)
         self.src2_i = Signal(range(wid), reset_less=True)  # oper2 R# in (top)
@@ -56,7 +56,7 @@ class FnUnit(Elaboratable):
         self.go_rd_i = Signal(reset_less=True)  # Go Read in (left)
         self.req_rel_i = Signal(reset_less=True)  # request release (left)
 
-        self.g_xx_pend_i = Array(Signal(wid, reset_less=True, name="g_pend_i")
+        self.g_xx_pend_i = tuple(Signal(wid, reset_less=True, name="g_pend_i")
                                  for i in range(n_dests))  # global rd (right)
         self.g_wr_pend_i = Signal(wid, reset_less=True)  # global wr (right)
 
@@ -68,14 +68,14 @@ class FnUnit(Elaboratable):
 
         # outputs
         self.readable_o = Signal(reset_less=True)  # Readable out (right)
-        self.writable_o = Array(Signal(reset_less=True, name="writable_o")
+        self.writable_o = tuple(Signal(reset_less=True, name="writable_o")
                                 for i in range(n_dests))  # writable out (right)
         self.busy_o = Signal(reset_less=True)  # busy out (left)
 
         self.src1_pend_o = Signal(wid, reset_less=True)  # src1 pending
         self.src2_pend_o = Signal(wid, reset_less=True)  # src1 pending
         self.rd_pend_o = Signal(wid, reset_less=True)  # rd pending (right)
-        self.xx_pend_o = Array(Signal(wid, reset_less=True, name="pend_o")
+        self.xx_pend_o = tuple(Signal(wid, reset_less=True, name="pend_o")
                                for i in range(n_dests))  # wr pending (right)
 
     def elaborate(self, platform):
index cc2c1b9658d59a06f5ac72e48f563dca097c28fa..35e015d7af2957840aaa13efdfdc33265d8a0377 100644 (file)
@@ -1,6 +1,6 @@
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
+from nmigen import Module, Signal, Elaboratable, Cat, Const
 
 from .fu_dep_cell import FUDependenceCell
 from .fu_picker_vec import FU_Pick_Vec
@@ -36,7 +36,7 @@ class FUFUDepMatrix(Elaboratable):
         # ---
         # matrix of dependency cells
         # ---
-        dm = Array(FUDependenceCell(f, self.n_fu_col) \
+        dm = tuple(FUDependenceCell(f, self.n_fu_col) \
                                             for f in range(self.n_fu_row))
         for y in range(self.n_fu_row):
                 setattr(m.submodules, "dm%d" % y, dm[y])
@@ -44,7 +44,7 @@ class FUFUDepMatrix(Elaboratable):
         # ---
         # array of Function Unit Readable/Writable: row-length, horizontal
         # ---
-        fur = Array(FU_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
+        fur = tuple(FU_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
         for x in range(self.n_fu_col):
             setattr(m.submodules, "fur_x%d" % (x), fur[x])
 
index 47d6bcc217999af813e0186fc2fe6ca405599474..08bdc78e3ae30dcfd915e1da77048c7e0206cc03 100644 (file)
@@ -1,6 +1,6 @@
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
+from nmigen import Module, Signal, Elaboratable, Cat, Const
 
 from soc.scoreboard.fumem_dep_cell import FUMemDependenceCell
 from soc.scoreboard.fu_mem_picker_vec import FUMem_Pick_Vec
@@ -36,7 +36,7 @@ class FUMemDepMatrix(Elaboratable):
         # ---
         # matrix of dependency cells
         # ---
-        dm = Array(FUMemDependenceCell(f, self.n_fu_col) \
+        dm = tuple(FUMemDependenceCell(f, self.n_fu_col) \
                                             for f in range(self.n_fu_row))
         for y in range(self.n_fu_row):
                 setattr(m.submodules, "dm%d" % y, dm[y])
@@ -44,7 +44,7 @@ class FUMemDepMatrix(Elaboratable):
         # ---
         # array of Function Unit Readable/Writable: row-length, horizontal
         # ---
-        fur = Array(FUMem_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
+        fur = tuple(FUMem_Pick_Vec(self.n_fu_row) for r in range(self.n_fu_col))
         for x in range(self.n_fu_col):
             setattr(m.submodules, "fur_x%d" % (x), fur[x])
 
index 06380434c8d7d20828d80c3d0e020161bcb2c2e4..3e47b0fecb2bb25eb11cce2b18942f71aea881f1 100644 (file)
@@ -1,13 +1,11 @@
-from nmigen.compat.sim import run_simulation
-from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
-
-from soc.scoreboard.dependence_cell import DependencyRow
-from soc.scoreboard.fu_wr_pending import FU_RW_Pend
-from soc.scoreboard.reg_select import Reg_Rsv
-from soc.scoreboard.global_pending import GlobalPending
+# (DO NOT REMOVE THESE NOTICES)
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2019, 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Part of the Libre-SOC Project.
+# Sponsored by NLnet       EU Grant No: 825310 and 825322
+# Sponsored by NGI POINTER EU Grant No: 871528
 
-"""
+"""Mitch Alsup 6600 Dependency Matrices: Function Units to Registers (FU-REGs)
 
  6600 Dependency Table Matrix inputs / outputs
  ---------------------------------------------
@@ -23,30 +21,60 @@ from soc.scoreboard.global_pending import GlobalPending
                  d  s1 s2   d  s1 s2   d  s1 s2   d  s1 s2
                  reg sel    reg sel    reg sel    reg sel
 
+Sub-module allocation:
+
+                <----------- DependenceRow dr_fu0 -------> FU_RW_Pend fu_fu_0
+                <----------- DependenceRow dr_fu1 -------> FU_RW_Pend fu_fu_1
+                <----------- DependenceRow dr_fu2 -------> FU_RW_Pend fu_fu_2
+                 |  |  |    |  |  |    |  |  |    |  |  |
+                 v  v  v    v  v  v    v  v  v    v  v  v
+                 Reg_Rsv    Reg_Rsv    Reg_Rsv    Reg_Rsv
+                 rr_r0      rr_r1      rr_r2      rr_r3
+                 |  |       |  |       |  |       |  |
+                <---------- GlobalPending rd_v --------->
+                <---------- GlobalPending wr_v --------->
 """
 
+from nmigen.compat.sim import run_simulation
+from nmigen.cli import verilog, rtlil
+from nmigen import Module, Signal, Elaboratable, Cat, Repl
+
+from soc.scoreboard.dependence_cell import DependencyRow
+from soc.scoreboard.fu_wr_pending import FU_RW_Pend
+from soc.scoreboard.reg_select import Reg_Rsv
+from soc.scoreboard.global_pending import GlobalPending
+
+
 class FURegDepMatrix(Elaboratable):
     """ implements 11.4.7 mitch alsup FU-to-Reg Dependency Matrix, p26
     """
-    def __init__(self, n_fu_row, n_reg_col, n_src, cancel=None):
+    def __init__(self, n_fu_row, n_reg_col, n_src, n_dst, cancel=None):
         self.n_src = n_src
+        self.n_dst = n_dst
         self.n_fu_row = nf = n_fu_row      # Y (FUs)   ^v
         self.n_reg_col = n_reg = n_reg_col   # X (Regs)  <>
 
         # arrays
         src = []
         rsel = []
+        pend = []
         for i in range(n_src):
             j = i + 1 # name numbering to match src1/src2
             src.append(Signal(n_reg, name="src%d" % j, reset_less=True))
             rsel.append(Signal(n_reg, name="src%d_rsel_o" % j, reset_less=True))
-        pend = []
-        for i in range(nf):
-            j = i + 1 # name numbering to match src1/src2
             pend.append(Signal(nf, name="rd_src%d_pend_o" % j, reset_less=True))
-
-        self.dest_i = Signal(n_reg_col, reset_less=True)     # Dest in (top)
-        self.src_i = Array(src)                              # oper in (top)
+        dst = []
+        dsel = []
+        dpnd = []
+        for i in range(n_dst):
+            j = i + 1 # name numbering to match dst1/dst2
+            dst.append(Signal(n_reg, name="dst%d" % j, reset_less=True))
+            dsel.append(Signal(n_reg, name="dst%d_rsel_o" % j, reset_less=True))
+            dpnd.append(Signal(nf, name="wr_dst%d_pend_o" % j, reset_less=True))
+
+        self.dst_i = tuple(dst)                              # Dest in (top)
+        self.src_i = tuple(src)                              # oper in (top)
+        self.dest_i = self.dst_i[0] # old API
 
         # cancellation array (from Address Matching), ties in with go_die_i
         self.cancel = cancel
@@ -63,13 +91,15 @@ class FURegDepMatrix(Elaboratable):
         self.go_die_i = Signal(n_fu_row, reset_less=True) # Go Die in (left)
 
         # for Register File Select Lines (horizontal), per-reg
-        self.dest_rsel_o = Signal(n_reg_col, reset_less=True) # dest reg (bot)
-        self.src_rsel_o = Array(rsel)                         # src reg (bot)
+        self.dst_rsel_o = tuple(dsel)                         # dest reg (bot)
+        self.src_rsel_o = tuple(rsel)                         # src reg (bot)
+        self.dest_rsel_o = self.dst_rsel_o[0] # old API
 
         # for Function Unit "forward progress" (vertical), per-FU
         self.wr_pend_o = Signal(n_fu_row, reset_less=True) # wr pending (right)
         self.rd_pend_o = Signal(n_fu_row, reset_less=True) # rd pending (right)
-        self.rd_src_pend_o = Array(pend) # src1 pending
+        self.rd_src_pend_o = tuple(pend) # src pending
+        self.wr_dst_pend_o = tuple(dpnd) # dest pending
 
     def elaborate(self, platform):
         m = Module()
@@ -78,45 +108,37 @@ class FURegDepMatrix(Elaboratable):
     def _elaborate(self, m, platform):
 
         # ---
-        # matrix of dependency cells
+        # matrix of dependency cells.  horizontal object, allocated vertically
         # ---
         cancel_mode = self.cancel is not None
-        dm = Array(DependencyRow(self.n_reg_col, self.n_src, cancel_mode) \
+        dm = tuple(DependencyRow(self.n_reg_col, self.n_src, self.n_dst,
+                                 cancel_mode=cancel_mode) \
                     for r in range(self.n_fu_row))
-        for fu in range(self.n_fu_row):
-            setattr(m.submodules, "dr_fu%d" % fu, dm[fu])
+        for fu, dc in enumerate(dm):
+            m.submodules["dr_fu%d" % fu] = dc
 
         # ---
-        # array of Function Unit Pending vectors
+        # array of Function Unit Pending vecs. allocated vertically (per FU)
         # ---
-        fupend = Array(FU_RW_Pend(self.n_reg_col, self.n_src) \
+        fupend = tuple(FU_RW_Pend(self.n_reg_col, self.n_src, self.n_dst) \
                         for f in range(self.n_fu_row))
-        for fu in range(self.n_fu_row):
-            setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu])
+        for fu, fup in enumerate(fupend):
+            m.submodules["fu_fu%d" % (fu)] = fup
 
         # ---
-        # array of Register Reservation vectors
+        # array of Register Reservation vecs.  allocated horizontally (per reg)
         # ---
-        regrsv = Array(Reg_Rsv(self.n_fu_row, self.n_src) \
+        regrsv = tuple(Reg_Rsv(self.n_fu_row, self.n_src, self.n_dst) \
                         for r in range(self.n_reg_col))
         for rn in range(self.n_reg_col):
-            setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn])
+            m.submodules["rr_r%d" % (rn)] = regrsv[rn]
 
         # ---
         # connect Function Unit vector
         # ---
         wr_pend = []
         rd_pend = []
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
-            fup = fupend[fu]
-            dest_fwd_o = []
-            for rn in range(self.n_reg_col):
-                # accumulate cell fwd outputs for dest/src1/src2
-                dest_fwd_o.append(dc.dest_fwd_o[rn])
-            # connect cell fwd outputs to FU Vector in [Cat is gooood]
-            m.d.comb += [fup.dest_fwd_i.eq(Cat(*dest_fwd_o)),
-                        ]
+        for fup in fupend:
             # accumulate FU Vector outputs
             wr_pend.append(fup.reg_wr_pend_o)
             rd_pend.append(fup.reg_rd_pend_o)
@@ -125,19 +147,31 @@ class FURegDepMatrix(Elaboratable):
         m.d.comb += self.wr_pend_o.eq(Cat(*wr_pend))
         m.d.comb += self.rd_pend_o.eq(Cat(*rd_pend))
 
+        # connect dst fwd vectors
+        for i in range(self.n_dst):
+            wr_dst_pend = []
+            for dc, fup in zip(dm, fupend):
+                dst_fwd_o = []
+                for rn in range(self.n_reg_col):
+                    # accumulate cell fwd outputs for dest
+                    dst_fwd_o.append(dc.dst_fwd_o[i][rn])
+                # connect cell fwd outputs to FU Vector in [Cat is gooood]
+                m.d.comb += fup.dst_fwd_i[i].eq(Cat(*dst_fwd_o))
+                # accumulate FU Vector outputs
+                wr_dst_pend.append(fup.reg_wr_dst_pend_o[i])
+            # ... and output them from this module (vertical, width=FUs)
+            m.d.comb += self.wr_dst_pend_o[i].eq(Cat(*wr_dst_pend))
+
         # same for src
         for i in range(self.n_src):
             rd_src_pend = []
-            for fu in range(self.n_fu_row):
-                dc = dm[fu]
-                fup = fupend[fu]
+            for dc, fup in zip(dm, fupend):
                 src_fwd_o = []
                 for rn in range(self.n_reg_col):
                     # accumulate cell fwd outputs for dest/src1/src2
                     src_fwd_o.append(dc.src_fwd_o[i][rn])
                 # connect cell fwd outputs to FU Vector in [Cat is gooood]
-                m.d.comb += [fup.src_fwd_i[i].eq(Cat(*src_fwd_o)),
-                            ]
+                m.d.comb += fup.src_fwd_i[i].eq(Cat(*src_fwd_o))
                 # accumulate FU Vector outputs
                 rd_src_pend.append(fup.reg_rd_src_pend_o[i])
             # ... and output them from this module (vertical, width=FUs)
@@ -146,63 +180,54 @@ class FURegDepMatrix(Elaboratable):
         # ---
         # connect Reg Selection vector
         # ---
-        dest_rsel = []
-        for rn in range(self.n_reg_col):
-            rsv = regrsv[rn]
-            dest_rsel_o = []
-            for fu in range(self.n_fu_row):
-                dc = dm[fu]
-                # accumulate cell reg-select outputs dest/src1/src2
-                dest_rsel_o.append(dc.dest_rsel_o[rn])
-            # connect cell reg-select outputs to Reg Vector In
-            m.d.comb += rsv.dest_rsel_i.eq(Cat(*dest_rsel_o)),
-
-            # accumulate Reg-Sel Vector outputs
-            dest_rsel.append(rsv.dest_rsel_o)
-
-        # ... and output them from this module (horizontal, width=REGs)
-        m.d.comb += self.dest_rsel_o.eq(Cat(*dest_rsel))
+        for i in range(self.n_dst):
+            dest_rsel = []
+            for rn, rsv in enumerate(regrsv):
+                dst_rsel_o = []
+                # accumulate cell reg-select outputs dest1/2/...
+                for dc in dm:
+                    dst_rsel_o.append(dc.dst_rsel_o[i][rn])
+                # connect cell reg-select outputs to Reg Vector In
+                m.d.comb += rsv.dst_rsel_i[i].eq(Cat(*dst_rsel_o)),
+                # accumulate Reg-Sel Vector outputs
+                dest_rsel.append(rsv.dst_rsel_o[i])
+            # ... and output them from this module (horizontal, width=REGs)
+            m.d.comb += self.dst_rsel_o[i].eq(Cat(*dest_rsel))
 
         # same for src
         for i in range(self.n_src):
             src_rsel = []
-            for rn in range(self.n_reg_col):
-                rsv = regrsv[rn]
+            for rn, rsv in enumerate(regrsv):
                 src_rsel_o = []
-                for fu in range(self.n_fu_row):
-                    dc = dm[fu]
-                    # accumulate cell reg-select outputs dest/src1/src2
+                # accumulate cell reg-select outputs src1/src2
+                for dc in dm:
                     src_rsel_o.append(dc.src_rsel_o[i][rn])
                 # connect cell reg-select outputs to Reg Vector In
                 m.d.comb += rsv.src_rsel_i[i].eq(Cat(*src_rsel_o)),
                 # accumulate Reg-Sel Vector outputs
                 src_rsel.append(rsv.src_rsel_o[i])
-
             # ... and output them from this module (horizontal, width=REGs)
             m.d.comb += self.src_rsel_o[i].eq(Cat(*src_rsel))
 
         # ---
         # connect Dependency Matrix dest/src1/src2/issue to module d/s/s/i
         # ---
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
+        for dc in dm:
             # wire up inputs from module to row cell inputs (Cat is gooood)
-            m.d.comb += [dc.dest_i.eq(self.dest_i),
-                         dc.rd_pend_i.eq(self.rd_pend_i),
+            m.d.comb += [dc.rd_pend_i.eq(self.rd_pend_i),
                          dc.wr_pend_i.eq(self.wr_pend_i),
                         ]
-        # same for src
-        for i in range(self.n_src):
-            for fu in range(self.n_fu_row):
-                dc = dm[fu]
-                # wire up inputs from module to row cell inputs (Cat is gooood)
+            # for dest: wire up output from module to row cell outputs
+            for i in range(self.n_dst):
+                m.d.comb += dc.dst_i[i].eq(self.dst_i[i])
+            # for src: wire up inputs from module to row cell inputs
+            for i in range(self.n_src):
                 m.d.comb += dc.src_i[i].eq(self.src_i[i])
 
         # accumulate rsel bits into read/write pending vectors.
         rd_pend_v = []
         wr_pend_v = []
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
+        for dc in dm:
             rd_pend_v.append(dc.v_rd_rsel_o)
             wr_pend_v.append(dc.v_wr_rsel_o)
         rd_v = GlobalPending(self.n_reg_col, rd_pend_v)
@@ -219,8 +244,7 @@ class FURegDepMatrix(Elaboratable):
         go_rd_i = []
         go_wr_i = []
         issue_i = []
-        for fu in range(self.n_fu_row):
-            dc = dm[fu]
+        for dc in dm:
             # accumulate cell fwd outputs for dest/src1/src2
             go_rd_i.append(dc.go_rd_i)
             go_wr_i.append(dc.go_wr_i)
@@ -235,15 +259,13 @@ class FURegDepMatrix(Elaboratable):
         # connect Dep go_die_i
         # ---
         if cancel_mode:
-            for fu in range(self.n_fu_row):
-                dc = dm[fu]
+            for fu, dc in enumerate(dm):
                 go_die = Repl(self.go_die_i[fu], self.n_fu_row)
                 go_die = go_die | self.cancel[fu]
                 m.d.comb += dc.go_die_i.eq(go_die)
         else:
             go_die_i = []
-            for fu in range(self.n_fu_row):
-                dc = dm[fu]
+            for dc in dm:
                 # accumulate cell fwd outputs for dest/src1/src2
                 go_die_i.append(dc.go_die_i)
             # wire up inputs from module to row cell inputs (Cat is gooood)
@@ -251,13 +273,15 @@ class FURegDepMatrix(Elaboratable):
         return m
 
     def __iter__(self):
+        if self.cancel is not None:
+            yield self.cancel
         yield self.dest_i
         yield from self.src_i
         yield self.issue_i
         yield self.go_wr_i
         yield self.go_rd_i
         yield self.go_die_i
-        yield self.dest_rsel_o
+        yield from self.dst_rsel_o
         yield from self.src_rsel_o
         yield self.wr_pend_o
         yield self.rd_pend_o
@@ -266,6 +290,7 @@ class FURegDepMatrix(Elaboratable):
         yield self.v_wr_rsel_o
         yield self.v_rd_rsel_o
         yield from self.rd_src_pend_o
+        yield from self.wr_dst_pend_o
 
     def ports(self):
         return list(self)
@@ -278,7 +303,7 @@ def d_matrix_sim(dut):
     yield
     yield dut.issue_i.eq(0)
     yield
-    yield dut.src1_i.eq(1)
+    yield dut.src_i[0].eq(1)
     yield dut.issue_i.eq(1)
     yield
     yield dut.issue_i.eq(0)
@@ -293,7 +318,9 @@ def d_matrix_sim(dut):
     yield
 
 def test_d_matrix():
-    dut = FURegDepMatrix(n_fu_row=3, n_reg_col=4, n_src=2)
+    cancel = Signal(3)
+    dut = FURegDepMatrix(n_fu_row=3, n_reg_col=4, n_src=2, n_dst=2,
+                         cancel=cancel)
     vl = rtlil.convert(dut, ports=dut.ports())
     with open("test_fu_reg_matrix.il", "w") as f:
         f.write(vl)
index d0bcb954301fd82396dc52b20009928a738f2268..0fd8e9cb1c00abf3dca5aafe51fc00d9638e8c83 100644 (file)
@@ -1,29 +1,67 @@
-from nmigen import Elaboratable, Module, Signal, Array
+# (DO NOT REMOVE THESE NOTICES)
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2019, 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Part of the Libre-SOC Project.
+# Sponsored by NLnet       EU Grant No: 825310 and 825322
+# Sponsored by NGI POINTER EU Grant No: 871528
+
+from nmigen import Elaboratable, Module, Signal
+from nmigen.cli import verilog, rtlil
 
 
 class FU_RW_Pend(Elaboratable):
     """ these are allocated per-FU (horizontally),
         and are of length reg_count
     """
-    def __init__(self, reg_count, n_src):
+    def __init__(self, reg_count, n_src, n_dst):
         self.n_src = n_src
+        self.n_dst = n_dst
         self.reg_count = reg_count
-        self.dest_fwd_i = Signal(reg_count, reset_less=True)
+        # create dest forwarding array
+        dst = []
+        for i in range(n_dst):
+            j = i + 1 # name numbering to match dst1/dst2
+            dst.append(Signal(reg_count, name="dst%d" % j, reset_less=True))
+        self.dst_fwd_i = tuple(dst)
+        self.dest_fwd_i = self.dst_fwd_i[0] # old API
+        # create src forwarding array
         src = []
         for i in range(n_src):
             j = i + 1 # name numbering to match src1/src2
             src.append(Signal(reg_count, name="src%d" % j, reset_less=True))
-        self.src_fwd_i = Array(src)
+        self.src_fwd_i = tuple(src)
 
         self.reg_wr_pend_o = Signal(reset_less=True)
         self.reg_rd_pend_o = Signal(reset_less=True)
         self.reg_rd_src_pend_o = Signal(n_src, reset_less=True)
+        self.reg_wr_dst_pend_o = Signal(n_dst, reset_less=True)
 
     def elaborate(self, platform):
         m = Module()
-        m.d.comb += self.reg_wr_pend_o.eq(self.dest_fwd_i.bool())
+        for i in range(self.n_dst):
+            m.d.comb += self.reg_wr_dst_pend_o[i].eq(self.dst_fwd_i[i].bool())
+        m.d.comb += self.reg_wr_pend_o.eq(self.reg_wr_dst_pend_o.bool())
         for i in range(self.n_src):
             m.d.comb += self.reg_rd_src_pend_o[i].eq(self.src_fwd_i[i].bool())
         m.d.comb += self.reg_rd_pend_o.eq(self.reg_rd_src_pend_o.bool())
         return m
 
+    def __iter__(self):
+        yield self.reg_wr_pend_o
+        yield self.reg_rd_pend_o
+        yield self.reg_rd_src_pend_o
+        yield self.reg_wr_dst_pend_o
+        yield from self.dst_fwd_i
+        yield from self.src_fwd_i
+
+    def ports(self):
+        return list(self)
+
+def test_fu_rw_pend():
+    dut = FU_RW_Pend(4, 2, 2)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_fu_rw_pend.il", "w") as f:
+        f.write(vl)
+
+if __name__ == '__main__':
+    test_fu_rw_pend()
index af1bb7659e53c1ea64f93ed9d51e73ab9b0a7dd0..45ff1b41a6c438714e41d4e27de0060d23afcec3 100644 (file)
@@ -45,7 +45,7 @@ In theory (and in practice!) the following is possible:
 
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array
+from nmigen import Module, Signal, Elaboratable
 
 #from nmutil.picker import MultiPriorityPicker as MPP
 from nmutil.picker import PriorityPicker
@@ -75,14 +75,14 @@ class GroupPicker(Elaboratable):
             wi.append(Signal(wid, name="writable%d_i" % i, reset_less=True))
 
         # inputs
-        self.rd_rel_i = Array(rdr)  # go read in (top)
-        self.req_rel_i = Array(wrr) # release request in (top)
-        self.readable_i = Array(ri) # readable in (top)
-        self.writable_i = Array(wi) # writable in (top)
+        self.rd_rel_i = tuple(rdr)  # go read in (top)
+        self.req_rel_i = tuple(wrr) # release request in (top)
+        self.readable_i = tuple(ri) # readable in (top)
+        self.writable_i = tuple(wi) # writable in (top)
 
         # outputs
-        self.go_rd_o = Array(rd)  # go read (bottom)
-        self.go_wr_o = Array(wr)  # go write (bottom)
+        self.go_rd_o = tuple(rd)  # go read (bottom)
+        self.go_wr_o = tuple(wr)  # go write (bottom)
 
     def elaborate(self, platform):
         m = Module()
index 9c3d58d8a9300056ab3557b095bf6e5942d8fb35..651821691ad3dc73c2798fe80de470aeef9b0a06 100644 (file)
@@ -46,15 +46,15 @@ class InstructionQ(Elaboratable):
         self.n_out = n_out
         mqbits = (int(log(iqlen) / log(2))+2, False)
 
-        self.p_add_i = Signal(mqbits)  # instructions to add (from data_i)
-        self.p_ready_o = Signal()  # instructions were added
-        self.data_i = Instruction._nq(n_in, "data_i")
+        self.p_add_i = Signal(mqbits)  # instructions to add (from i_data)
+        self.p_o_ready = Signal()  # instructions were added
+        self.i_data = Instruction._nq(n_in, "i_data")
 
-        self.data_o = Instruction._nq(n_out, "data_o")
+        self.o_data = Instruction._nq(n_out, "o_data")
         self.n_sub_i = Signal(mqbits)  # number of instructions to remove
         self.n_sub_o = Signal(mqbits)  # number of instructions removed
 
-        self.qsz = shape(self.data_o[0])[0]
+        self.qsz = shape(self.o_data[0])[0]
         q = []
         for i in range(iqlen):
             q.append(Signal(self.qsz, name="q%d" % i))
@@ -91,28 +91,28 @@ class InstructionQ(Elaboratable):
         comb += left.eq(self.qlen_o)  # - self.n_sub_o)
         comb += spare.eq(mqlen - self.p_add_i)
         comb += qmaxed.eq(left <= spare)
-        comb += self.p_ready_o.eq(qmaxed & (self.p_add_i != 0))
+        comb += self.p_o_ready.eq(qmaxed & (self.p_add_i != 0))
 
         # put q (flattened) into output
         for i in range(self.n_out):
             opos = Signal(mqbits)
             comb += opos.eq(end_q + i)
-            comb += cat(self.data_o[i]).eq(self.q[opos])
+            comb += cat(self.o_data[i]).eq(self.q[opos])
 
         with m.If(self.n_sub_o):
             # ok now the end's moved
             sync += end_q.eq(end_q + self.n_sub_o)
 
-        with m.If(self.p_ready_o):
+        with m.If(self.p_o_ready):
             # copy in the input... insanely gate-costly... *sigh*...
             for i in range(self.n_in):
                 with m.If(self.p_add_i > Const(i, len(self.p_add_i))):
                     ipos = Signal(mqbits)
                     comb += ipos.eq(start_q + i)  # should roll round
-                    sync += self.q[ipos].eq(cat(self.data_i[i]))
+                    sync += self.q[ipos].eq(cat(self.i_data[i]))
             sync += start_q.eq(start_q + self.p_add_i)
 
-        with m.If(self.p_ready_o):
+        with m.If(self.p_o_ready):
             # update the queue length
             add2 = Signal(mqbits+1)
             comb += add2.eq(self.qlen_o + self.p_add_i)
@@ -125,12 +125,12 @@ class InstructionQ(Elaboratable):
     def __iter__(self):
         yield from self.q
 
-        yield self.p_ready_o
-        for o in self.data_i:
+        yield self.p_o_ready
+        for o in self.i_data:
             yield from list(o)
         yield self.p_add_i
 
-        for o in self.data_o:
+        for o in self.o_data:
             yield from list(o)
         yield self.n_sub_i
         yield self.n_sub_o
index e8911241b8b38c0c9b28d56e6651aa43f1b5e721..79b822490d4eebe3a1abd26e4743d2fe5ed467ab 100644 (file)
@@ -32,7 +32,7 @@ Notes:
 
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Const
+from nmigen import Module, Signal, Elaboratable, Cat, Const
 
 from .ldst_dep_cell import LDSTDepCell
 
@@ -69,7 +69,7 @@ class LDSTDepMatrix(Elaboratable):
         # ---
         # matrix of dependency cells.  actually, LDSTDepCell is a row, now
         # ---
-        dm = Array(LDSTDepCell(self.n_ldst) for f in range(self.n_ldst))
+        dm = tuple(LDSTDepCell(self.n_ldst) for f in range(self.n_ldst))
         for fu in range(self.n_ldst):
             setattr(m.submodules, "dm_fu%d" % (fu), dm[fu])
 
index aa79980fef5ddc7874f3a20e7c1878be65c1e252..470bc2f90a9fd41d35fac70957a2be3b6c663fbd 100644 (file)
@@ -10,7 +10,7 @@ class FUMemMatchMatrix(FURegDepMatrix, PartialAddrMatch):
     """
     def __init__(self, n_fu, addrbitwid):
         PartialAddrMatch.__init__(self, n_fu, addrbitwid)
-        FURegDepMatrix.__init__(self, n_fu, n_fu, 1, self.addr_nomatch_o)
+        FURegDepMatrix.__init__(self, n_fu, n_fu, 1, 1, self.addr_nomatch_o)
 
     def elaborate(self, platform):
         m = Module()
index 2958d864cec75480b97a0725d9b3c44f53d2e7a0..382400b705b101bea9945a540bdbefd9b965ada6 100644 (file)
@@ -1,6 +1,6 @@
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat, Repl
+from nmigen import Module, Signal, Elaboratable, Cat, Repl
 from nmutil.latch import SRLatch
 
 
index 6b9ce140312290a26babe2e3e3d821ae3036e3ab..53c74010f7508c98b38ed29edf7798c9cc812426 100644 (file)
@@ -1,6 +1,6 @@
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Elaboratable, Array, Cat
+from nmigen import Module, Signal, Elaboratable, Cat
 
 from soc.scoreboard.mem_dependence_cell import MemDepRow
 from soc.scoreboard.mem_fu_pending import MemFU_Pend
@@ -45,21 +45,21 @@ class MemFUDepMatrix(Elaboratable):
         # ---
         # matrix of dependency cells
         # ---
-        dm = Array(MemDepRow(self.n_reg_col) for r in range(self.n_fu_row))
+        dm = tuple(MemDepRow(self.n_reg_col) for r in range(self.n_fu_row))
         for fu in range(self.n_fu_row):
             setattr(m.submodules, "dr_fu%d" % fu, dm[fu])
 
         # ---
         # array of Function Unit Pending vectors
         # ---
-        fupend = Array(MemFU_Pend(self.n_reg_col) for f in range(self.n_fu_row))
+        fupend = tuple(MemFU_Pend(self.n_reg_col) for f in range(self.n_fu_row))
         for fu in range(self.n_fu_row):
             setattr(m.submodules, "fu_fu%d" % (fu), fupend[fu])
 
         # ---
         # array of Register Reservation vectors
         # ---
-        regrsv = Array(Mem_Rsv(self.n_fu_row) for r in range(self.n_reg_col))
+        regrsv = tuple(Mem_Rsv(self.n_fu_row) for r in range(self.n_reg_col))
         for rn in range(self.n_reg_col):
             setattr(m.submodules, "rr_r%d" % (rn), regrsv[rn])
 
index 553ebb5e37c95bc7e05c6022b7252261f7fae507..fd0902ad679bdc81eb48536f9de8a6345f25a639 100644 (file)
@@ -1,6 +1,6 @@
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Array, Elaboratable
+from nmigen import Module, Signal, Elaboratable
 
 from soc.scoreboard.fu_fu_matrix import FUFUDepMatrix
 from soc.scoreboard.mdm import FUMemMatchMatrix
@@ -31,7 +31,7 @@ class MemFunctionUnits(Elaboratable):
         self.fn_issue_i = Signal(n_ldsts, reset_less=True)
 
         # address matching
-        self.addrs_i = Array(Signal(self.bitwid, name="addrs_i%d" % i) \
+        self.addrs_i = tuple(Signal(self.bitwid, name="addrs_i%d" % i) \
                              for i in range(n_ldsts))
         #self.addr_we_i = Signal(n_ldsts) # write-enable for incoming address
         self.addr_en_i = Signal(n_ldsts) # address latched in
index 3919cce313c25527c2755f3714ec1e58b83c32e6..e87aee05b8a9a18918e7323f29881da74383525d 100644 (file)
@@ -1,24 +1,55 @@
-from nmigen import Elaboratable, Module, Signal, Array
+# (DO NOT REMOVE THESE NOTICES)
+# SPDX-License-Identifier: LGPLv3+
+# Copyright (C) 2019, 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Part of the Libre-SOC Project.
+# Sponsored by NLnet       EU Grant No: 825310 and 825322
+# Sponsored by NGI POINTER EU Grant No: 871528
+
+from nmigen.cli import verilog, rtlil
+from nmigen import Elaboratable, Module, Signal
 
 
 class Reg_Rsv(Elaboratable):
     """ these are allocated per-Register (vertically),
         and are each of length fu_count
     """
-    def __init__(self, fu_count, n_src):
+    def __init__(self, fu_count, n_src, n_dst):
         self.n_src = n_src
+        self.n_dst = n_dst
         self.fu_count = fu_count
-        self.dest_rsel_i = Signal(fu_count, reset_less=True)
-        self.src_rsel_i = Array(Signal(fu_count, name="src_rsel_i",
+        self.dst_rsel_i = tuple(Signal(fu_count, name="dst%i_rsel_i" % (i+1),
+                                       reset_less=True) \
+                                for i in range(n_dst))
+        self.src_rsel_i = tuple(Signal(fu_count, name="src%i_rsel_i" % (i+1),
                                        reset_less=True) \
                                 for i in range(n_src))
-        self.dest_rsel_o = Signal(reset_less=True)
+        self.dst_rsel_o = Signal(n_dst, reset_less=True)
         self.src_rsel_o = Signal(n_src, reset_less=True)
 
     def elaborate(self, platform):
         m = Module()
-        m.d.comb += self.dest_rsel_o.eq(self.dest_rsel_i.bool())
+        for i in range(self.n_dst):
+            m.d.comb += self.dst_rsel_o[i].eq(self.dst_rsel_i[i].bool())
         for i in range(self.n_src):
             m.d.comb += self.src_rsel_o[i].eq(self.src_rsel_i[i].bool())
         return m
 
+    def __iter__(self):
+        yield from self.dst_rsel_i
+        yield from self.src_rsel_i
+        yield self.dst_rsel_o
+        yield self.src_rsel_o
+
+    def ports(self):
+        return list(self)
+
+
+def test_reg_rsv():
+    dut = Reg_Rsv(4, 2, 2)
+    vl = rtlil.convert(dut, ports=dut.ports())
+    with open("test_reg_rsv.il", "w") as f:
+        f.write(vl)
+
+
+if __name__ == '__main__':
+    test_reg_rsv()
index d99d37a8d2026e6cd9480f408d000c9f85b8bb58..36f9250973020f5fd24055b85d7e36a44945c912 100644 (file)
@@ -1,6 +1,6 @@
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Cat, Array, Const, Elaboratable, Repl
+from nmigen import Module, Signal, Cat, Const, Elaboratable, Repl
 from nmigen.lib.coding import Decoder
 
 from soc.scoreboard.shadow_fn import ShadowFn
@@ -42,11 +42,11 @@ class ShadowMatrix(Elaboratable):
         # inputs
         self.issue_i = Signal(n_fus, reset_less=True)
         self.reset_i = Signal(n_fus, reset_less=True)
-        self.shadow_i = Array(Signal(shadow_wid, name="sh_i", reset_less=True) \
+        self.shadow_i = tuple(Signal(shadow_wid, name="sh_i", reset_less=True) \
                             for f in range(n_fus))
-        self.s_fail_i = Array(Signal(shadow_wid, name="fl_i", reset_less=True) \
+        self.s_fail_i = tuple(Signal(shadow_wid, name="fl_i", reset_less=True) \
                             for f in range(n_fus))
-        self.s_good_i = Array(Signal(shadow_wid, name="gd_i", reset_less=True) \
+        self.s_good_i = tuple(Signal(shadow_wid, name="gd_i", reset_less=True) \
                             for f in range(n_fus))
         # outputs
         self.go_die_o = Signal(n_fus, reset_less=True)
@@ -176,7 +176,7 @@ class WaWGrid(Elaboratable):
         self.shadow_i = Signal(shadow_wid, reset_less=True)
         self.fu_i = Signal(n_fus, reset_less=True)
 
-        self.waw_o = Array(Signal(shadow_wid, name="waw_o", reset_less=True) \
+        self.waw_o = tuple(Signal(shadow_wid, name="waw_o", reset_less=True) \
                             for f in range(n_fus))
 
     def elaborate(self, platform):
index fb67d263c47f58372fe6b8bab05e923abce64c6c..d0131e94bf3c9881ff2532cb9a1e4fae370b1f36 100644 (file)
@@ -28,16 +28,16 @@ class IQSim:
             print("sendlen", len(self.iq)-i, sendlen)
             for idx in range(sendlen):
                 instr = self.iq[i+idx]
-                yield from eq(self.dut.data_i[idx], instr)
-                di = yield self.dut.data_i[idx]  # .src1_i
+                yield from eq(self.dut.i_data[idx], instr)
+                di = yield self.dut.i_data[idx]  # .src1_i
                 print("senddata %d %x" % ((i+idx), di))
                 self.oq.append(di)
             yield self.dut.p_add_i.eq(sendlen)
             yield
-            o_p_ready = yield self.dut.p_ready_o
+            o_p_ready = yield self.dut.p_o_ready
             while not o_p_ready:
                 yield
-                o_p_ready = yield self.dut.p_ready_o
+                o_p_ready = yield self.dut.p_o_ready
 
             yield self.dut.p_add_i.eq(0)
 
@@ -76,7 +76,7 @@ class IQSim:
             n_sub_o = yield self.dut.n_sub_o
             print("recv", n_sub_o)
             for j in range(n_sub_o):
-                r = yield self.dut.data_o[j]  # .src1_i
+                r = yield self.dut.o_data[j]  # .src1_i
                 print("recvdata %x %s" % (r, repr(self.iq[i+j])))
                 assert r == self.oq[i+j]
             yield
index 9fd84b2487f8eee282cae93902da8e8924b9384a..dd5afc05e0d105e43833b22488d752099511c23b 100644 (file)
@@ -290,11 +290,11 @@ class Scoreboard(Elaboratable):
         # branch is active (TODO: a better signal: this is over-using the
         # go_write signal - actually the branch should not be "writing")
         with m.If(br1.go_wr_i):
-            sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+            sync += self.branch_direction_o.eq(br1.o_data+Const(1, 2))
             sync += bspec.active_i.eq(0)
             comb += bspec.br_i.eq(1)
             # branch occurs if data == 1, failed if data == 0
-            comb += bspec.br_ok_i.eq(br1.data_o == 1)
+            comb += bspec.br_ok_i.eq(br1.o_data == 1)
             for i in range(n_intfus):
                 # *expected* direction of the branch matched against *actual*
                 comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
@@ -309,9 +309,9 @@ class Scoreboard(Elaboratable):
         comb += int_src2.ren.eq(intfus.src2_rsel_o)
 
         # connect ALUs to regfule
-        comb += int_dest.data_i.eq(cu.data_o)
-        comb += cu.src1_i.eq(int_src1.data_o)
-        comb += cu.src2_i.eq(int_src2.data_o)
+        comb += int_dest.i_data.eq(cu.o_data)
+        comb += cu.src1_i.eq(int_src1.o_data)
+        comb += cu.src2_i.eq(int_src2.o_data)
 
         # connect ALU Computation Units
         comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
index 949d0ff21bc6b280be20b7d59902157a0e89d67c..d02c38136acf6a3aff05796d50ec234860c6a317 100644 (file)
@@ -392,11 +392,11 @@ class Scoreboard(Elaboratable):
         # branch is active (TODO: a better signal: this is over-using the
         # go_write signal - actually the branch should not be "writing")
         with m.If(br1.go_wr_i):
-            sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+            sync += self.branch_direction_o.eq(br1.o_data+Const(1, 2))
             sync += bspec.active_i.eq(0)
             comb += bspec.br_i.eq(1)
             # branch occurs if data == 1, failed if data == 0
-            comb += bspec.br_ok_i.eq(br1.data_o == 1)
+            comb += bspec.br_ok_i.eq(br1.o_data == 1)
             for i in range(n_intfus):
                 # *expected* direction of the branch matched against *actual*
                 comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
@@ -411,9 +411,9 @@ class Scoreboard(Elaboratable):
         comb += int_src2.ren.eq(intfus.src2_rsel_o)
 
         # connect ALUs to regfule
-        comb += int_dest.data_i.eq(cu.data_o)
-        comb += cu.src1_i.eq(int_src1.data_o)
-        comb += cu.src2_i.eq(int_src2.data_o)
+        comb += int_dest.i_data.eq(cu.o_data)
+        comb += cu.src1_i.eq(int_src1.o_data)
+        comb += cu.src2_i.eq(int_src2.o_data)
 
         # connect ALU Computation Units
         comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
index 84555c37b5cf83ec0c3b099f15fa6d54090cd8aa..9a4abacc3135e647ae4be3d9a8b7882e7ce68fe4 100644 (file)
@@ -17,38 +17,41 @@ the brain-dead part of this module is that even though there is no
 conflict of access, regfile read/write hazards are *not* analysed,
 and consequently it is safer to wait for the Function Unit to complete
 before allowing a new instruction to proceed.
+(update: actually this is being added now:
+https://bugs.libre-soc.org/show_bug.cgi?id=737)
 """
 
-from nmigen import Elaboratable, Module, Signal, ResetSignal, Cat, Mux
+from nmigen import (Elaboratable, Module, Signal, ResetSignal, Cat, Mux,
+                    Const)
 from nmigen.cli import rtlil
 
 from openpower.decoder.power_decoder2 import PowerDecodeSubset
-from openpower.decoder.power_regspec_map import regspec_decode_read
-from openpower.decoder.power_regspec_map import regspec_decode_write
+from openpower.decoder.power_regspec_map import regspec_decode
 from openpower.sv.svp64 import SVP64Rec
 
 from nmutil.picker import PriorityPicker
 from nmutil.util import treereduce
+from nmutil.singlepipe import ControlBase
 
-from soc.fu.compunits.compunits import AllFunctionUnits
+from soc.fu.compunits.compunits import AllFunctionUnits, LDSTFunctionUnit
 from soc.regfile.regfiles import RegFiles
-from openpower.decoder.decode2execute1 import Decode2ToExecute1Type
-from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
 from openpower.decoder.power_decoder2 import get_rdflags
-from openpower.decoder.decode2execute1 import Data
 from soc.experiment.l0_cache import TstL0CacheBuffer  # test only
 from soc.config.test.test_loadstore import TestMemPspec
-from openpower.decoder.power_enums import MicrOp
-from soc.config.state import CoreState
+from openpower.decoder.power_enums import MicrOp, Function
+from soc.simple.core_data import CoreInput, CoreOutput
 
+from collections import defaultdict, namedtuple
 import operator
 
 from nmutil.util import rising_edge
 
+FUSpec = namedtuple("FUSpec", ["funame", "fu", "idx"])
+ByRegSpec = namedtuple("ByRegSpec", ["okflag", "regport", "wid", "specs"])
 
 # helper function for reducing a list of signals down to a parallel
 # ORed single signal.
-def ortreereduce(tree, attr="data_o"):
+def ortreereduce(tree, attr="o_data"):
     return treereduce(tree, operator.or_, lambda x: getattr(x, attr))
 
 
@@ -68,7 +71,50 @@ def sort_fuspecs(fuspecs):
     return res  # enumerate(res)
 
 
-class NonProductionCore(Elaboratable):
+# a hazard bitvector "remap" function which returns an AST expression
+# that remaps read/write hazard regfile port numbers to either a full
+# bitvector or a reduced subset one.  SPR for example is reduced to a
+# single bit.
+# CRITICALLY-IMPORTANT NOTE: these bitvectors *have* to match up per
+# regfile!  therefore the remapping is per regfile, *NOT* per regfile
+# port and certainly not based on whether it is a read port or write port.
+# note that any reductions here will result in degraded performance due
+# to conflicts, but at least it keeps the hazard matrix sizes down to "sane"
+def bitvector_remap(regfile, rfile, port):
+    # 8-bits (at the moment, no SVP64), CR is unary: no remap
+    if regfile == 'CR':
+        return port
+    # 3 bits, unary alrady: return the port
+    if regfile == 'XER':
+        return port
+    # 3 bits, unary: return the port
+    if regfile == 'XER':
+        return port
+    # 5 bits, unary: return the port
+    if regfile == 'STATE':
+        return port
+    # 9 bits (9 entries), might be unary already
+    if regfile == 'FAST':
+        if rfile.unary: # FAST might be unary already
+            return port
+        else:
+            return 1 << port
+    # 10 bits (!!) - reduce to one
+    if regfile == 'SPR':
+        if rfile.unary: # FAST might be unary already
+            return port
+        else:
+            return 1 << port
+    if regfile == 'INT':
+        if rfile.unary: # INT, check if unary/binary
+            return port
+        else:
+            return 1 << port
+
+
+# derive from ControlBase rather than have a separate Stage instance,
+# this is simpler to do
+class NonProductionCore(ControlBase):
     def __init__(self, pspec):
         self.pspec = pspec
 
@@ -79,6 +125,20 @@ class NonProductionCore(Elaboratable):
         self.regreduce_en = (hasattr(pspec, "regreduce") and
                              (pspec.regreduce == True))
 
+        # test to see if overlapping of instructions is allowed
+        # (not normally enabled for TestIssuer FSM but useful for checking
+        # the bitvector hazard detection, before doing In-Order)
+        self.allow_overlap = (hasattr(pspec, "allow_overlap") and
+                             (pspec.allow_overlap == True))
+
+        # test core type
+        self.make_hazard_vecs = self.allow_overlap
+        self.core_type = "fsm"
+        if hasattr(pspec, "core_type"):
+            self.core_type = pspec.core_type
+
+        super().__init__(stage=self)
+
         # single LD/ST funnel for memory access
         self.l0 = l0 = TstL0CacheBuffer(pspec, n_units=1)
         pi = l0.l0.dports[0]
@@ -89,45 +149,56 @@ class NonProductionCore(Elaboratable):
 
         # link LoadStore1 into MMU
         mmu = self.fus.get_fu('mmu0')
+        ldst0 = self.fus.get_fu('ldst0')
         print ("core pspec", pspec.ldst_ifacetype)
         print ("core mmu", mmu)
-        print ("core lsmem.lsi", l0.cmpi.lsmem.lsi)
         if mmu is not None:
-            mmu.alu.set_ldst_interface(l0.cmpi.lsmem.lsi)
+            lsi = l0.cmpi.lsmem.lsi # a LoadStore1 Interface object
+            print ("core lsmem.lsi", lsi)
+            mmu.alu.set_ldst_interface(lsi)
+            # urr store I-Cache in core so it is easier to get at
+            self.icache = lsi.icache
+
+        # alternative reset values for STATE regs. these probably shouldn't
+        # be set, here, instead have them done by Issuer. which they are.
+        # as well. because core.state overrides them. sigh.
+        self.msr_at_reset = 0x0
+        self.pc_at_reset = 0x0
+        if hasattr(pspec, "msr_reset") and isinstance(pspec.msr_reset, int):
+            self.msr_at_reset = pspec.msr_reset
+        if hasattr(pspec, "pc_reset") and isinstance(pspec.pc_reset, int):
+            self.pc_at_reset = pspec.pc_reset
+        state_resets = [self.pc_at_reset,  # PC at reset
+                        self.msr_at_reset, # MSR at reset
+                        0x0,               # SVSTATE at reset
+                        0x0,               # DEC at reset
+                        0x0]               # TB at reset
 
         # register files (yes plural)
-        self.regs = RegFiles(pspec)
-
-        # instruction decoder - needs a Trap-capable Record (captures EINT etc.)
-        self.e = Decode2ToExecute1Type("core", opkls=IssuerDecode2ToOperand,
-                                regreduce_en=self.regreduce_en)
-
-        # SVP64 RA_OR_ZERO needs to know if the relevant EXTRA2/3 field is zero
-        self.sv_a_nz = Signal()
-
-        # state and raw instruction (and SVP64 ReMap fields)
-        self.state = CoreState("core")
-        self.raw_insn_i = Signal(32) # raw instruction
-        self.bigendian_i = Signal() # bigendian - TODO, set by MSR.BE
-        if self.svp64_en:
-            self.sv_rm = SVP64Rec(name="core_svp64_rm") # SVP64 RM field
-            self.is_svp64_mode = Signal() # set if SVP64 mode is enabled
-            self.use_svp64_ldst_dec = Signal() # use alternative LDST decoder
-            self.sv_pred_sm = Signal() # TODO: SIMD width
-            self.sv_pred_dm = Signal() # TODO: SIMD width
-
-        # issue/valid/busy signalling
-        self.ivalid_i = Signal(reset_less=True) # instruction is valid
-        self.issue_i = Signal(reset_less=True)
-        self.busy_o = Signal(name="corebusy_o", reset_less=True)
-
-        # start/stop and terminated signalling
-        self.core_terminate_o = Signal(reset=0)  # indicates stopped
-
-        # create per-FU instruction decoders (subsetted)
+        self.regs = RegFiles(pspec, make_hazard_vecs=self.make_hazard_vecs,
+                                    state_resets=state_resets)
+
+        # set up input and output: unusual requirement to set data directly
+        # (due to the way that the core is set up in a different domain,
+        # see TestIssuer.setup_peripherals
+        self.p.i_data, self.n.o_data = self.new_specs(None)
+        self.i, self.o = self.p.i_data, self.n.o_data
+
+        # actual internal input data used (captured)
+        self.ireg = self.ispec()
+
+        # create per-FU instruction decoders (subsetted).  these "satellite"
+        # decoders reduce wire fan-out from the one (main) PowerDecoder2
+        # (used directly by the trap unit) to the *twelve* (or more)
+        # Function Units.  we can either have 32 wires (the instruction)
+        # to each, or we can have well over a 200 wire fan-out (to 12
+        # ALUs). it's an easy choice to make.
         self.decoders = {}
         self.des = {}
 
+        # eep, these should be *per FU* i.e. for FunctionUnitBaseMulti
+        # they should be shared (put into the ALU *once*).
+
         for funame, fu in self.fus.fus.items():
             f_name = fu.fnunit.name
             fnunit = fu.fnunit.value
@@ -136,18 +207,43 @@ class NonProductionCore(Elaboratable):
                 # TRAP decoder is the *main* decoder
                 self.trapunit = funame
                 continue
+            assert funame not in self.decoders
             self.decoders[funame] = PowerDecodeSubset(None, opkls, f_name,
                                                       final=True,
-                                                      state=self.state,
+                                                      state=self.ireg.state,
                                             svp64_en=self.svp64_en,
                                             regreduce_en=self.regreduce_en)
             self.des[funame] = self.decoders[funame].do
+            print ("create decoder subset", funame, opkls, self.des[funame])
 
+        # create per-Function Unit write-after-write hazard signals
+        # yes, really, this should have been added in ReservationStations
+        # but hey.
+        for funame, fu in self.fus.fus.items():
+            fu._waw_hazard = Signal(name="waw_%s" % funame)
+
+        # share the SPR decoder with the MMU if it exists
         if "mmu0" in self.decoders:
             self.decoders["mmu0"].mmu0_spr_dec = self.decoders["spr0"]
 
+        # allow pausing of the DEC/TB FSM back in Issuer, by spotting
+        # if there is an MTSPR instruction
+        self.pause_dec_tb = Signal()
+
+    # next 3 functions are Stage API Compliance
+    def setup(self, m, i):
+        pass
+
+    def ispec(self):
+        return CoreInput(self.pspec, self.svp64_en, self.regreduce_en)
+
+    def ospec(self):
+        return CoreOutput()
+
+    # elaborate function to create HDL
     def elaborate(self, platform):
-        m = Module()
+        m = super().elaborate(platform)
+
         # for testing purposes, to cut down on build time in coriolis2
         if hasattr(self.pspec, "nocore") and self.pspec.nocore == True:
             x = Signal() # dummy signal
@@ -161,35 +257,63 @@ class NonProductionCore(Elaboratable):
         regs = self.regs
         fus = self.fus.fus
 
+        # amalgamate write-hazards into a single top-level Signal
+        self.waw_hazard = Signal()
+        whaz = []
+        for funame, fu in self.fus.fus.items():
+            whaz.append(fu._waw_hazard)
+        comb += self.waw_hazard.eq(Cat(*whaz).bool())
+
         # connect decoders
-        for k, v in self.decoders.items():
-            setattr(m.submodules, "dec_%s" % v.fn_name, v)
-            comb += v.dec.raw_opcode_in.eq(self.raw_insn_i)
-            comb += v.dec.bigendian.eq(self.bigendian_i)
-            # sigh due to SVP64 RA_OR_ZERO detection connect these too
-            comb += v.sv_a_nz.eq(self.sv_a_nz)
-            if self.svp64_en:
-                comb += v.pred_sm.eq(self.sv_pred_sm)
-                comb += v.pred_dm.eq(self.sv_pred_dm)
-                if k != self.trapunit:
-                    comb += v.sv_rm.eq(self.sv_rm) # pass through SVP64 ReMap
-                    comb += v.is_svp64_mode.eq(self.is_svp64_mode)
-                    # only the LDST PowerDecodeSubset *actually* needs to
-                    # know to use the alternative decoder.  this is all
-                    # a terrible hack
-                    if k.lower().startswith("ldst"):
-                        comb += v.use_svp64_ldst_dec.eq(self.use_svp64_ldst_dec)
+        self.connect_satellite_decoders(m)
 
         # ssh, cheat: trap uses the main decoder because of the rewriting
-        self.des[self.trapunit] = self.e.do
-
-        # connect up Function Units, then read/write ports
-        fu_bitdict = self.connect_instruction(m)
-        self.connect_rdports(m, fu_bitdict)
-        self.connect_wrports(m, fu_bitdict)
+        self.des[self.trapunit] = self.ireg.e.do
+
+        # connect up Function Units, then read/write ports, and hazard conflict
+        self.issue_conflict = Signal()
+        fu_bitdict, fu_selected = self.connect_instruction(m)
+        raw_hazard = self.connect_rdports(m, fu_bitdict, fu_selected)
+        self.connect_wrports(m, fu_bitdict, fu_selected)
+        if self.allow_overlap:
+            comb += self.issue_conflict.eq(raw_hazard)
+
+        # note if an exception happened.  in a pipelined or OoO design
+        # this needs to be accompanied by "shadowing" (or stalling)
+        el = []
+        for exc in self.fus.excs.values():
+            el.append(exc.happened)
+        if len(el) > 0: # at least one exception
+            comb += self.o.exc_happened.eq(Cat(*el).bool())
 
         return m
 
+    def connect_satellite_decoders(self, m):
+        comb = m.d.comb
+        for k, v in self.decoders.items():
+            # connect each satellite decoder and give it the instruction.
+            # as subset decoders this massively reduces wire fanout given
+            # the large number of ALUs
+            m.submodules["dec_%s" % k] = v
+            comb += v.dec.raw_opcode_in.eq(self.ireg.raw_insn_i)
+            comb += v.dec.bigendian.eq(self.ireg.bigendian_i)
+            # sigh due to SVP64 RA_OR_ZERO detection connect these too
+            comb += v.sv_a_nz.eq(self.ireg.sv_a_nz)
+            if not self.svp64_en:
+                continue
+            comb += v.pred_sm.eq(self.ireg.sv_pred_sm)
+            comb += v.pred_dm.eq(self.ireg.sv_pred_dm)
+            if k == self.trapunit:
+                continue
+            comb += v.sv_rm.eq(self.ireg.sv_rm) # pass through SVP64 RM
+            comb += v.is_svp64_mode.eq(self.ireg.is_svp64_mode)
+            # only the LDST PowerDecodeSubset *actually* needs to
+            # know to use the alternative decoder.  this is all
+            # a terrible hack
+            if not k.lower().startswith("ldst"):
+                continue
+            comb += v.use_svp64_ldst_dec.eq( self.ireg.use_svp64_ldst_dec)
+
     def connect_instruction(self, m):
         """connect_instruction
 
@@ -205,59 +329,216 @@ class NonProductionCore(Elaboratable):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
 
-        # enable-signals for each FU, get one bit for each FU (by name)
+        # indicate if core is busy
+        busy_o = self.o.busy_o
+        any_busy_o = self.o.any_busy_o
+
+        # connect up temporary copy of incoming instruction. the FSM will
+        # either blat the incoming instruction (if valid) into self.ireg
+        # or if the instruction could not be delivered, keep dropping the
+        # latched copy into ireg
+        ilatch = self.ispec()
+        self.instr_active = Signal()
+
+        # enable/busy-signals for each FU, get one bit for each FU (by name)
         fu_enable = Signal(len(fus), reset_less=True)
+        fu_busy = Signal(len(fus), reset_less=True)
         fu_bitdict = {}
+        fu_selected = {}
         for i, funame in enumerate(fus.keys()):
             fu_bitdict[funame] = fu_enable[i]
-
-        # enable the required Function Unit based on the opcode decode
-        # note: this *only* works correctly for simple core when one and
-        # *only* one FU is allocated per instruction
+            fu_selected[funame] = fu_busy[i]
+
+        # identify function units and create a list by fnunit so that
+        # PriorityPickers can be created for selecting one of them that
+        # isn't busy at the time the incoming instruction needs passing on
+        by_fnunit = defaultdict(list)
+        for fname, member in Function.__members__.items():
+            for funame, fu in fus.items():
+                fnunit = fu.fnunit.value
+                if member.value & fnunit: # this FU handles this type of op
+                    by_fnunit[fname].append((funame, fu)) # add by Function
+
+        # ok now just print out the list of FUs by Function, because we can
+        for fname, fu_list in by_fnunit.items():
+            print ("FUs by type", fname, fu_list)
+
+        # now create a PriorityPicker per FU-type such that only one
+        # non-busy FU will be picked
+        issue_pps = {}
+        fu_found = Signal() # take a note if no Function Unit was available
+        for fname, fu_list in by_fnunit.items():
+            i_pp = PriorityPicker(len(fu_list))
+            m.submodules['i_pp_%s' % fname] = i_pp
+            i_l = []
+            for i, (funame, fu) in enumerate(fu_list):
+                # match the decoded instruction (e.do.fn_unit) against the
+                # "capability" of this FU, gate that by whether that FU is
+                # busy, and drop that into the PriorityPicker.
+                # this will give us an output of the first available *non-busy*
+                # Function Unit (Reservation Statio) capable of handling this
+                # instruction.
+                fnunit = fu.fnunit.value
+                en_req = Signal(name="issue_en_%s" % funame, reset_less=True)
+                fnmatch = (self.ireg.e.do.fn_unit & fnunit).bool()
+                comb += en_req.eq(fnmatch & ~fu.busy_o &
+                                    self.instr_active)
+                i_l.append(en_req) # store in list for doing the Cat-trick
+                # picker output, gated by enable: store in fu_bitdict
+                po = Signal(name="o_issue_pick_"+funame) # picker output
+                comb += po.eq(i_pp.o[i] & i_pp.en_o)
+                comb += fu_bitdict[funame].eq(po)
+                comb += fu_selected[funame].eq(fu.busy_o | po)
+                # if we don't do this, then when there are no FUs available,
+                # the "p.o_ready" signal will go back "ok we accepted this
+                # instruction" which of course isn't true.
+                with m.If(i_pp.en_o):
+                    comb += fu_found.eq(1)
+            # for each input, Cat them together and drop them into the picker
+            comb += i_pp.i.eq(Cat(*i_l))
+
+        # rdmask, which is for registers needs to come from the *main* decoder
         for funame, fu in fus.items():
-            fnunit = fu.fnunit.value
-            enable = Signal(name="en_%s" % funame, reset_less=True)
-            comb += enable.eq((self.e.do.fn_unit & fnunit).bool())
-            comb += fu_bitdict[funame].eq(enable)
+            rdmask = get_rdflags(m, self.ireg.e, fu)
+            comb += fu.rdmaskn.eq(~rdmask)
 
         # sigh - need a NOP counter
         counter = Signal(2)
         with m.If(counter != 0):
             sync += counter.eq(counter - 1)
-            comb += self.busy_o.eq(1)
-
-        with m.If(self.ivalid_i): # run only when valid
-            with m.Switch(self.e.do.insn_type):
-                # check for ATTN: halt if true
-                with m.Case(MicrOp.OP_ATTN):
-                    m.d.sync += self.core_terminate_o.eq(1)
-
-                with m.Case(MicrOp.OP_NOP):
-                    sync += counter.eq(2)
-                    comb += self.busy_o.eq(1)
-
-                with m.Default():
-                    # connect up instructions.  only one enabled at a time
+            comb += busy_o.eq(1)
+
+        # default to reading from incoming instruction: may be overridden
+        # by copy from latch when "waiting"
+        comb += self.ireg.eq(self.i)
+        # always say "ready" except if overridden
+        comb += self.p.o_ready.eq(1)
+
+        with m.FSM():
+            with m.State("READY"):
+                with m.If(self.p.i_valid): # run only when valid
+                    with m.Switch(self.ireg.e.do.insn_type):
+                        # check for ATTN: halt if true
+                        with m.Case(MicrOp.OP_ATTN):
+                            m.d.sync += self.o.core_terminate_o.eq(1)
+
+                        # fake NOP - this isn't really used (Issuer detects NOP)
+                        with m.Case(MicrOp.OP_NOP):
+                            sync += counter.eq(2)
+                            comb += busy_o.eq(1)
+
+                        with m.Default():
+                            comb += self.instr_active.eq(1)
+                            comb += self.p.o_ready.eq(0)
+                            # connect instructions. only one enabled at a time
+                            for funame, fu in fus.items():
+                                do = self.des[funame]
+                                enable = fu_bitdict[funame]
+
+                                # run this FunctionUnit if enabled route op,
+                                # issue, busy, read flags and mask to FU
+                                with m.If(enable):
+                                    # operand comes from the *local*  decoder
+                                    # do not actually issue, though, if there
+                                    # is a waw hazard. decoder has to still
+                                    # be asserted in order to detect that, tho
+                                    comb += fu.oper_i.eq_from(do)
+                                    if funame == 'mmu0':
+                                        # URRR this is truly dreadful.
+                                        # OP_FETCH_FAILED is a "fake" op.
+                                        # no instruction creates it.  OP_TRAP
+                                        # uses the *main* decoder: this is
+                                        # a *Satellite* decoder that reacts
+                                        # on *insn_in*... not fake ops. gaah.
+                                        main_op = self.ireg.e.do
+                                        with m.If(main_op.insn_type ==
+                                                  MicrOp.OP_FETCH_FAILED):
+                                            comb += fu.oper_i.insn_type.eq(
+                                                  MicrOp.OP_FETCH_FAILED)
+                                            comb += fu.oper_i.fn_unit.eq(
+                                                  Function.MMU)
+                                    # issue when valid (and no write-hazard)
+                                    comb += fu.issue_i.eq(~self.waw_hazard)
+                                    # instruction ok, indicate ready
+                                    comb += self.p.o_ready.eq(1)
+
+                            if self.allow_overlap:
+                                with m.If(~fu_found | self.waw_hazard):
+                                    # latch copy of instruction
+                                    sync += ilatch.eq(self.i)
+                                    comb += self.p.o_ready.eq(1) # accept
+                                    comb += busy_o.eq(1)
+                                    m.next = "WAITING"
+
+            with m.State("WAITING"):
+                comb += self.instr_active.eq(1)
+                comb += self.p.o_ready.eq(0)
+                comb += busy_o.eq(1)
+                # using copy of instruction, keep waiting until an FU is free
+                comb += self.ireg.eq(ilatch)
+                with m.If(fu_found): # wait for conflict to clear
+                    # connect instructions. only one enabled at a time
                     for funame, fu in fus.items():
                         do = self.des[funame]
                         enable = fu_bitdict[funame]
 
-                        # run this FunctionUnit if enabled
-                        # route op, issue, busy, read flags and mask to FU
+                        # run this FunctionUnit if enabled route op,
+                        # issue, busy, read flags and mask to FU
                         with m.If(enable):
-                            # operand comes from the *local*  decoder
+                            # operand comes from the *local* decoder,
+                            # which is asserted even if not issued,
+                            # so that WaW-detection can check for hazards.
+                            # only if the waw hazard is clear does the
+                            # instruction actually get issued
                             comb += fu.oper_i.eq_from(do)
-                            #comb += fu.oper_i.eq_from_execute1(e)
-                            comb += fu.issue_i.eq(self.issue_i)
-                            comb += self.busy_o.eq(fu.busy_o)
-                            # rdmask, which is for registers, needs to come
-                            # from the *main* decoder
-                            rdmask = get_rdflags(self.e, fu)
-                            comb += fu.rdmaskn.eq(~rdmask)
-
-        return fu_bitdict
-
-    def connect_rdport(self, m, fu_bitdict, rdpickers, regfile, regname, fspec):
+                            # issue when valid
+                            comb += fu.issue_i.eq(~self.waw_hazard)
+                            with m.If(~self.waw_hazard):
+                                comb += self.p.o_ready.eq(1)
+                                comb += busy_o.eq(0)
+                                m.next = "READY"
+
+        print ("core: overlap allowed", self.allow_overlap)
+        # true when any FU is busy (including the cycle where it is perhaps
+        # to be issued - because that's what fu_busy is)
+        comb += any_busy_o.eq(fu_busy.bool())
+        if not self.allow_overlap:
+            # for simple non-overlap, if any instruction is busy, set
+            # busy output for core.
+            comb += busy_o.eq(any_busy_o)
+        else:
+            # sigh deal with a fun situation that needs to be investigated
+            # and resolved
+            with m.If(self.issue_conflict):
+                comb += busy_o.eq(1)
+            # make sure that LDST, SPR, MMU, Branch and Trap all say "busy"
+            # and do not allow overlap.  these are all the ones that
+            # are non-forward-progressing: exceptions etc. that otherwise
+            # change CoreState for some reason (MSR, PC, SVSTATE)
+            for funame, fu in fus.items():
+                if (funame.lower().startswith('ldst') or
+                    funame.lower().startswith('branch') or
+                    funame.lower().startswith('mmu') or
+                    funame.lower().startswith('spr') or
+                    funame.lower().startswith('trap')):
+                    with m.If(fu.busy_o):
+                        comb += busy_o.eq(1)
+                # for SPR pipeline pause dec/tb FSM to avoid race condition
+                # TODO: really this should be much more sophisticated,
+                # spot MTSPR, spot that DEC/TB is what is to be updated.
+                # a job for PowerDecoder2, there
+                if funame.lower().startswith('spr'):
+                    with m.If(fu.busy_o #& fu.oper_i.insn_type == OP_MTSPR
+                        ):
+                        comb += self.pause_dec_tb.eq(1)
+
+        # return both the function unit "enable" dict as well as the "busy".
+        # the "busy-or-issued" can be passed in to the Read/Write port
+        # connecters to give them permission to request access to regfiles
+        return fu_bitdict, fu_selected
+
+    def connect_rdport(self, m, fu_bitdict, fu_selected,
+                                rdpickers, regfile, regname, fspec):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
@@ -270,59 +551,108 @@ class NonProductionCore(Elaboratable):
         print("read regfile", rpidx, regfile, regs.rf.keys(),
                               rfile, rfile.unary)
 
+        # for checking if the read port has an outstanding write
+        if self.make_hazard_vecs:
+            wv = regs.wv[regfile.lower()]
+            wvchk = wv.q_int # write-vec bit-level hazard check
+
+        # if a hazard is detected on this read port, simply blithely block
+        # every FU from reading on it.  this is complete overkill but very
+        # simple for now.
+        hazard_detected = Signal(name="raw_%s_%s" % (regfile, rpidx))
+
         fspecs = fspec
         if not isinstance(fspecs, list):
             fspecs = [fspecs]
 
         rdflags = []
         pplen = 0
-        reads = []
         ppoffs = []
         for i, fspec in enumerate(fspecs):
             # get the regfile specs for this regfile port
-            (rf, read, write, wid, fuspec) = fspec
-            print ("fpsec", i, fspec, len(fuspec))
+            print ("fpsec", i, fspec, len(fspec.specs))
+            name = "%s_%s_%d" % (regfile, regname, i)
             ppoffs.append(pplen) # record offset for picker
-            pplen += len(fuspec)
-            name = "rdflag_%s_%s_%d" % (regfile, regname, i)
-            rdflag = Signal(name=name, reset_less=True)
-            comb += rdflag.eq(rf)
+            pplen += len(fspec.specs)
+            rdflag = Signal(name="rdflag_"+name, reset_less=True)
+            comb += rdflag.eq(fspec.okflag)
             rdflags.append(rdflag)
-            reads.append(read)
 
         print ("pplen", pplen)
 
         # create a priority picker to manage this port
         rdpickers[regfile][rpidx] = rdpick = PriorityPicker(pplen)
-        setattr(m.submodules, "rdpick_%s_%s" % (regfile, rpidx), rdpick)
+        m.submodules["rdpick_%s_%s" % (regfile, rpidx)] = rdpick
 
         rens = []
         addrs = []
+        wvens = []
+
         for i, fspec in enumerate(fspecs):
-            (rf, read, write, wid, fuspec) = fspec
+            (rf, _read, wid, fuspecs) = \
+                (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
             # connect up the FU req/go signals, and the reg-read to the FU
             # and create a Read Broadcast Bus
-            for pi, (funame, fu, idx) in enumerate(fuspec):
+            for pi, fuspec in enumerate(fspec.specs):
+                (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx)
                 pi += ppoffs[i]
+                name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi)
+                fu_active = fu_selected[funame]
+                fu_issued = fu_bitdict[funame]
+
+                # get (or set up) a latched copy of read register number
+                # and (sigh) also the read-ok flag
+                # TODO: use nmutil latchregister
+                rhname = "%s_%s_%d" % (regfile, regname, i)
+                rdflag = Signal(name="rdflag_%s_%s" % (funame, rhname),
+                                reset_less=True)
+                if rhname not in fu.rf_latches:
+                    rfl = Signal(name="rdflag_latch_%s_%s" % (funame, rhname))
+                    fu.rf_latches[rhname] = rfl
+                    with m.If(fu.issue_i):
+                        sync += rfl.eq(rdflags[i])
+                else:
+                    rfl = fu.rf_latches[rhname]
+
+                # now the register port
+                rname = "%s_%s_%s_%d" % (funame, regfile, regname, pi)
+                read = Signal.like(_read, name="read_"+rname)
+                if rname not in fu.rd_latches:
+                    rdl = Signal.like(_read, name="rdlatch_"+rname)
+                    fu.rd_latches[rname] = rdl
+                    with m.If(fu.issue_i):
+                        sync += rdl.eq(_read)
+                else:
+                    rdl = fu.rd_latches[rname]
+
+                # make the read immediately available on issue cycle
+                # after the read cycle, otherwies use the latched copy.
+                # this captures the regport and okflag on issue
+                with m.If(fu.issue_i):
+                    comb += read.eq(_read)
+                    comb += rdflag.eq(rdflags[i])
+                with m.Else():
+                    comb += read.eq(rdl)
+                    comb += rdflag.eq(rfl)
 
                 # connect request-read to picker input, and output to go-rd
-                fu_active = fu_bitdict[funame]
-                name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi)
-                addr_en = Signal.like(reads[i], name="addr_en_"+name)
+                addr_en = Signal.like(read, name="addr_en_"+name)
                 pick = Signal(name="pick_"+name)     # picker input
                 rp = Signal(name="rp_"+name)         # picker output
                 delay_pick = Signal(name="dp_"+name) # read-enable "underway"
+                rhazard = Signal(name="rhaz_"+name)
 
                 # exclude any currently-enabled read-request (mask out active)
-                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflags[i] &
-                                ~delay_pick)
+                # entirely block anything hazarded from being picked
+                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflag &
+                                ~delay_pick & ~rhazard)
                 comb += rdpick.i[pi].eq(pick)
                 comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick
 
                 # if picked, select read-port "reg select" number to port
                 comb += rp.eq(rdpick.o[pi] & rdpick.en_o)
                 sync += delay_pick.eq(rp) # delayed "pick"
-                comb += addr_en.eq(Mux(rp, reads[i], 0))
+                comb += addr_en.eq(Mux(rp, read, 0))
 
                 # the read-enable happens combinatorially (see mux-bus below)
                 # but it results in the data coming out on a one-cycle delay.
@@ -338,9 +668,35 @@ class NonProductionCore(Elaboratable):
                     src = fu.src_i[idx]
                     print("reg connect widths",
                           regfile, regname, pi, funame,
-                          src.shape(), rport.data_o.shape())
+                          src.shape(), rport.o_data.shape())
                     # all FUs connect to same port
-                    comb += src.eq(rport.data_o)
+                    comb += src.eq(rport.o_data)
+
+                if not self.make_hazard_vecs:
+                    continue
+
+                # read the write-hazard bitvector (wv) for any bit that is
+                wvchk_en = Signal(len(wvchk), name="wv_chk_addr_en_"+name)
+                issue_active = Signal(name="rd_iactive_"+name)
+                # XXX combinatorial loop here
+                comb += issue_active.eq(fu_active & rdflag)
+                with m.If(issue_active):
+                    if rfile.unary:
+                        comb += wvchk_en.eq(read)
+                    else:
+                        comb += wvchk_en.eq(1<<read)
+                # if FU is busy (which doesn't get set at the same time as
+                # issue) and no hazard was detected, clear wvchk_en (i.e.
+                # stop checking for hazards).  there is a loop here, but it's
+                # via a DFF, so is ok. some linters may complain, but hey.
+                with m.If(fu.busy_o & ~rhazard):
+                        comb += wvchk_en.eq(0)
+
+                # read-hazard is ANDed with (filtered by) what is actually
+                # being requested.
+                comb += rhazard.eq((wvchk & wvchk_en).bool())
+
+                wvens.append(wvchk_en)
 
         # or-reduce the muxed read signals
         if rfile.unary:
@@ -352,7 +708,17 @@ class NonProductionCore(Elaboratable):
             comb += rport.ren.eq(Cat(*rens).bool())
             print ("binary", regfile, rpidx, rport, rport.ren, rens, addrs)
 
-    def connect_rdports(self, m, fu_bitdict):
+        if not self.make_hazard_vecs:
+            return Const(0) # declare "no hazards"
+
+        # enable the read bitvectors for this issued instruction
+        # and return whether any write-hazard bit is set
+        wvchk_and = Signal(len(wvchk), name="wv_chk_"+name)
+        comb += wvchk_and.eq(wvchk & ortreereduce_sig(wvens))
+        comb += hazard_detected.eq(wvchk_and.bool())
+        return hazard_detected
+
+    def connect_rdports(self, m, fu_bitdict, fu_selected):
         """connect read ports
 
         orders the read regspecs into a dict-of-dicts, by regfile, by
@@ -362,15 +728,15 @@ class NonProductionCore(Elaboratable):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
+        rd_hazard = []
 
         # dictionary of lists of regfile read ports
-        byregfiles_rd, byregfiles_rdspec = self.get_byregfiles(True)
+        byregfiles_rdspec = self.get_byregfiles(m, True)
 
         # okaay, now we need a PriorityPicker per regfile per regfile port
         # loootta pickers... peter piper picked a pack of pickled peppers...
         rdpickers = {}
-        for regfile, spec in byregfiles_rd.items():
-            fuspecs = byregfiles_rdspec[regfile]
+        for regfile, fuspecs in byregfiles_rdspec.items():
             rdpickers[regfile] = {}
 
             # argh.  an experiment to merge RA and RB in the INT regfile
@@ -388,24 +754,110 @@ class NonProductionCore(Elaboratable):
                         fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
             # for each named regfile port, connect up all FUs to that port
+            # also return (and collate) hazard detection)
             for (regname, fspec) in sort_fuspecs(fuspecs):
                 print("connect rd", regname, fspec)
-                self.connect_rdport(m, fu_bitdict, rdpickers, regfile,
+                rh = self.connect_rdport(m, fu_bitdict, fu_selected,
+                                       rdpickers, regfile,
                                        regname, fspec)
+                rd_hazard.append(rh)
+
+        return Cat(*rd_hazard).bool()
+
+    def make_hazards(self, m, regfile, rfile, wvclr, wvset,
+                    funame, regname, idx,
+                    addr_en, wp, fu, fu_active, wrflag, write,
+                    fu_wrok):
+        """make_hazards: a setter and a clearer for the regfile write ports
 
-    def connect_wrport(self, m, fu_bitdict, wrpickers, regfile, regname, fspec):
+        setter is at issue time (using PowerDecoder2 regfile write numbers)
+        clearer is at regfile write time (when FU has said what to write to)
+
+        there is *one* unusual case here which has to be dealt with:
+        when the Function Unit does *NOT* request a write to the regfile
+        (has its data.ok bit CLEARED).  this is perfectly legitimate.
+        and a royal pain.
+        """
+        comb, sync = m.d.comb, m.d.sync
+        name = "%s_%s_%d" % (funame, regname, idx)
+
+        # connect up the bitvector write hazard.  unlike the
+        # regfile writeports, a ONE must be written to the corresponding
+        # bit of the hazard bitvector (to indicate the existence of
+        # the hazard)
+
+        # the detection of what shall be written to is based
+        # on *issue*.  it is delayed by 1 cycle so that instructions
+        # "addi 5,5,0x2" do not cause combinatorial loops due to
+        # fake-dependency on *themselves*.  this will totally fail
+        # spectacularly when doing multi-issue
+        print ("write vector (for regread)", regfile, wvset)
+        wviaddr_en = Signal(len(wvset), name="wv_issue_addr_en_"+name)
+        issue_active = Signal(name="iactive_"+name)
+        sync += issue_active.eq(fu.issue_i & fu_active & wrflag)
+        with m.If(issue_active):
+            if rfile.unary:
+                comb += wviaddr_en.eq(write)
+            else:
+                comb += wviaddr_en.eq(1<<write)
+
+        # deal with write vector clear: this kicks in when the regfile
+        # is written to, and clears the corresponding bitvector entry
+        print ("write vector", regfile, wvclr)
+        wvaddr_en = Signal(len(wvclr), name="wvaddr_en_"+name)
+        if rfile.unary:
+            comb += wvaddr_en.eq(addr_en)
+        else:
+            with m.If(wp):
+                comb += wvaddr_en.eq(1<<addr_en)
+
+        # XXX ASSUME that LDSTFunctionUnit always sets the data it intends to
+        # this may NOT be the case when an exception occurs
+        if isinstance(fu, LDSTFunctionUnit):
+            return wvaddr_en, wviaddr_en
+
+        # okaaay, this is preparation for the awkward case.
+        # * latch a copy of wrflag when issue goes high.
+        # * when the fu_wrok (data.ok) flag is NOT set,
+        #   but the FU is done, the FU is NEVER going to write
+        #   so the bitvector has to be cleared.
+        latch_wrflag = Signal(name="latch_wrflag_"+name)
+        with m.If(~fu.busy_o):
+            sync += latch_wrflag.eq(0)
+        with m.If(fu.issue_i & fu_active):
+            sync += latch_wrflag.eq(wrflag)
+        with m.If(fu.alu_done_o & latch_wrflag & ~fu_wrok):
+            if rfile.unary:
+                comb += wvaddr_en.eq(write) # addr_en gated with wp, don't use
+            else:
+                comb += wvaddr_en.eq(1<<addr_en) # binary addr_en not gated
+
+        return wvaddr_en, wviaddr_en
+
+    def connect_wrport(self, m, fu_bitdict, fu_selected,
+                                wrpickers, regfile, regname, fspec):
         comb, sync = m.d.comb, m.d.sync
         fus = self.fus.fus
         regs = self.regs
 
-        print("connect wr", regname, fspec)
         rpidx = regname
 
         # select the required write port.  these are pre-defined sizes
-        print(regfile, regs.rf.keys())
         rfile = regs.rf[regfile.lower()]
         wport = rfile.w_ports[rpidx]
 
+        print("connect wr", regname, "unary", rfile.unary, fspec)
+        print(regfile, regs.rf.keys())
+
+        # select the write-protection hazard vector.  note that this still
+        # requires to WRITE to the hazard bitvector!  read-requests need
+        # to RAISE the bitvector (set it to 1), which, duh, requires a WRITE
+        if self.make_hazard_vecs:
+            wv = regs.wv[regfile.lower()]
+            wvset = wv.s # write-vec bit-level hazard ctrl
+            wvclr = wv.r # write-vec bit-level hazard ctrl
+            wvchk = wv.q # write-after-write hazard check
+
         fspecs = fspec
         if not isinstance(fspecs, list):
             fspecs = [fspecs]
@@ -413,47 +865,84 @@ class NonProductionCore(Elaboratable):
         pplen = 0
         writes = []
         ppoffs = []
+        wrflags = []
         for i, fspec in enumerate(fspecs):
             # get the regfile specs for this regfile port
-            (rf, read, write, wid, fuspec) = fspec
-            print ("fpsec", i, fspec, len(fuspec))
+            (wf, _write, wid, fuspecs) = \
+                (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
+            print ("fpsec", i, "wrflag", wf, fspec, len(fuspecs))
             ppoffs.append(pplen) # record offset for picker
-            pplen += len(fuspec)
+            pplen += len(fuspecs)
+
+            name = "%s_%s_%d" % (regfile, regname, i)
+            wrflag = Signal(name="wr_flag_"+name)
+            if wf is not None:
+                comb += wrflag.eq(wf)
+            else:
+                comb += wrflag.eq(0)
+            wrflags.append(wrflag)
 
         # create a priority picker to manage this port
         wrpickers[regfile][rpidx] = wrpick = PriorityPicker(pplen)
-        setattr(m.submodules, "wrpick_%s_%s" % (regfile, rpidx), wrpick)
+        m.submodules["wrpick_%s_%s" % (regfile, rpidx)] = wrpick
 
         wsigs = []
         wens = []
+        wvsets = []
+        wvseten = []
+        wvclren = []
+        #wvens = [] - not needed: reading of writevec is permanently held hi
         addrs = []
         for i, fspec in enumerate(fspecs):
             # connect up the FU req/go signals and the reg-read to the FU
             # these are arbitrated by Data.ok signals
-            (rf, read, write, wid, fuspec) = fspec
-            for pi, (funame, fu, idx) in enumerate(fuspec):
+            (wf, _write, wid, fuspecs) = \
+                (fspec.okflag, fspec.regport, fspec.wid, fspec.specs)
+            for pi, fuspec in enumerate(fspec.specs):
+                (funame, fu, idx) = (fuspec.funame, fuspec.fu, fuspec.idx)
+                fu_requested = fu_bitdict[funame]
                 pi += ppoffs[i]
+                name = "%s_%s_%s_%d" % (funame, regfile, regname, idx)
+                # get (or set up) a write-latched copy of write register number
+                write = Signal.like(_write, name="write_"+name)
+                rname = "%s_%s_%s_%d" % (funame, regfile, regname, idx)
+                if rname not in fu.wr_latches:
+                    wrl = Signal.like(_write, name="wrlatch_"+rname)
+                    fu.wr_latches[rname] = write
+                    # do not depend on fu.issue_i here, it creates a
+                    # combinatorial loop on waw checking. using the FU
+                    # "enable" bitdict entry for this FU is sufficient,
+                    # because the PowerDecoder2 read/write nums are
+                    # valid continuously when the instruction is valid
+                    with m.If(fu_requested):
+                        sync += wrl.eq(_write)
+                        comb += write.eq(_write)
+                    with m.Else():
+                        comb += write.eq(wrl)
+                else:
+                    write = fu.wr_latches[rname]
 
                 # write-request comes from dest.ok
                 dest = fu.get_out(idx)
                 fu_dest_latch = fu.get_fu_out(idx)  # latched output
-                name = "wrflag_%s_%s_%d" % (funame, regname, idx)
-                wrflag = Signal(name=name, reset_less=True)
-                comb += wrflag.eq(dest.ok & fu.busy_o)
+                name = "%s_%s_%d" % (funame, regname, idx)
+                fu_wrok = Signal(name="fu_wrok_"+name, reset_less=True)
+                comb += fu_wrok.eq(dest.ok & fu.busy_o)
 
                 # connect request-write to picker input, and output to go-wr
-                fu_active = fu_bitdict[funame]
-                pick = fu.wr.rel_o[idx] & fu_active  # & wrflag
+                fu_active = fu_selected[funame]
+                pick = fu.wr.rel_o[idx] & fu_active
                 comb += wrpick.i[pi].eq(pick)
                 # create a single-pulse go write from the picker output
-                wr_pick = Signal()
+                wr_pick = Signal(name="wpick_%s_%s_%d" % (funame, regname, idx))
                 comb += wr_pick.eq(wrpick.o[pi] & wrpick.en_o)
                 comb += fu.go_wr_i[idx].eq(rising_edge(m, wr_pick))
 
                 # connect the regspec write "reg select" number to this port
                 # only if one FU actually requests (and is granted) the port
                 # will the write-enable be activated
-                addr_en = Signal.like(write)
+                wname = "waddr_en_%s_%s_%d" % (funame, regname, idx)
+                addr_en = Signal.like(write, name=wname)
                 wp = Signal()
                 comb += wp.eq(wr_pick & wrpick.en_o)
                 comb += addr_en.eq(Mux(wp, write, 0))
@@ -466,11 +955,60 @@ class NonProductionCore(Elaboratable):
                 # connect regfile port to input
                 print("reg connect widths",
                       regfile, regname, pi, funame,
-                      dest.shape(), wport.data_i.shape())
+                      dest.shape(), wport.i_data.shape())
                 wsigs.append(fu_dest_latch)
 
+                # now connect up the bitvector write hazard
+                if not self.make_hazard_vecs:
+                    continue
+                res = self.make_hazards(m, regfile, rfile, wvclr, wvset,
+                                        funame, regname, idx,
+                                        addr_en, wp, fu, fu_active,
+                                        wrflags[i], write, fu_wrok)
+                wvaddr_en, wv_issue_en = res
+                wvclren.append(wvaddr_en)   # set only: no data => clear bit
+                wvseten.append(wv_issue_en) # set data same as enable
+
+                # read the write-hazard bitvector (wv) for any bit that is
+                fu_requested = fu_bitdict[funame]
+                wvchk_en = Signal(len(wvchk), name="waw_chk_addr_en_"+name)
+                issue_active = Signal(name="waw_iactive_"+name)
+                whazard = Signal(name="whaz_"+name)
+                if wf is None:
+                    # XXX EEK! STATE regfile (branch) does not have an
+                    # write-active indicator in regspec_decode_write()
+                    print ("XXX FIXME waw_iactive", issue_active,
+                                                    fu_requested, wf)
+                else:
+                    # check bits from the incoming instruction.  note (back
+                    # in connect_instruction) that the decoder is held for
+                    # us to be able to do this, here... *without* issue being
+                    # held HI.  we MUST NOT gate this with fu.issue_i or
+                    # with fu_bitdict "enable": it would create a loop
+                    comb += issue_active.eq(wf)
+                with m.If(issue_active):
+                    if rfile.unary:
+                        comb += wvchk_en.eq(write)
+                    else:
+                        comb += wvchk_en.eq(1<<write)
+                # if FU is busy (which doesn't get set at the same time as
+                # issue) and no hazard was detected, clear wvchk_en (i.e.
+                # stop checking for hazards).  there is a loop here, but it's
+                # via a DFF, so is ok. some linters may complain, but hey.
+                with m.If(fu.busy_o & ~whazard):
+                        comb += wvchk_en.eq(0)
+
+                # write-hazard is ANDed with (filtered by) what is actually
+                # being requested.  the wvchk data is on a one-clock delay,
+                # and wvchk_en comes directly from the main decoder
+                comb += whazard.eq((wvchk & wvchk_en).bool())
+                with m.If(whazard):
+                    comb += fu._waw_hazard.eq(1)
+
+                #wvens.append(wvchk_en)
+
         # here is where we create the Write Broadcast Bus. simple, eh?
-        comb += wport.data_i.eq(ortreereduce_sig(wsigs))
+        comb += wport.i_data.eq(ortreereduce_sig(wsigs))
         if rfile.unary:
             # for unary-addressed
             comb += wport.wen.eq(ortreereduce_sig(wens))
@@ -479,7 +1017,22 @@ class NonProductionCore(Elaboratable):
             comb += wport.addr.eq(ortreereduce_sig(addrs))
             comb += wport.wen.eq(ortreereduce_sig(wens))
 
-    def connect_wrports(self, m, fu_bitdict):
+        if not self.make_hazard_vecs:
+            return [], []
+
+        # return these here rather than set wvclr/wvset directly,
+        # because there may be more than one write-port to a given
+        # regfile.  example: XER has a write-port for SO, CA, and OV
+        # and the *last one added* of those would overwrite the other
+        # two.  solution: have connect_wrports collate all the
+        # or-tree-reduced bitvector set/clear requests and drop them
+        # in as a single "thing".  this can only be done because the
+        # set/get is an unary bitvector.
+        print ("make write-vecs", regfile, regname, wvset, wvclr)
+        return (wvclren, # clear (regfile write)
+                wvseten) # set (issue time)
+
+    def connect_wrports(self, m, fu_bitdict, fu_selected):
         """connect write ports
 
         orders the write regspecs into a dict-of-dicts, by regfile,
@@ -494,13 +1047,14 @@ class NonProductionCore(Elaboratable):
         fus = self.fus.fus
         regs = self.regs
         # dictionary of lists of regfile write ports
-        byregfiles_wr, byregfiles_wrspec = self.get_byregfiles(False)
+        byregfiles_wrspec = self.get_byregfiles(m, False)
 
         # same for write ports.
         # BLECH!  complex code-duplication! BLECH!
         wrpickers = {}
-        for regfile, spec in byregfiles_wr.items():
-            fuspecs = byregfiles_wrspec[regfile]
+        wvclrers = defaultdict(list)
+        wvseters = defaultdict(list)
+        for regfile, fuspecs in byregfiles_wrspec.items():
             wrpickers[regfile] = {}
 
             if self.regreduce_en:
@@ -515,65 +1069,95 @@ class NonProductionCore(Elaboratable):
                     if 'fast3' in fuspecs:
                         fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
+            # collate these and record them by regfile because there
+            # are sometimes more write-ports per regfile
             for (regname, fspec) in sort_fuspecs(fuspecs):
-                self.connect_wrport(m, fu_bitdict, wrpickers,
+                wvclren, wvseten = self.connect_wrport(m,
+                                        fu_bitdict, fu_selected,
+                                        wrpickers,
                                         regfile, regname, fspec)
-
-    def get_byregfiles(self, readmode):
+                wvclrers[regfile.lower()] += wvclren
+                wvseters[regfile.lower()] += wvseten
+
+        if not self.make_hazard_vecs:
+            return
+
+        # for write-vectors: reduce the clr-ers and set-ers down to
+        # a single set of bits.  otherwise if there are two write
+        # ports (on some regfiles), the last one doing comb += on
+        # the reg.wv[regfile] instance "wins" (and all others are ignored,
+        # whoops).  if there was only one write-port per wv regfile this would
+        # not be an issue.
+        for regfile in wvclrers.keys():
+            wv = regs.wv[regfile]
+            wvset = wv.s # write-vec bit-level hazard ctrl
+            wvclr = wv.r # write-vec bit-level hazard ctrl
+            wvclren = wvclrers[regfile]
+            wvseten = wvseters[regfile]
+            comb += wvclr.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
+            comb += wvset.eq(ortreereduce_sig(wvseten)) # set (issue time)
+
+    def get_byregfiles(self, m, readmode):
 
         mode = "read" if readmode else "write"
         regs = self.regs
         fus = self.fus.fus
-        e = self.e # decoded instruction to execute
+        e = self.ireg.e # decoded instruction to execute
+
+        # dictionary of dictionaries of lists/tuples of regfile ports.
+        # first key: regfile.  second key: regfile port name
+        byregfiles_spec = defaultdict(dict)
 
-        # dictionary of lists of regfile ports
-        byregfiles = {}
-        byregfiles_spec = {}
         for (funame, fu) in fus.items():
+            # create in each FU a receptacle for the read/write register
+            # hazard numbers (and okflags for read).  to be latched in
+            # connect_rd/write_ports
+            if readmode:
+                fu.rd_latches = {} # read reg number latches
+                fu.rf_latches = {} # read flag latches
+            else:
+                fu.wr_latches = {}
+
+            # construct regfile specs: read uses inspec, write outspec
             print("%s ports for %s" % (mode, funame))
             for idx in range(fu.n_src if readmode else fu.n_dst):
-                if readmode:
-                    (regfile, regname, wid) = fu.get_in_spec(idx)
-                else:
-                    (regfile, regname, wid) = fu.get_out_spec(idx)
+                (regfile, regname, wid) = fu.get_io_spec(readmode, idx)
                 print("    %d %s %s %s" % (idx, regfile, regname, str(wid)))
-                if readmode:
-                    rdflag, read = regspec_decode_read(e, regfile, regname)
-                    write = None
-                else:
-                    rdflag, read = None, None
-                    wrport, write = regspec_decode_write(e, regfile, regname)
-                if regfile not in byregfiles:
-                    byregfiles[regfile] = {}
-                    byregfiles_spec[regfile] = {}
+
+                # the PowerDecoder2 (main one, not the satellites) contains
+                # the decoded regfile numbers. obtain these now
+                decinfo = regspec_decode(m, readmode, e, regfile, regname)
+                okflag, regport = decinfo.okflag, decinfo.regport
+
+                # construct the dictionary of regspec information by regfile
                 if regname not in byregfiles_spec[regfile]:
                     byregfiles_spec[regfile][regname] = \
-                        (rdflag, read, write, wid, [])
-                # here we start to create "lanes"
-                if idx not in byregfiles[regfile]:
-                    byregfiles[regfile][idx] = []
-                fuspec = (funame, fu, idx)
-                byregfiles[regfile][idx].append(fuspec)
-                byregfiles_spec[regfile][regname][4].append(fuspec)
-
-        # ok just print that out, for convenience
-        for regfile, spec in byregfiles.items():
+                        ByRegSpec(okflag, regport, wid, [])
+
+                # here we start to create "lanes" where each Function Unit
+                # requiring access to a given [single-contended resource]
+                # regfile port is appended to a list, so that PriorityPickers
+                # can be created to give uncontested access to it
+                fuspec = FUSpec(funame, fu, idx)
+                byregfiles_spec[regfile][regname].specs.append(fuspec)
+
+        # ok just print that all out, for convenience
+        for regfile, fuspecs in byregfiles_spec.items():
             print("regfile %s ports:" % mode, regfile)
-            fuspecs = byregfiles_spec[regfile]
             for regname, fspec in fuspecs.items():
-                [rdflag, read, write, wid, fuspec] = fspec
+                [okflag, regport, wid, fuspecs] = fspec
                 print("  rf %s port %s lane: %s" % (mode, regfile, regname))
-                print("  %s" % regname, wid, read, write, rdflag)
-                for (funame, fu, idx) in fuspec:
+                print("  %s" % regname, wid, okflag, regport)
+                for (funame, fu, idx) in fuspecs:
                     fusig = fu.src_i[idx] if readmode else fu.dest[idx]
-                    print("    ", funame, fu, idx, fusig)
+                    print("    ", funame, fu.__class__.__name__, idx, fusig)
                     print()
 
-        return byregfiles, byregfiles_spec
+        return byregfiles_spec
 
     def __iter__(self):
         yield from self.fus.ports()
-        yield from self.e.ports()
+        yield from self.i.e.ports()
         yield from self.l0.ports()
         # TODO: regs
 
@@ -584,7 +1168,8 @@ class NonProductionCore(Elaboratable):
 if __name__ == '__main__':
     pspec = TestMemPspec(ldst_ifacetype='testpi',
                          imem_ifacetype='',
-                         addr_wid=48,
+                         addr_wid=64,
+                         allow_overlap=True,
                          mask_wid=8,
                          reg_wid=64)
     dut = NonProductionCore(pspec)
diff --git a/src/soc/simple/core_data.py b/src/soc/simple/core_data.py
new file mode 100644 (file)
index 0000000..8cc0b34
--- /dev/null
@@ -0,0 +1,131 @@
+"""simple core input data
+
+"""
+
+from nmigen import Signal
+
+from openpower.sv.svp64 import SVP64Rec
+
+from openpower.decoder.decode2execute1 import Decode2ToExecute1Type
+from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
+from soc.config.state import CoreState
+
+
+class FetchInput:
+    """FetchInput: the input to the Fetch Unit
+
+    * pc - the current Program Counter
+
+    pretty much it for now!
+
+    """
+    def __init__(self):
+
+        self.pc = Signal(64)
+        self.msr = Signal(64)
+
+    def eq(self, i):
+        return [self.pc.eq(i.pc), self.msr.eq(i.msr),
+               ]
+
+
+class FetchOutput:
+    """FetchOutput: the output from the fetch unit: one single instruction
+
+    * state.  this contains PC, MSR, and SVSTATE. this is crucial information.
+      (TODO: bigendian_i should really be read from the relevant MSR bit)
+
+    * the raw instruction.  no decoding has been done - at all.
+
+      (TODO: provide a *pair* of raw instructions so that packet
+       inspection can be done, and SVP64 decoding and future 64-bit
+       prefix analysis carried out.  however right now that is *not*
+       the focus)
+    """
+    def __init__(self): #, svp64_en):
+        #self.svp64_en = svp64_en
+
+        # state and raw instruction (and SVP64 ReMap fields)
+        self.state = CoreState("core_fetched")
+        self.raw_insn_i = Signal(32) # one raw instruction
+        self.bigendian_i = Signal() # bigendian - TODO, set by MSR.BE
+
+    def eq(self, i):
+        return [self.state.eq(i.state),
+                self.raw_insn_i.eq(i.raw_insn_i),
+                self.bigendian_i.eq(i.bigendian_i),
+               ]
+
+
+class CoreInput:
+    """CoreInput: this is the input specification for Signals coming into core.
+
+    * state.  this contains PC, MSR, and SVSTATE. this is crucial information.
+      (TODO: bigendian_i should really be read from the relevant MSR bit)
+
+    * the previously-decoded instruction goes into the Decode2Execute1Type
+      data structure. no need for Core to re-decode that.  however note
+      that *satellite* decoders *are* part of Core.
+
+    * the raw instruction. this is used by satellite decoders internal to
+      Core, to provide Function-Unit-specific information.  really, they
+      should be part of the actual ALU itself (in order to reduce wires),
+      but hey.
+
+    * other stuff is related to SVP64.  the 24-bit SV REMAP field containing
+      Vector context, etc.
+    """
+    def __init__(self, pspec, svp64_en, regreduce_en):
+        self.pspec = pspec
+        self.svp64_en = svp64_en
+        self.e = Decode2ToExecute1Type("core", opkls=IssuerDecode2ToOperand,
+                                regreduce_en=regreduce_en)
+
+        # SVP64 RA_OR_ZERO needs to know if the relevant EXTRA2/3 field is zero
+        self.sv_a_nz = Signal()
+
+        # state and raw instruction (and SVP64 ReMap fields)
+        self.state = CoreState("core")
+        self.raw_insn_i = Signal(32) # raw instruction
+        self.bigendian_i = Signal() # bigendian - TODO, set by MSR.BE
+        if svp64_en:
+            self.sv_rm = SVP64Rec(name="core_svp64_rm") # SVP64 RM field
+            self.is_svp64_mode = Signal() # set if SVP64 mode is enabled
+            self.use_svp64_ldst_dec = Signal() # use alternative LDST decoder
+            self.sv_pred_sm = Signal() # TODO: SIMD width
+            self.sv_pred_dm = Signal() # TODO: SIMD width
+
+    def eq(self, i):
+        res = [self.e.eq(i.e),
+                self.sv_a_nz.eq(i.sv_a_nz),
+                self.state.eq(i.state),
+                self.raw_insn_i.eq(i.raw_insn_i),
+                self.bigendian_i.eq(i.bigendian_i),
+               ]
+        if not self.svp64_en:
+            return res
+        res += [ self.sv_rm.eq(i.sv_rm),
+                self.is_svp64_mode.eq(i.is_svp64_mode),
+                self.use_svp64_ldst_dec.eq(i.use_svp64_ldst_dec),
+                self.sv_pred_sm.eq(i.sv_pred_sm),
+                self.sv_pred_dm.eq(i.sv_pred_dm),
+                ]
+        return res
+
+
+class CoreOutput:
+    def __init__(self):
+        # start/stop and terminated signalling
+        self.core_terminate_o = Signal()  # indicates stopped
+        self.busy_o = Signal(name="corebusy_o")  # ALU is busy, no input
+        self.any_busy_o = Signal(name="any_busy_o")  # at least one ALU busy
+        self.exc_happened = Signal()             # exception happened
+
+    def eq(self, i):
+        return [self.core_terminate_o.eq(i.core_terminate_o),
+                self.busy_o.eq(i.busy_o),
+                self.any_busy_o.eq(i.any_busy_o),
+                self.exc_happened.eq(i.exc_happened),
+               ]
+
+
diff --git a/src/soc/simple/inorder.py b/src/soc/simple/inorder.py
new file mode 100644 (file)
index 0000000..03a101a
--- /dev/null
@@ -0,0 +1,532 @@
+"""simple core issuer
+
+not in any way intended for production use.  this runs a FSM that:
+
+* reads the Program Counter from StateRegs
+* reads an instruction from a fixed-size Test Memory
+* issues it to the Simple Core
+* waits for it to complete
+* increments the PC
+* does it all over again
+
+the purpose of this module is to verify the functional correctness
+of the Function Units in the absolute simplest and clearest possible
+way, and to at provide something that can be further incrementally
+improved.
+"""
+
+from nmigen import (Elaboratable, Module, Signal,
+                    Mux, Const, Repl, Cat)
+from nmigen.cli import rtlil
+from nmigen.cli import main
+import sys
+
+from nmutil.singlepipe import ControlBase
+from soc.simple.core_data import FetchOutput, FetchInput
+
+from openpower.consts import MSR
+from openpower.decoder.power_enums import MicrOp
+from openpower.state import CoreState
+from soc.regfile.regfiles import StateRegs
+from soc.config.test.test_loadstore import TestMemPspec
+from soc.experiment.icache import ICache
+
+from nmutil.util import rising_edge
+
+from soc.simple.issuer import TestIssuerBase
+
+def get_insn(f_instr_o, pc):
+    if f_instr_o.width == 32:
+        return f_instr_o
+    else:
+        # 64-bit: bit 2 of pc decides which word to select
+        return f_instr_o.word_select(pc[2], 32)
+
+
+# Fetch Finite State Machine.
+# WARNING: there are currently DriverConflicts but it's actually working.
+# TODO, here: everything that is global in nature, information from the
+# main TestIssuerInternal, needs to move to either ispec() or ospec().
+# not only that: TestIssuerInternal.imem can entirely move into here
+# because imem is only ever accessed inside the FetchFSM.
+class FetchFSM(ControlBase):
+    def __init__(self, allow_overlap, imem, core_rst,
+                 pdecode2, cur_state,
+                 dbg, core, svstate, nia):
+        self.allow_overlap = allow_overlap
+        self.imem = imem
+        self.core_rst = core_rst
+        self.pdecode2 = pdecode2
+        self.cur_state = cur_state
+        self.dbg = dbg
+        self.core = core
+        self.svstate = svstate
+        self.nia = nia
+
+        # set up pipeline ControlBase and allocate i/o specs
+        # (unusual: normally done by the Pipeline API)
+        super().__init__(stage=self)
+        self.p.i_data, self.n.o_data = self.new_specs(None)
+        self.i, self.o = self.p.i_data, self.n.o_data
+
+    # next 3 functions are Stage API Compliance
+    def setup(self, m, i):
+        pass
+
+    def ispec(self):
+        return FetchInput()
+
+    def ospec(self):
+        return FetchOutput()
+
+    def elaborate(self, platform):
+        """fetch FSM
+
+        this FSM performs fetch of raw instruction data, partial-decodes
+        it 32-bit at a time to detect SVP64 prefixes, and will optionally
+        read a 2nd 32-bit quantity if that occurs.
+        """
+        m = super().elaborate(platform)
+
+        dbg = self.dbg
+        core = self.core
+        pc = self.i.pc
+        msr = self.i.msr
+        svstate = self.svstate
+        nia = self.nia
+        fetch_pc_o_ready = self.p.o_ready
+        fetch_pc_i_valid = self.p.i_valid
+        fetch_insn_o_valid = self.n.o_valid
+        fetch_insn_i_ready = self.n.i_ready
+
+        comb = m.d.comb
+        sync = m.d.sync
+        pdecode2 = self.pdecode2
+        cur_state = self.cur_state
+        dec_opcode_o = pdecode2.dec.raw_opcode_in  # raw opcode
+
+        # also note instruction fetch failed
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+            flush_needed = True
+        else:
+            fetch_failed = Const(0, 1)
+            flush_needed = False
+
+        # set priv / virt mode on I-Cache, sigh
+        if isinstance(self.imem, ICache):
+            comb += self.imem.i_in.priv_mode.eq(~msr[MSR.PR])
+            comb += self.imem.i_in.virt_mode.eq(msr[MSR.DR])
+
+        with m.FSM(name='fetch_fsm'):
+
+            # allow fetch to not run at startup due to I-Cache reset not
+            # having time to settle.  power-on-reset holds dbg.core_stopped_i
+            with m.State("PRE_IDLE"):
+                with m.If(~dbg.core_stopped_i & ~dbg.core_stop_o):
+                    m.next = "IDLE"
+
+            # waiting (zzz)
+            with m.State("IDLE"):
+                with m.If(~dbg.stopping_o & ~fetch_failed):
+                    comb += fetch_pc_o_ready.eq(1)
+                with m.If(fetch_pc_i_valid & ~fetch_failed):
+                    # instruction allowed to go: start by reading the PC
+                    # capture the PC and also drop it into Insn Memory
+                    # we have joined a pair of combinatorial memory
+                    # lookups together.  this is Generally Bad.
+                    comb += self.imem.a_pc_i.eq(pc)
+                    comb += self.imem.a_i_valid.eq(1)
+                    comb += self.imem.f_i_valid.eq(1)
+                    sync += cur_state.pc.eq(pc)
+                    sync += cur_state.svstate.eq(svstate)  # and svstate
+                    sync += cur_state.msr.eq(msr)  # and msr
+
+                    m.next = "INSN_READ"  # move to "wait for bus" phase
+
+            # dummy pause to find out why simulation is not keeping up
+            with m.State("INSN_READ"):
+                if self.allow_overlap:
+                    stopping = dbg.stopping_o
+                else:
+                    stopping = Const(0)
+                with m.If(stopping):
+                    # stopping: jump back to idle
+                    m.next = "IDLE"
+                with m.Else():
+                    with m.If(self.imem.f_busy_o & ~fetch_failed):  # zzz...
+                        # busy but not fetch failed: stay in wait-read
+                        comb += self.imem.a_i_valid.eq(1)
+                        comb += self.imem.f_i_valid.eq(1)
+                    with m.Else():
+                        # not busy (or fetch failed!): instruction fetched
+                        # when fetch failed, the instruction gets ignored
+                        # by the decoder
+                        insn = get_insn(self.imem.f_instr_o, cur_state.pc)
+                        # not SVP64 - 32-bit only
+                        sync += nia.eq(cur_state.pc + 4)
+                        sync += dec_opcode_o.eq(insn)
+                        m.next = "INSN_READY"
+
+            with m.State("INSN_READY"):
+                # hand over the instruction, to be decoded
+                comb += fetch_insn_o_valid.eq(1)
+                with m.If(fetch_insn_i_ready):
+                    m.next = "IDLE"
+
+        # whatever was done above, over-ride it if core reset is held
+        with m.If(self.core_rst):
+            sync += nia.eq(0)
+
+        return m
+
+
+class TestIssuerInternalInOrder(TestIssuerBase):
+    """TestIssuer - reads instructions from TestMemory and issues them
+
+    efficiency and speed is not the main goal here: functional correctness
+    and code clarity is.  optimisations (which almost 100% interfere with
+    easy understanding) come later.
+    """
+
+    def issue_fsm(self, m, core, nia,
+                  dbg, core_rst,
+                  fetch_pc_o_ready, fetch_pc_i_valid,
+                  fetch_insn_o_valid, fetch_insn_i_ready,
+                  exec_insn_i_valid, exec_insn_o_ready,
+                  exec_pc_o_valid, exec_pc_i_ready):
+        """issue FSM
+
+        decode / issue FSM.  this interacts with the "fetch" FSM
+        through fetch_insn_ready/valid (incoming) and fetch_pc_ready/valid
+        (outgoing). also interacts with the "execute" FSM
+        through exec_insn_ready/valid (outgoing) and exec_pc_ready/valid
+        (incoming).
+        SVP64 RM prefixes have already been set up by the
+        "fetch" phase, so execute is fairly straightforward.
+        """
+
+        comb = m.d.comb
+        sync = m.d.sync
+        pdecode2 = self.pdecode2
+        cur_state = self.cur_state
+
+        # temporaries
+        dec_opcode_i = pdecode2.dec.raw_opcode_in  # raw opcode
+
+        # note if an exception happened.  in a pipelined or OoO design
+        # this needs to be accompanied by "shadowing" (or stalling)
+        exc_happened = self.core.o.exc_happened
+        # also note instruction fetch failed
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+            flush_needed = True
+            # set to fault in decoder
+            # update (highest priority) instruction fault
+            rising_fetch_failed = rising_edge(m, fetch_failed)
+            with m.If(rising_fetch_failed):
+                sync += pdecode2.instr_fault.eq(1)
+        else:
+            fetch_failed = Const(0, 1)
+            flush_needed = False
+
+        with m.FSM(name="issue_fsm"):
+
+            # sync with the "fetch" phase which is reading the instruction
+            # at this point, there is no instruction running, that
+            # could inadvertently update the PC.
+            with m.State("ISSUE_START"):
+                # reset instruction fault
+                sync += pdecode2.instr_fault.eq(0)
+                # wait on "core stop" release, before next fetch
+                # need to do this here, in case we are in a VL==0 loop
+                with m.If(~dbg.core_stop_o & ~core_rst):
+                    comb += fetch_pc_i_valid.eq(1)  # tell fetch to start
+                    with m.If(fetch_pc_o_ready):   # fetch acknowledged us
+                        m.next = "INSN_WAIT"
+                with m.Else():
+                    # tell core it's stopped, and acknowledge debug handshake
+                    comb += dbg.core_stopped_i.eq(1)
+
+            # wait for an instruction to arrive from Fetch
+            with m.State("INSN_WAIT"):
+                if self.allow_overlap:
+                    stopping = dbg.stopping_o
+                else:
+                    stopping = Const(0)
+                with m.If(stopping):
+                    # stopping: jump back to idle
+                    m.next = "ISSUE_START"
+                    if flush_needed:
+                        # request the icache to stop asserting "failed"
+                        comb += core.icache.flush_in.eq(1)
+                    # stop instruction fault
+                    sync += pdecode2.instr_fault.eq(0)
+                with m.Else():
+                    comb += fetch_insn_i_ready.eq(1)
+                    with m.If(fetch_insn_o_valid):
+                        # loop into ISSUE_START if it's a SVP64 instruction
+                        # and VL == 0.  this because VL==0 is a for-loop
+                        # from 0 to 0 i.e. always, always a NOP.
+                        m.next = "DECODE_SV"  # skip predication
+
+            # after src/dst step have been updated, we are ready
+            # to decode the instruction
+            with m.State("DECODE_SV"):
+                # decode the instruction
+                with m.If(~fetch_failed):
+                    sync += pdecode2.instr_fault.eq(0)
+                sync += core.i.e.eq(pdecode2.e)
+                sync += core.i.state.eq(cur_state)
+                sync += core.i.raw_insn_i.eq(dec_opcode_i)
+                sync += core.i.bigendian_i.eq(self.core_bigendian_i)
+                # after decoding, reset any previous exception condition,
+                # allowing it to be set again during the next execution
+                sync += pdecode2.ldst_exc.eq(0)
+
+                m.next = "INSN_EXECUTE"  # move to "execute"
+
+            # handshake with execution FSM, move to "wait" once acknowledged
+            with m.State("INSN_EXECUTE"):
+                comb += exec_insn_i_valid.eq(1)  # trigger execute
+                with m.If(exec_insn_o_ready):   # execute acknowledged us
+                    m.next = "EXECUTE_WAIT"
+
+            with m.State("EXECUTE_WAIT"):
+                # wait on "core stop" release, at instruction end
+                # need to do this here, in case we are in a VL>1 loop
+                with m.If(~dbg.core_stop_o & ~core_rst):
+                    comb += exec_pc_i_ready.eq(1)
+                    # see https://bugs.libre-soc.org/show_bug.cgi?id=636
+                    # the exception info needs to be blatted into
+                    # pdecode.ldst_exc, and the instruction "re-run".
+                    # when ldst_exc.happened is set, the PowerDecoder2
+                    # reacts very differently: it re-writes the instruction
+                    # with a "trap" (calls PowerDecoder2.trap()) which
+                    # will *overwrite* whatever was requested and jump the
+                    # PC to the exception address, as well as alter MSR.
+                    # nothing else needs to be done other than to note
+                    # the change of PC and MSR (and, later, SVSTATE)
+                    with m.If(exc_happened):
+                        mmu = core.fus.get_exc("mmu0")
+                        ldst = core.fus.get_exc("ldst0")
+                        if mmu is not None:
+                            with m.If(fetch_failed):
+                                # instruction fetch: exception is from MMU
+                                # reset instr_fault (highest priority)
+                                sync += pdecode2.ldst_exc.eq(mmu)
+                                sync += pdecode2.instr_fault.eq(0)
+                                if flush_needed:
+                                    # request icache to stop asserting "failed"
+                                    comb += core.icache.flush_in.eq(1)
+                        with m.If(~fetch_failed):
+                            # otherwise assume it was a LDST exception
+                            sync += pdecode2.ldst_exc.eq(ldst)
+
+                    with m.If(exec_pc_o_valid):
+
+                        # return directly to Decode if Execute generated an
+                        # exception.
+                        with m.If(pdecode2.ldst_exc.happened):
+                            m.next = "DECODE_SV"
+
+                        # if MSR, PC or SVSTATE were changed by the previous
+                        # instruction, go directly back to Fetch, without
+                        # updating either MSR PC or SVSTATE
+                        with m.Elif(self.msr_changed | self.pc_changed |
+                                    self.sv_changed):
+                            m.next = "ISSUE_START"
+
+                        with m.Else():
+                            # before going back to fetch, update the PC state
+                            # register with the NIA.
+                            # ok here we are not reading the branch unit.
+                            # TODO: this just blithely overwrites whatever
+                            #       pipeline updated the PC
+                            comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+                            comb += self.state_w_pc.i_data.eq(nia)
+                            m.next = "ISSUE_START"
+
+                with m.Else():
+                    comb += dbg.core_stopped_i.eq(1)
+                    if flush_needed:
+                        # request the icache to stop asserting "failed"
+                        comb += core.icache.flush_in.eq(1)
+                    # stop instruction fault
+                    sync += pdecode2.instr_fault.eq(0)
+                    if flush_needed:
+                        # request the icache to stop asserting "failed"
+                        comb += core.icache.flush_in.eq(1)
+                    # stop instruction fault
+                    sync += pdecode2.instr_fault.eq(0)
+
+    def execute_fsm(self, m, core,
+                    exec_insn_i_valid, exec_insn_o_ready,
+                    exec_pc_o_valid, exec_pc_i_ready):
+        """execute FSM
+
+        execute FSM. this interacts with the "issue" FSM
+        through exec_insn_ready/valid (incoming) and exec_pc_ready/valid
+        (outgoing). SVP64 RM prefixes have already been set up by the
+        "issue" phase, so execute is fairly straightforward.
+        """
+
+        comb = m.d.comb
+        sync = m.d.sync
+        pdecode2 = self.pdecode2
+
+        # temporaries
+        core_busy_o = core.n.o_data.busy_o  # core is busy
+        core_ivalid_i = core.p.i_valid              # instruction is valid
+
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+        else:
+            fetch_failed = Const(0, 1)
+
+        with m.FSM(name="exec_fsm"):
+
+            # waiting for instruction bus (stays there until not busy)
+            with m.State("INSN_START"):
+                comb += exec_insn_o_ready.eq(1)
+                with m.If(exec_insn_i_valid):
+                    comb += core_ivalid_i.eq(1)  # instruction is valid/issued
+                    sync += self.sv_changed.eq(0)
+                    sync += self.pc_changed.eq(0)
+                    sync += self.msr_changed.eq(0)
+                    with m.If(core.p.o_ready):  # only move if accepted
+                        m.next = "INSN_ACTIVE"  # move to "wait completion"
+
+            # instruction started: must wait till it finishes
+            with m.State("INSN_ACTIVE"):
+                # note changes to MSR, PC and SVSTATE
+                # XXX oops, really must monitor *all* State Regfile write
+                # ports looking for changes!
+                with m.If(self.state_nia.wen & (1 << StateRegs.SVSTATE)):
+                    sync += self.sv_changed.eq(1)
+                with m.If(self.state_nia.wen & (1 << StateRegs.MSR)):
+                    sync += self.msr_changed.eq(1)
+                with m.If(self.state_nia.wen & (1 << StateRegs.PC)):
+                    sync += self.pc_changed.eq(1)
+                with m.If(~core_busy_o):  # instruction done!
+                    comb += exec_pc_o_valid.eq(1)
+                    with m.If(exec_pc_i_ready):
+                        # when finished, indicate "done".
+                        # however, if there was an exception, the instruction
+                        # is *not* yet done.  this is an implementation
+                        # detail: we choose to implement exceptions by
+                        # taking the exception information from the LDST
+                        # unit, putting that *back* into the PowerDecoder2,
+                        # and *re-running the entire instruction*.
+                        # if we erroneously indicate "done" here, it is as if
+                        # there were *TWO* instructions:
+                        # 1) the failed LDST 2) a TRAP.
+                        with m.If(~pdecode2.ldst_exc.happened &
+                                  ~fetch_failed):
+                            comb += self.insn_done.eq(1)
+                        m.next = "INSN_START"  # back to fetch
+
+    def elaborate(self, platform):
+        m = super().elaborate(platform)
+        # convenience
+        comb, sync = m.d.comb, m.d.sync
+        cur_state = self.cur_state
+        pdecode2 = self.pdecode2
+        dbg = self.dbg
+        core = self.core
+
+        # set up peripherals and core
+        core_rst = self.core_rst
+
+        # indicate to outside world if any FU is still executing
+        comb += self.any_busy.eq(core.n.o_data.any_busy_o)  # any FU executing
+
+        # address of the next instruction, in the absence of a branch
+        # depends on the instruction size
+        nia = Signal(64)
+
+        # connect up debug signals
+        with m.If(core.o.core_terminate_o):
+            comb += dbg.terminate_i.eq(1)
+
+        # there are *THREE^WFOUR-if-SVP64-enabled* FSMs, fetch (32/64-bit)
+        # issue, decode/execute, now joined by "Predicate fetch/calculate".
+        # these are the handshake signals between each
+
+        # fetch FSM can run as soon as the PC is valid
+        fetch_pc_i_valid = Signal()  # Execute tells Fetch "start next read"
+        fetch_pc_o_ready = Signal()  # Fetch Tells SVSTATE "proceed"
+
+        # fetch FSM hands over the instruction to be decoded / issued
+        fetch_insn_o_valid = Signal()
+        fetch_insn_i_ready = Signal()
+
+        # issue FSM delivers the instruction to the be executed
+        exec_insn_i_valid = Signal()
+        exec_insn_o_ready = Signal()
+
+        # execute FSM, hands over the PC/SVSTATE back to the issue FSM
+        exec_pc_o_valid = Signal()
+        exec_pc_i_ready = Signal()
+
+        # the FSMs here are perhaps unusual in that they detect conditions
+        # then "hold" information, combinatorially, for the core
+        # (as opposed to using sync - which would be on a clock's delay)
+        # this includes the actual opcode, valid flags and so on.
+
+        # Fetch, then predicate fetch, then Issue, then Execute.
+        # Issue is where the VL for-loop # lives.  the ready/valid
+        # signalling is used to communicate between the four.
+
+        # set up Fetch FSM
+        fetch = FetchFSM(self.allow_overlap,
+                         self.imem, core_rst, pdecode2, cur_state,
+                         dbg, core,
+                         dbg.state.svstate, # combinatorially same
+                         nia)
+        m.submodules.fetch = fetch
+        # connect up in/out data to existing Signals
+        comb += fetch.p.i_data.pc.eq(dbg.state.pc)   # combinatorially same
+        comb += fetch.p.i_data.msr.eq(dbg.state.msr) # combinatorially same
+        # and the ready/valid signalling
+        comb += fetch_pc_o_ready.eq(fetch.p.o_ready)
+        comb += fetch.p.i_valid.eq(fetch_pc_i_valid)
+        comb += fetch_insn_o_valid.eq(fetch.n.o_valid)
+        comb += fetch.n.i_ready.eq(fetch_insn_i_ready)
+
+        self.issue_fsm(m, core, nia,
+                       dbg, core_rst,
+                       fetch_pc_o_ready, fetch_pc_i_valid,
+                       fetch_insn_o_valid, fetch_insn_i_ready,
+                       exec_insn_i_valid, exec_insn_o_ready,
+                       exec_pc_o_valid, exec_pc_i_ready)
+
+        self.execute_fsm(m, core,
+                         exec_insn_i_valid, exec_insn_o_ready,
+                         exec_pc_o_valid, exec_pc_i_ready)
+
+        return m
+
+
+# XXX TODO: update this
+
+if __name__ == '__main__':
+    units = {'alu': 1, 'cr': 1, 'branch': 1, 'trap': 1, 'logical': 1,
+             'spr': 1,
+             'div': 1,
+             'mul': 1,
+             'shiftrot': 1
+             }
+    pspec = TestMemPspec(ldst_ifacetype='bare_wb',
+                         imem_ifacetype='bare_wb',
+                         addr_wid=64,
+                         mask_wid=8,
+                         reg_wid=64,
+                         units=units)
+    dut = TestIssuer(pspec)
+    vl = main(dut, ports=dut.ports(), name="test_issuer")
+
+    if len(sys.argv) == 1:
+        vl = rtlil.convert(dut, ports=dut.external_ports(), name="test_issuer")
+        with open("test_issuer.il", "w") as f:
+            f.write(vl)
index f28b45817a2133706b438ee1dda40e6ee06dc6bc..15bd1760a5ab93f233d8cb7cdff813d7b0833096 100644 (file)
@@ -21,6 +21,9 @@ from nmigen.cli import rtlil
 from nmigen.cli import main
 import sys
 
+from nmutil.singlepipe import ControlBase
+from soc.simple.core_data import FetchOutput, FetchInput
+
 from nmigen.lib.coding import PriorityEncoder
 
 from openpower.decoder.power_decoder import create_pdecode
@@ -28,10 +31,10 @@ from openpower.decoder.power_decoder2 import PowerDecode2, SVP64PrefixDecoder
 from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
 from openpower.decoder.decode2execute1 import Data
 from openpower.decoder.power_enums import (MicrOp, SVP64PredInt, SVP64PredCR,
-                                     SVP64PredMode)
+                                           SVP64PredMode)
 from openpower.state import CoreState
-from openpower.consts import (CR, SVP64CROffs)
-from soc.experiment.testmem import TestMemory # test only for instructions
+from openpower.consts import (CR, SVP64CROffs, MSR)
+from soc.experiment.testmem import TestMemory  # test only for instructions
 from soc.regfile.regfiles import StateRegs, FastRegs
 from soc.simple.core import NonProductionCore
 from soc.config.test.test_loadstore import TestMemPspec
@@ -45,10 +48,11 @@ from soc.bus.SPBlock512W64B8W import SPBlock512W64B8W
 from soc.clock.select import ClockSelect
 from soc.clock.dummypll import DummyPLL
 from openpower.sv.svstate import SVSTATERec
-
+from soc.experiment.icache import ICache
 
 from nmutil.util import rising_edge
 
+
 def get_insn(f_instr_o, pc):
     if f_instr_o.width == 32:
         return f_instr_o
@@ -57,11 +61,12 @@ def get_insn(f_instr_o, pc):
         return f_instr_o.word_select(pc[2], 32)
 
 # gets state input or reads from state regfile
-def state_get(m, core_rst, state_i, name, regfile, regnum):
+
+
+def state_get(m, res, core_rst, state_i, name, regfile, regnum):
     comb = m.d.comb
     sync = m.d.sync
-    # read the PC
-    res = Signal(64, reset_less=True, name=name)
+    # read the {insert state variable here}
     res_ok_delay = Signal(name="%s_ok_delay" % name)
     with m.If(~core_rst):
         sync += res_ok_delay.eq(~state_i.ok)
@@ -69,12 +74,12 @@ def state_get(m, core_rst, state_i, name, regfile, regnum):
             # incoming override (start from pc_i)
             comb += res.eq(state_i.data)
         with m.Else():
-            # otherwise read StateRegs regfile for PC...
-            comb += regfile.ren.eq(1<<regnum)
+            # otherwise read StateRegs regfile for {insert state here}...
+            comb += regfile.ren.eq(1 << regnum)
         # ... but on a 1-clock delay
         with m.If(res_ok_delay):
-            comb += res.eq(regfile.data_o)
-    return res
+            comb += res.eq(regfile.o_data)
+
 
 def get_predint(m, mask, name):
     """decode SVP64 predicate integer mask field to reg number and invert
@@ -115,6 +120,7 @@ def get_predint(m, mask, name):
             comb += invert.eq(1)
     return regread, invert, unary, all1s
 
+
 def get_predcr(m, mask, name):
     """decode SVP64 predicate CR to reg number field and invert status
     this is identical to _get_predcr in ISACaller
@@ -150,37 +156,67 @@ def get_predcr(m, mask, name):
     return idx, invert
 
 
-class TestIssuerInternal(Elaboratable):
-    """TestIssuer - reads instructions from TestMemory and issues them
+class TestIssuerBase(Elaboratable):
+    """TestIssuerBase - common base class for Issuers
 
-    efficiency and speed is not the main goal here: functional correctness
-    and code clarity is.  optimisations (which almost 100% interfere with
-    easy understanding) come later.
+    takes care of power-on reset, peripherals, debug, DEC/TB,
+    and gets PC/MSR/SVSTATE from the State Regfile etc.
     """
+
     def __init__(self, pspec):
 
+        # test if microwatt compatibility is to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+        self.alt_reset = Signal(reset_less=True) # not connected yet (microwatt)
+        # test if fabric compatibility is to be enabled
+        self.fabric_compat = (hasattr(pspec, "fabric_compat") and
+                                 (pspec.fabric_compat == True))
+
+        if self.microwatt_compat or self.fabric_compat:
+
+            if hasattr(pspec, "microwatt_old"):
+                self.microwatt_old = pspec.microwatt_old
+            else:
+                self.microwatt_old = True # PLEASE DO NOT ALTER THIS
+
+            if hasattr(pspec, "microwatt_debug"):
+                self.microwatt_debug = pspec.microwatt_debug
+            else:
+                self.microwatt_debug = True # set to False when using an FPGA
+
         # test is SVP64 is to be enabled
         self.svp64_en = hasattr(pspec, "svp64") and (pspec.svp64 == True)
 
         # and if regfiles are reduced
         self.regreduce_en = (hasattr(pspec, "regreduce") and
-                                            (pspec.regreduce == True))
+                             (pspec.regreduce == True))
+
+        # and if overlap requested
+        self.allow_overlap = (hasattr(pspec, "allow_overlap") and
+                              (pspec.allow_overlap == True))
+
+        # and get the core domain
+        self.core_domain = "coresync"
+        if (hasattr(pspec, "core_domain") and
+            isinstance(pspec.core_domain, str)):
+            self.core_domain = pspec.core_domain
 
         # JTAG interface.  add this right at the start because if it's
         # added it *modifies* the pspec, by adding enable/disable signals
         # for parts of the rest of the core
         self.jtag_en = hasattr(pspec, "debug") and pspec.debug == 'jtag'
-        self.dbg_domain = "sync" # sigh "dbgsunc" too problematic
-        #self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock
+        #self.dbg_domain = "sync"  # sigh "dbgsunc" too problematic
+        self.dbg_domain = "dbgsync" # domain for DMI/JTAG clock
         if self.jtag_en:
-            # XXX MUST keep this up-to-date with litex, and
+            # XXX MUST keep this up-to-date with fabric, and
             # soc-cocotb-sim, and err.. all needs sorting out, argh
             subset = ['uart',
                       'mtwi',
                       'eint', 'gpio', 'mspi0',
                       # 'mspi1', - disabled for now
                       # 'pwm', 'sd0', - disabled for now
-                       'sdr']
+                      'sdr']
             self.jtag = JTAG(get_pinspecs(subset=subset),
                              domain=self.dbg_domain)
             # add signals to pspec to enable/disable icache and dcache
@@ -201,7 +237,7 @@ class TestIssuerInternal(Elaboratable):
             self.sram4k = []
             for i in range(4):
                 self.sram4k.append(SPBlock512W64B8W(name="sram4k_%d" % i,
-                                                    #features={'err'}
+                                                    # features={'err'}
                                                     ))
 
         # add interrupt controller?
@@ -210,6 +246,8 @@ class TestIssuerInternal(Elaboratable):
             self.xics_icp = XICS_ICP()
             self.xics_ics = XICS_ICS()
             self.int_level_i = self.xics_ics.int_level_i
+        else:
+            self.ext_irq = Signal()
 
         # add GPIO peripheral?
         self.gpio = hasattr(pspec, "gpio") and pspec.gpio == True
@@ -219,68 +257,584 @@ class TestIssuerInternal(Elaboratable):
 
         # main instruction core.  suitable for prototyping / demo only
         self.core = core = NonProductionCore(pspec)
-        self.core_rst = ResetSignal("coresync")
+        self.core_rst = ResetSignal(self.core_domain)
 
         # instruction decoder.  goes into Trap Record
-        pdecode = create_pdecode()
-        self.cur_state = CoreState("cur") # current state (MSR/PC/SVSTATE)
-        self.pdecode2 = PowerDecode2(pdecode, state=self.cur_state,
+        #pdecode = create_pdecode()
+        self.cur_state = CoreState("cur")  # current state (MSR/PC/SVSTATE)
+        self.pdecode2 = PowerDecode2(None, state=self.cur_state,
                                      opkls=IssuerDecode2ToOperand,
                                      svp64_en=self.svp64_en,
                                      regreduce_en=self.regreduce_en)
+        pdecode = self.pdecode2.dec
+
         if self.svp64_en:
-            self.svp64 = SVP64PrefixDecoder() # for decoding SVP64 prefix
+            self.svp64 = SVP64PrefixDecoder()  # for decoding SVP64 prefix
+
+        self.update_svstate = Signal()  # set this if updating svstate
+        self.new_svstate = new_svstate = SVSTATERec("new_svstate")
 
         # Test Instruction memory
+        if hasattr(core, "icache"):
+            # XXX BLECH! use pspec to transfer the I-Cache to ConfigFetchUnit
+            # truly dreadful.  needs a huge reorg.
+            pspec.icache = core.icache
         self.imem = ConfigFetchUnit(pspec).fu
 
         # DMI interface
         self.dbg = CoreDebug()
+        self.dbg_rst_i = Signal(reset_less=True)
 
         # instruction go/monitor
         self.pc_o = Signal(64, reset_less=True)
-        self.pc_i = Data(64, "pc_i") # set "ok" to indicate "please change me"
-        self.svstate_i = Data(32, "svstate_i") # ditto
-        self.core_bigendian_i = Signal() # TODO: set based on MSR.LE
+        self.pc_i = Data(64, "pc_i")  # set "ok" to indicate "please change me"
+        self.msr_i = Data(64, "msr_i") # set "ok" to indicate "please change me"
+        self.svstate_i = Data(64, "svstate_i")  # ditto
+        self.core_bigendian_i = Signal()  # TODO: set based on MSR.LE
         self.busy_o = Signal(reset_less=True)
         self.memerr_o = Signal(reset_less=True)
 
         # STATE regfile read /write ports for PC, MSR, SVSTATE
         staterf = self.core.regs.rf['state']
-        self.state_r_pc = staterf.r_ports['cia'] # PC rd
-        self.state_w_pc = staterf.w_ports['d_wr1'] # PC wr
-        self.state_r_msr = staterf.r_ports['msr'] # MSR rd
-        self.state_r_sv = staterf.r_ports['sv'] # SVSTATE rd
-        self.state_w_sv = staterf.w_ports['sv'] # SVSTATE wr
+        self.state_r_msr = staterf.r_ports['msr']  # MSR rd
+        self.state_r_pc = staterf.r_ports['cia']  # PC rd
+        self.state_r_sv = staterf.r_ports['sv']  # SVSTATE rd
+
+        self.state_w_msr = staterf.w_ports['d_wr2']  # MSR wr
+        self.state_w_pc = staterf.w_ports['d_wr1']  # PC wr
+        self.state_w_sv = staterf.w_ports['sv']  # SVSTATE wr
 
         # DMI interface access
         intrf = self.core.regs.rf['int']
+        fastrf = self.core.regs.rf['fast']
         crrf = self.core.regs.rf['cr']
         xerrf = self.core.regs.rf['xer']
-        self.int_r = intrf.r_ports['dmi'] # INT read
-        self.cr_r = crrf.r_ports['full_cr_dbg'] # CR read
-        self.xer_r = xerrf.r_ports['full_xer'] # XER read
+        self.int_r = intrf.r_ports['dmi']  # INT DMI read
+        self.cr_r = crrf.r_ports['full_cr_dbg']  # CR DMI read
+        self.xer_r = xerrf.r_ports['full_xer']  # XER DMI read
+        self.fast_r = fastrf.r_ports['dmi']  # FAST DMI read
 
         if self.svp64_en:
             # for predication
-            self.int_pred = intrf.r_ports['pred'] # INT predicate read
-            self.cr_pred = crrf.r_ports['cr_pred'] # CR predicate read
+            self.int_pred = intrf.r_ports['pred']  # INT predicate read
+            self.cr_pred = crrf.r_ports['cr_pred']  # CR predicate read
 
         # hack method of keeping an eye on whether branch/trap set the PC
         self.state_nia = self.core.regs.rf['state'].w_ports['nia']
         self.state_nia.wen.name = 'state_nia_wen'
+        # and whether SPR pipeline sets DEC or TB (fu/spr/main_stage.py)
+        self.state_spr = self.core.regs.rf['state'].w_ports['state1']
 
         # pulse to synchronize the simulator at instruction end
         self.insn_done = Signal()
 
+        # indicate any instruction still outstanding, in execution
+        self.any_busy = Signal()
+
         if self.svp64_en:
             # store copies of predicate masks
             self.srcmask = Signal(64)
             self.dstmask = Signal(64)
 
-    def fetch_fsm(self, m, core, pc, svstate, nia, is_svp64_mode,
-                        fetch_pc_ready_o, fetch_pc_valid_i,
-                        fetch_insn_valid_o, fetch_insn_ready_i):
+        # sigh, the wishbone addresses are not wishbone-compliant
+        # in old versions of microwatt, tplaten_3d_game is a new one
+        if self.microwatt_compat or self.fabric_compat:
+            self.ibus_adr = Signal(32, name='wishbone_insn_out.adr')
+            self.dbus_adr = Signal(32, name='wishbone_data_out.adr')
+
+        # add an output of the PC and instruction, and whether it was requested
+        # this is for verilator debug purposes
+        if self.microwatt_compat or self.fabric_compat:
+            self.nia = Signal(64)
+            self.msr_o = Signal(64)
+            self.nia_req = Signal(1)
+            self.insn = Signal(32)
+            self.ldst_req = Signal(1)
+            self.ldst_addr = Signal(1)
+
+        # for pausing dec/tb during an SPR pipeline event, this
+        # ensures that an SPR write (mtspr) to TB or DEC does not
+        # get overwritten by the DEC/TB FSM
+        self.pause_dec_tb = Signal()
+
+    def setup_peripherals(self, m):
+        comb, sync = m.d.comb, m.d.sync
+
+        # okaaaay so the debug module must be in coresync clock domain
+        # but NOT its reset signal. to cope with this, set every single
+        # submodule explicitly in coresync domain, debug and JTAG
+        # in their own one but using *external* reset.
+        csd = DomainRenamer(self.core_domain)
+        dbd = DomainRenamer(self.dbg_domain)
+
+        if self.microwatt_compat or self.fabric_compat:
+            m.submodules.core = core = self.core
+        else:
+            m.submodules.core = core = csd(self.core)
+
+        # this _so_ needs sorting out.  ICache is added down inside
+        # LoadStore1 and is already a submodule of LoadStore1
+        if not isinstance(self.imem, ICache):
+            m.submodules.imem = imem = csd(self.imem)
+
+        # set up JTAG Debug Module (in correct domain)
+        m.submodules.dbg = dbg = dbd(self.dbg)
+        if self.jtag_en:
+            m.submodules.jtag = jtag = dbd(self.jtag)
+            # TODO: UART2GDB mux, here, from external pin
+            # see https://bugs.libre-soc.org/show_bug.cgi?id=499
+            sync += dbg.dmi.connect_to(jtag.dmi)
+
+        # fixup the clocks in microwatt-compat mode (but leave resets alone
+        # so that microwatt soc.vhdl can pull a reset on the core or DMI
+        # can do it, just like in TestIssuer)
+        if self.microwatt_compat or self.fabric_compat:
+            intclk = ClockSignal(self.core_domain)
+            dbgclk = ClockSignal(self.dbg_domain)
+            if self.core_domain != 'sync':
+                comb += intclk.eq(ClockSignal())
+            if self.dbg_domain != 'sync':
+                comb += dbgclk.eq(ClockSignal())
+
+        # if using old version of microwatt
+        # drop the first 3 bits of the incoming wishbone addresses
+        if self.microwatt_compat or self.fabric_compat:
+            ibus = self.imem.ibus
+            dbus = self.core.l0.cmpi.wb_bus()
+            if self.microwatt_old:
+                comb += self.ibus_adr.eq(Cat(Const(0, 3), ibus.adr))
+                comb += self.dbus_adr.eq(Cat(Const(0, 3), dbus.adr))
+            else:
+                comb += self.ibus_adr.eq(ibus.adr)
+                comb += self.dbus_adr.eq(dbus.adr)
+            if self.microwatt_debug:
+                # microwatt verilator debug purposes
+                pi = self.core.l0.cmpi.pi.pi
+                comb += self.ldst_req.eq(pi.addr_ok_o)
+                comb += self.ldst_addr.eq(pi.addr)
+
+        cur_state = self.cur_state
+
+        # 4x 4k SRAM blocks.  these simply "exist", they get routed in fabric
+        if self.sram4x4k:
+            for i, sram in enumerate(self.sram4k):
+                m.submodules["sram4k_%d" % i] = csd(sram)
+                comb += sram.enable.eq(self.wb_sram_en)
+
+        # XICS interrupt handler
+        if self.xics:
+            m.submodules.xics_icp = icp = csd(self.xics_icp)
+            m.submodules.xics_ics = ics = csd(self.xics_ics)
+            comb += icp.ics_i.eq(ics.icp_o)           # connect ICS to ICP
+            sync += cur_state.eint.eq(icp.core_irq_o)  # connect ICP to core
+        else:
+            sync += cur_state.eint.eq(self.ext_irq)  # connect externally
+
+        # GPIO test peripheral
+        if self.gpio:
+            m.submodules.simple_gpio = simple_gpio = csd(self.simple_gpio)
+
+        # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl)
+        # XXX causes fabric ECP5 test to get wrong idea about input and output
+        # (but works with verilator sim *sigh*)
+        # if self.gpio and self.xics:
+        #   comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0])
+
+        # instruction decoder
+        pdecode = create_pdecode()
+        m.submodules.dec2 = pdecode2 = csd(self.pdecode2)
+        if self.svp64_en:
+            m.submodules.svp64 = svp64 = csd(self.svp64)
+
+        # clock delay power-on reset
+        cd_por = ClockDomain(reset_less=True)
+        cd_sync = ClockDomain()
+        m.domains += cd_por, cd_sync
+        core_sync = ClockDomain(self.core_domain)
+        if self.core_domain != "sync":
+            m.domains += core_sync
+        if self.dbg_domain != "sync":
+            dbg_sync = ClockDomain(self.dbg_domain)
+            m.domains += dbg_sync
+
+        # create a delay, but remember it is in the power-on-reset clock domain!
+        ti_rst = Signal(reset_less=True)
+        delay = Signal(range(4), reset=3)
+        stop_delay = Signal(range(16), reset=5)
+        with m.If(delay != 0):
+            m.d.por += delay.eq(delay - 1) # decrement... in POR domain!
+        with m.If(stop_delay != 0):
+            m.d.por += stop_delay.eq(stop_delay - 1) # likewise
+        comb += cd_por.clk.eq(ClockSignal())
+
+        # power-on reset delay
+        core_rst = ResetSignal(self.core_domain)
+        if self.core_domain != "sync":
+            comb += ti_rst.eq(delay != 0 | dbg.core_rst_o | ResetSignal())
+            comb += core_rst.eq(ti_rst)
+        else:
+            with m.If(delay != 0 | dbg.core_rst_o):
+                comb += core_rst.eq(1)
+        with m.If(stop_delay != 0):
+            # run DMI core-stop as well but on an extra couple of cycles
+            comb += dbg.core_stopped_i.eq(1)
+
+        # connect external reset signal to DMI Reset
+        if self.dbg_domain != "sync":
+            dbg_rst = ResetSignal(self.dbg_domain)
+            comb += dbg_rst.eq(self.dbg_rst_i)
+
+        # busy/halted signals from core
+        core_busy_o = ~core.p.o_ready | core.n.o_data.busy_o  # core is busy
+        comb += self.busy_o.eq(core_busy_o)
+        comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i)
+
+        # temporary hack: says "go" immediately for both address gen and ST
+        # XXX: st.go_i is set to 1 cycle delay to reduce combinatorial chains
+        l0 = core.l0
+        ldst = core.fus.fus['ldst0']
+        st_go_edge = rising_edge(m, ldst.st.rel_o)
+        # link addr-go direct to rel
+        m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o)
+        m.d.sync += ldst.st.go_i.eq(st_go_edge)  # link store-go to rising rel
+
+    def do_dmi(self, m, dbg):
+        """deals with DMI debug requests
+
+        currently only provides read requests for the INT regfile, CR and XER
+        it will later also deal with *writing* to these regfiles.
+        """
+        comb = m.d.comb
+        sync = m.d.sync
+        dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
+        d_fast = dbg.d_fast
+        intrf = self.core.regs.rf['int']
+        fastrf = self.core.regs.rf['fast']
+
+        with m.If(d_reg.req):  # request for regfile access being made
+            # TODO: error-check this
+            # XXX should this be combinatorial?  sync better?
+            if intrf.unary:
+                comb += self.int_r.ren.eq(1 << d_reg.addr)
+            else:
+                comb += self.int_r.addr.eq(d_reg.addr)
+                comb += self.int_r.ren.eq(1)
+        d_reg_delay = Signal()
+        sync += d_reg_delay.eq(d_reg.req)
+        with m.If(d_reg_delay):
+            # data arrives one clock later
+            comb += d_reg.data.eq(self.int_r.o_data)
+            comb += d_reg.ack.eq(1)
+
+        # fast regfile
+        with m.If(d_fast.req):  # request for regfile access being made
+            if fastrf.unary:
+                comb += self.fast_r.ren.eq(1 << d_fast.addr)
+            else:
+                comb += self.fast_r.addr.eq(d_fast.addr)
+                comb += self.fast_r.ren.eq(1)
+        d_fast_delay = Signal()
+        sync += d_fast_delay.eq(d_fast.req)
+        with m.If(d_fast_delay):
+            # data arrives one clock later
+            comb += d_fast.data.eq(self.fast_r.o_data)
+            comb += d_fast.ack.eq(1)
+
+        # sigh same thing for CR debug
+        with m.If(d_cr.req):  # request for regfile access being made
+            comb += self.cr_r.ren.eq(0b11111111)  # enable all
+        d_cr_delay = Signal()
+        sync += d_cr_delay.eq(d_cr.req)
+        with m.If(d_cr_delay):
+            # data arrives one clock later
+            comb += d_cr.data.eq(self.cr_r.o_data)
+            comb += d_cr.ack.eq(1)
+
+        # aaand XER...
+        with m.If(d_xer.req):  # request for regfile access being made
+            comb += self.xer_r.ren.eq(0b111111)  # enable all
+        d_xer_delay = Signal()
+        sync += d_xer_delay.eq(d_xer.req)
+        with m.If(d_xer_delay):
+            # data arrives one clock later
+            comb += d_xer.data.eq(self.xer_r.o_data)
+            comb += d_xer.ack.eq(1)
+
+    def tb_dec_fsm(self, m, spr_dec):
+        """tb_dec_fsm
+
+        this is a FSM for updating either dec or tb.  it runs alternately
+        DEC, TB, DEC, TB.  note that SPR pipeline could have written a new
+        value to DEC, however the regfile has "passthrough" on it so this
+        *should* be ok.
+
+        see v3.0B p1097-1099 for Timer Resource and p1065 and p1076
+        """
+
+        comb, sync = m.d.comb, m.d.sync
+        state_rf = self.core.regs.rf['state']
+        state_r_dectb = state_rf.r_ports['issue']  # DEC/TB
+        state_w_dectb = state_rf.w_ports['issue']  # DEC/TB
+
+
+        with m.FSM() as fsm:
+
+            # initiates read of current DEC
+            with m.State("DEC_READ"):
+                comb += state_r_dectb.ren.eq(1<<StateRegs.DEC)
+                with m.If(~self.pause_dec_tb):
+                    m.next = "DEC_WRITE"
+
+            # waits for DEC read to arrive (1 cycle), updates with new value
+            # respects if dec/tb writing has been paused
+            with m.State("DEC_WRITE"):
+                with m.If(self.pause_dec_tb):
+                    # if paused, return to reading
+                    m.next = "DEC_READ"
+                with m.Else():
+                    new_dec = Signal(64)
+                    # TODO: MSR.LPCR 32-bit decrement mode
+                    comb += new_dec.eq(state_r_dectb.o_data - 1)
+                    comb += state_w_dectb.wen.eq(1<<StateRegs.DEC)
+                    comb += state_w_dectb.i_data.eq(new_dec)
+                    # copy to cur_state for decoder, for an interrupt
+                    sync += spr_dec.eq(new_dec)
+                    m.next = "TB_READ"
+
+            # initiates read of current TB
+            with m.State("TB_READ"):
+                comb += state_r_dectb.ren.eq(1<<StateRegs.TB)
+                with m.If(~self.pause_dec_tb):
+                    m.next = "TB_WRITE"
+
+            # waits for read TB to arrive, initiates write of current TB
+            # respects if dec/tb writing has been paused
+            with m.State("TB_WRITE"):
+                with m.If(self.pause_dec_tb):
+                    # if paused, return to reading
+                    m.next = "TB_READ"
+                with m.Else():
+                    new_tb = Signal(64)
+                    comb += new_tb.eq(state_r_dectb.o_data + 1)
+                    comb += state_w_dectb.wen.eq(1<<StateRegs.TB)
+                    comb += state_w_dectb.i_data.eq(new_tb)
+                    m.next = "DEC_READ"
+
+        return m
+
+    def elaborate(self, platform):
+        m = Module()
+        # convenience
+        comb, sync = m.d.comb, m.d.sync
+        cur_state = self.cur_state
+        pdecode2 = self.pdecode2
+        dbg = self.dbg
+
+        # set up peripherals and core
+        core_rst = self.core_rst
+        self.setup_peripherals(m)
+
+        # reset current state if core reset requested
+        with m.If(core_rst):
+            m.d.sync += self.cur_state.eq(0)
+            # and, sigh, set configured values, which are also done in regfile
+            # XXX ??? what the hell is the shift for??
+            m.d.sync += self.cur_state.pc.eq(self.core.pc_at_reset)
+            m.d.sync += self.cur_state.msr.eq(self.core.msr_at_reset)
+
+        # check halted condition: requested PC to execute matches DMI stop addr
+        # and immediately stop. address of 0xffff_ffff_ffff_ffff can never
+        # match
+        halted = Signal()
+        comb += halted.eq(dbg.stop_addr_o == dbg.state.pc)
+        with m.If(halted):
+            comb += dbg.core_stopped_i.eq(1)
+            comb += dbg.terminate_i.eq(1)
+
+        # PC and instruction from I-Memory
+        comb += self.pc_o.eq(cur_state.pc)
+        self.pc_changed = Signal()  # note write to PC
+        self.msr_changed = Signal()  # note write to MSR
+        self.sv_changed = Signal()  # note write to SVSTATE
+
+        # read state either from incoming override or from regfile
+        state = CoreState("get")  # current state (MSR/PC/SVSTATE)
+        state_get(m, state.msr, core_rst, self.msr_i,
+                       "msr",                  # read MSR
+                       self.state_r_msr, StateRegs.MSR)
+        state_get(m, state.pc, core_rst, self.pc_i,
+                       "pc",                  # read PC
+                       self.state_r_pc, StateRegs.PC)
+        state_get(m, state.svstate, core_rst, self.svstate_i,
+                            "svstate",   # read SVSTATE
+                            self.state_r_sv, StateRegs.SVSTATE)
+
+        # don't write pc every cycle
+        comb += self.state_w_pc.wen.eq(0)
+        comb += self.state_w_pc.i_data.eq(0)
+
+        # connect up debug state.  note "combinatorially same" below,
+        # this is a bit naff, passing state over in the dbg class, but
+        # because it is combinatorial it achieves the desired goal
+        comb += dbg.state.eq(state)
+
+        # this bit doesn't have to be in the FSM: connect up to read
+        # regfiles on demand from DMI
+        self.do_dmi(m, dbg)
+
+        # DEC and TB inc/dec FSM.  copy of DEC is put into CoreState,
+        # (which uses that in PowerDecoder2 to raise 0x900 exception)
+        self.tb_dec_fsm(m, cur_state.dec)
+
+        # while stopped, allow updating the MSR, PC and SVSTATE.
+        # these are mainly for debugging purposes (including DMI/JTAG)
+        with m.If(dbg.core_stopped_i):
+            with m.If(self.pc_i.ok):
+                comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+                comb += self.state_w_pc.i_data.eq(self.pc_i.data)
+                sync += self.pc_changed.eq(1)
+            with m.If(self.msr_i.ok):
+                comb += self.state_w_msr.wen.eq(1 << StateRegs.MSR)
+                comb += self.state_w_msr.i_data.eq(self.msr_i.data)
+                sync += self.msr_changed.eq(1)
+            with m.If(self.svstate_i.ok | self.update_svstate):
+                with m.If(self.svstate_i.ok): # over-ride from external source
+                    comb += self.new_svstate.eq(self.svstate_i.data)
+                comb += self.state_w_sv.wen.eq(1 << StateRegs.SVSTATE)
+                comb += self.state_w_sv.i_data.eq(self.new_svstate)
+                sync += self.sv_changed.eq(1)
+
+        # start renaming some of the ports to match microwatt
+        if self.microwatt_compat or self.fabric_compat:
+            self.core.o.core_terminate_o.name = "terminated_out"
+            # names of DMI interface
+            self.dbg.dmi.addr_i.name = 'dmi_addr'
+            self.dbg.dmi.din.name    = 'dmi_din'
+            self.dbg.dmi.dout.name   = 'dmi_dout'
+            self.dbg.dmi.req_i.name  = 'dmi_req'
+            self.dbg.dmi.we_i.name   = 'dmi_wr'
+            self.dbg.dmi.ack_o.name  = 'dmi_ack'
+            # wishbone instruction bus
+            ibus = self.imem.ibus
+            if self.microwatt_compat:
+                ibus.adr.name = 'wishbone_insn_out.adr'
+                ibus.dat_w.name = 'wishbone_insn_out.dat'
+                ibus.sel.name = 'wishbone_insn_out.sel'
+                ibus.cyc.name = 'wishbone_insn_out.cyc'
+                ibus.stb.name = 'wishbone_insn_out.stb'
+                ibus.we.name = 'wishbone_insn_out.we'
+                ibus.dat_r.name = 'wishbone_insn_in.dat'
+                ibus.ack.name = 'wishbone_insn_in.ack'
+                ibus.stall.name = 'wishbone_insn_in.stall'
+            # wishbone data bus
+            dbus = self.core.l0.cmpi.wb_bus()
+            if self.microwatt_compat:
+                dbus.adr.name = 'wishbone_data_out.adr'
+                dbus.dat_w.name = 'wishbone_data_out.dat'
+                dbus.sel.name = 'wishbone_data_out.sel'
+                dbus.cyc.name = 'wishbone_data_out.cyc'
+                dbus.stb.name = 'wishbone_data_out.stb'
+                dbus.we.name = 'wishbone_data_out.we'
+                dbus.dat_r.name = 'wishbone_data_in.dat'
+                dbus.ack.name = 'wishbone_data_in.ack'
+                dbus.stall.name = 'wishbone_data_in.stall'
+
+        return m
+
+    def __iter__(self):
+        yield from self.pc_i.ports()
+        yield from self.msr_i.ports()
+        yield self.pc_o
+        yield self.memerr_o
+        yield from self.core.ports()
+        yield from self.imem.ports()
+        yield self.core_bigendian_i
+        yield self.busy_o
+
+    def ports(self):
+        return list(self)
+
+    def external_ports(self):
+        if self.microwatt_compat or self.fabric_compat:
+            if self.fabric_compat:
+                ports = [self.core.o.core_terminate_o,
+                         self.alt_reset, # not connected yet
+                         self.nia, self.insn, self.nia_req, self.msr_o,
+                         self.ldst_req, self.ldst_addr,
+                         ClockSignal(),
+                         ResetSignal(),
+                        ]
+            else:
+                ports = [self.core.o.core_terminate_o,
+                         self.ext_irq,
+                         self.alt_reset, # not connected yet
+                         self.nia, self.insn, self.nia_req, self.msr_o,
+                         self.ldst_req, self.ldst_addr,
+                         ClockSignal(),
+                         ResetSignal(),
+                        ]
+            ports += list(self.dbg.dmi.ports())
+            # for dbus/ibus microwatt, exclude err btw and cti
+            for name, sig in self.imem.ibus.fields.items():
+                if name not in ['err', 'bte', 'cti', 'adr']:
+                    ports.append(sig)
+            for name, sig in self.core.l0.cmpi.wb_bus().fields.items():
+                if name not in ['err', 'bte', 'cti', 'adr']:
+                    ports.append(sig)
+            # microwatt non-compliant with wishbone
+            ports.append(self.ibus_adr)
+            ports.append(self.dbus_adr)
+
+            if self.microwatt_compat:
+                # Ignore the remaining ports in microwatt compat mode
+                return ports
+
+        ports = self.pc_i.ports()
+        ports = self.msr_i.ports()
+        ports += [self.pc_o, self.memerr_o, self.core_bigendian_i, self.busy_o,
+                  ]
+
+        if self.jtag_en:
+            ports += list(self.jtag.external_ports())
+        else:
+            # don't add DMI if JTAG is enabled
+            ports += list(self.dbg.dmi.ports())
+
+        ports += list(self.imem.ibus.fields.values())
+        ports += list(self.core.l0.cmpi.wb_bus().fields.values())
+
+        if self.sram4x4k:
+            for sram in self.sram4k:
+                ports += list(sram.bus.fields.values())
+
+        if self.xics:
+            ports += list(self.xics_icp.bus.fields.values())
+            ports += list(self.xics_ics.bus.fields.values())
+            ports.append(self.int_level_i)
+        else:
+            ports.append(self.ext_irq)
+
+        if self.gpio:
+            ports += list(self.simple_gpio.bus.fields.values())
+            ports.append(self.gpio_o)
+
+        return ports
+
+    def ports(self):
+        return list(self)
+
+
+class TestIssuerInternal(TestIssuerBase):
+    """TestIssuer - reads instructions from TestMemory and issues them
+
+    efficiency and speed is not the main goal here: functional correctness
+    and code clarity is.  optimisations (which almost 100% interfere with
+    easy understanding) come later.
+    """
+
+    def fetch_fsm(self, m, dbg, core, core_rst, nia, is_svp64_mode,
+                        fetch_pc_o_ready, fetch_pc_i_valid,
+                        fetch_insn_o_valid, fetch_insn_i_ready):
         """fetch FSM
 
         this FSM performs fetch of raw instruction data, partial-decodes
@@ -292,83 +846,122 @@ class TestIssuerInternal(Elaboratable):
         pdecode2 = self.pdecode2
         cur_state = self.cur_state
         dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
+        pc, msr, svstate = cur_state.pc, cur_state.msr, cur_state.svstate
 
-        msr_read = Signal(reset=1)
+        # also note instruction fetch failed
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+            flush_needed = True
+        else:
+            fetch_failed = Const(0, 1)
+            flush_needed = False
+
+        # set priv / virt mode on I-Cache, sigh
+        if isinstance(self.imem, ICache):
+            comb += self.imem.i_in.priv_mode.eq(~msr[MSR.PR])
+            comb += self.imem.i_in.virt_mode.eq(msr[MSR.IR]) # Instr. Redir (VM)
 
         with m.FSM(name='fetch_fsm'):
 
+            # allow fetch to not run at startup due to I-Cache reset not
+            # having time to settle.  power-on-reset holds dbg.core_stopped_i
+            with m.State("PRE_IDLE"):
+                with m.If(~dbg.core_stopped_i & ~dbg.core_stop_o & ~core_rst):
+                    m.next = "IDLE"
+
             # waiting (zzz)
             with m.State("IDLE"):
-                comb += fetch_pc_ready_o.eq(1)
-                with m.If(fetch_pc_valid_i):
+                # fetch allowed if not failed and stopped but not stepping
+                # (see dmi.py for how core_stop_o is generated)
+                with m.If(~fetch_failed & ~dbg.core_stop_o):
+                    comb += fetch_pc_o_ready.eq(1)
+                with m.If(fetch_pc_i_valid & ~pdecode2.instr_fault
+                          & ~dbg.core_stop_o):
                     # instruction allowed to go: start by reading the PC
                     # capture the PC and also drop it into Insn Memory
                     # we have joined a pair of combinatorial memory
                     # lookups together.  this is Generally Bad.
                     comb += self.imem.a_pc_i.eq(pc)
-                    comb += self.imem.a_valid_i.eq(1)
-                    comb += self.imem.f_valid_i.eq(1)
-                    sync += cur_state.pc.eq(pc)
-                    sync += cur_state.svstate.eq(svstate) # and svstate
-
-                    # initiate read of MSR. arrives one clock later
-                    comb += self.state_r_msr.ren.eq(1 << StateRegs.MSR)
-                    sync += msr_read.eq(0)
-
+                    comb += self.imem.a_i_valid.eq(1)
+                    comb += self.imem.f_i_valid.eq(1)
                     m.next = "INSN_READ"  # move to "wait for bus" phase
 
             # dummy pause to find out why simulation is not keeping up
             with m.State("INSN_READ"):
-                # one cycle later, msr/sv read arrives.  valid only once.
-                with m.If(~msr_read):
-                    sync += msr_read.eq(1) # yeah don't read it again
-                    sync += cur_state.msr.eq(self.state_r_msr.data_o)
-                with m.If(self.imem.f_busy_o): # zzz...
-                    # busy: stay in wait-read
-                    comb += self.imem.a_valid_i.eq(1)
-                    comb += self.imem.f_valid_i.eq(1)
+                # when using "single-step" mode, checking dbg.stopping_o
+                # prevents progress.  allow fetch to proceed once started
+                stopping = Const(0)
+                #if self.allow_overlap:
+                #    stopping = dbg.stopping_o
+                with m.If(stopping):
+                    # stopping: jump back to idle
+                    m.next = "IDLE"
                 with m.Else():
-                    # not busy: instruction fetched
-                    insn = get_insn(self.imem.f_instr_o, cur_state.pc)
-                    if self.svp64_en:
-                        svp64 = self.svp64
-                        # decode the SVP64 prefix, if any
-                        comb += svp64.raw_opcode_in.eq(insn)
-                        comb += svp64.bigendian.eq(self.core_bigendian_i)
-                        # pass the decoded prefix (if any) to PowerDecoder2
-                        sync += pdecode2.sv_rm.eq(svp64.svp64_rm)
-                        sync += pdecode2.is_svp64_mode.eq(is_svp64_mode)
-                        # remember whether this is a prefixed instruction, so
-                        # the FSM can readily loop when VL==0
-                        sync += is_svp64_mode.eq(svp64.is_svp64_mode)
-                        # calculate the address of the following instruction
-                        insn_size = Mux(svp64.is_svp64_mode, 8, 4)
-                        sync += nia.eq(cur_state.pc + insn_size)
-                        with m.If(~svp64.is_svp64_mode):
-                            # with no prefix, store the instruction
-                            # and hand it directly to the next FSM
+                    with m.If(self.imem.f_busy_o &
+                              ~pdecode2.instr_fault):  # zzz...
+                        # busy but not fetch failed: stay in wait-read
+                        comb += self.imem.a_pc_i.eq(pc)
+                        comb += self.imem.a_i_valid.eq(1)
+                        comb += self.imem.f_i_valid.eq(1)
+                    with m.Else():
+                        # not busy (or fetch failed!): instruction fetched
+                        # when fetch failed, the instruction gets ignored
+                        # by the decoder
+                        if hasattr(core, "icache"):
+                            # blech, icache returns actual instruction
+                            insn = self.imem.f_instr_o
+                        else:
+                            # but these return raw memory
+                            insn = get_insn(self.imem.f_instr_o, cur_state.pc)
+                        if self.svp64_en:
+                            svp64 = self.svp64
+                            # decode the SVP64 prefix, if any
+                            comb += svp64.raw_opcode_in.eq(insn)
+                            comb += svp64.bigendian.eq(self.core_bigendian_i)
+                            # pass the decoded prefix (if any) to PowerDecoder2
+                            sync += pdecode2.sv_rm.eq(svp64.svp64_rm)
+                            sync += pdecode2.is_svp64_mode.eq(is_svp64_mode)
+                            # remember whether this is a prefixed instruction,
+                            # so the FSM can readily loop when VL==0
+                            sync += is_svp64_mode.eq(svp64.is_svp64_mode)
+                            # calculate the address of the following instruction
+                            insn_size = Mux(svp64.is_svp64_mode, 8, 4)
+                            sync += nia.eq(cur_state.pc + insn_size)
+                            with m.If(~svp64.is_svp64_mode):
+                                # with no prefix, store the instruction
+                                # and hand it directly to the next FSM
+                                sync += dec_opcode_i.eq(insn)
+                                m.next = "INSN_READY"
+                            with m.Else():
+                                # fetch the rest of the instruction from memory
+                                comb += self.imem.a_pc_i.eq(cur_state.pc + 4)
+                                comb += self.imem.a_i_valid.eq(1)
+                                comb += self.imem.f_i_valid.eq(1)
+                                m.next = "INSN_READ2"
+                        else:
+                            # not SVP64 - 32-bit only
+                            sync += nia.eq(cur_state.pc + 4)
                             sync += dec_opcode_i.eq(insn)
-                            m.next = "INSN_READY"
-                        with m.Else():
-                            # fetch the rest of the instruction from memory
-                            comb += self.imem.a_pc_i.eq(cur_state.pc + 4)
-                            comb += self.imem.a_valid_i.eq(1)
-                            comb += self.imem.f_valid_i.eq(1)
-                            m.next = "INSN_READ2"
-                    else:
-                        # not SVP64 - 32-bit only
-                        sync += nia.eq(cur_state.pc + 4)
-                        sync += dec_opcode_i.eq(insn)
-                        m.next = "INSN_READY"
+                            if self.microwatt_compat or self.fabric_compat:
+                                # for verilator debug purposes
+                                comb += self.insn.eq(insn)
+                                comb += self.nia.eq(cur_state.pc)
+                                comb += self.msr_o.eq(cur_state.msr)
+                                comb += self.nia_req.eq(1)
+                            m.next = "INSN_READY"
 
             with m.State("INSN_READ2"):
                 with m.If(self.imem.f_busy_o):  # zzz...
                     # busy: stay in wait-read
-                    comb += self.imem.a_valid_i.eq(1)
-                    comb += self.imem.f_valid_i.eq(1)
+                    comb += self.imem.a_i_valid.eq(1)
+                    comb += self.imem.f_i_valid.eq(1)
                 with m.Else():
                     # not busy: instruction fetched
-                    insn = get_insn(self.imem.f_instr_o, cur_state.pc+4)
+                    if hasattr(core, "icache"):
+                        # blech, icache returns actual instruction
+                        insn = self.imem.f_instr_o
+                    else:
+                        insn = get_insn(self.imem.f_instr_o, cur_state.pc+4)
                     sync += dec_opcode_i.eq(insn)
                     m.next = "INSN_READY"
                     # TODO: probably can start looking at pdecode2.rm_dec
@@ -389,13 +982,14 @@ class TestIssuerInternal(Elaboratable):
 
             with m.State("INSN_READY"):
                 # hand over the instruction, to be decoded
-                comb += fetch_insn_valid_o.eq(1)
-                with m.If(fetch_insn_ready_i):
+                comb += fetch_insn_o_valid.eq(1)
+                with m.If(fetch_insn_i_ready):
                     m.next = "IDLE"
 
+
     def fetch_predicate_fsm(self, m,
-                            pred_insn_valid_i, pred_insn_ready_o,
-                            pred_mask_valid_o, pred_mask_ready_i):
+                            pred_insn_i_valid, pred_insn_o_ready,
+                            pred_mask_o_valid, pred_mask_i_ready):
         """fetch_predicate_fsm - obtains (constructs in the case of CR)
            src/dest predicate masks
 
@@ -412,7 +1006,7 @@ class TestIssuerInternal(Elaboratable):
         comb = m.d.comb
         sync = m.d.sync
         pdecode2 = self.pdecode2
-        rm_dec = pdecode2.rm_dec # SVP64RMModeDecode
+        rm_dec = pdecode2.rm_dec  # SVP64RMModeDecode
         predmode = rm_dec.predmode
         srcpred, dstpred = rm_dec.srcpred, rm_dec.dstpred
         cr_pred, int_pred = self.cr_pred, self.int_pred   # read regfiles
@@ -437,8 +1031,8 @@ class TestIssuerInternal(Elaboratable):
         with m.FSM(name="fetch_predicate"):
 
             with m.State("FETCH_PRED_IDLE"):
-                comb += pred_insn_ready_o.eq(1)
-                with m.If(pred_insn_valid_i):
+                comb += pred_insn_o_ready.eq(1)
+                with m.If(pred_insn_i_valid):
                     with m.If(predmode == SVP64PredMode.INT):
                         # skip fetching destination mask register, when zero
                         with m.If(dall1s):
@@ -470,11 +1064,11 @@ class TestIssuerInternal(Elaboratable):
                 with m.If(dunary):
                     # set selected mask bit for 1<<r3 mode
                     dst_shift = Signal(range(64))
-                    comb += dst_shift.eq(self.int_pred.data_o & 0b111111)
+                    comb += dst_shift.eq(self.int_pred.o_data & 0b111111)
                     sync += new_dstmask.eq(1 << dst_shift)
                 with m.Else():
                     # invert mask if requested
-                    sync += new_dstmask.eq(self.int_pred.data_o ^ inv)
+                    sync += new_dstmask.eq(self.int_pred.o_data ^ inv)
                 # skip fetching source mask register, when zero
                 with m.If(sall1s):
                     sync += new_srcmask.eq(-1)
@@ -491,11 +1085,11 @@ class TestIssuerInternal(Elaboratable):
                 with m.If(sunary):
                     # set selected mask bit for 1<<r3 mode
                     src_shift = Signal(range(64))
-                    comb += src_shift.eq(self.int_pred.data_o & 0b111111)
+                    comb += src_shift.eq(self.int_pred.o_data & 0b111111)
                     sync += new_srcmask.eq(1 << src_shift)
                 with m.Else():
                     # invert mask if requested
-                    sync += new_srcmask.eq(self.int_pred.data_o ^ inv)
+                    sync += new_srcmask.eq(self.int_pred.o_data ^ inv)
                 m.next = "FETCH_PRED_SHIFT_MASK"
 
             # fetch masks from the CR register file
@@ -538,9 +1132,11 @@ class TestIssuerInternal(Elaboratable):
                     cr_field = Signal(4)
                     scr_bit = Signal()
                     dcr_bit = Signal()
-                    comb += cr_field.eq(cr_pred.data_o)
-                    comb += scr_bit.eq(cr_field.bit_select(sidx, 1) ^ scrinvert)
-                    comb += dcr_bit.eq(cr_field.bit_select(didx, 1) ^ dcrinvert)
+                    comb += cr_field.eq(cr_pred.o_data)
+                    comb += scr_bit.eq(cr_field.bit_select(sidx, 1)
+                                       ^ scrinvert)
+                    comb += dcr_bit.eq(cr_field.bit_select(didx, 1)
+                                       ^ dcrinvert)
                     # set the corresponding mask bit
                     bit_to_set = Signal.like(self.srcmask)
                     comb += bit_to_set.eq(1 << cur_cr_idx)
@@ -556,18 +1152,18 @@ class TestIssuerInternal(Elaboratable):
                 m.next = "FETCH_PRED_DONE"
 
             with m.State("FETCH_PRED_DONE"):
-                comb += pred_mask_valid_o.eq(1)
-                with m.If(pred_mask_ready_i):
+                comb += pred_mask_o_valid.eq(1)
+                with m.If(pred_mask_i_ready):
                     m.next = "FETCH_PRED_IDLE"
 
-    def issue_fsm(self, m, core, pc_changed, sv_changed, nia,
+    def issue_fsm(self, m, core, nia,
                   dbg, core_rst, is_svp64_mode,
-                  fetch_pc_ready_o, fetch_pc_valid_i,
-                  fetch_insn_valid_o, fetch_insn_ready_i,
-                  pred_insn_valid_i, pred_insn_ready_o,
-                  pred_mask_valid_o, pred_mask_ready_i,
-                  exec_insn_valid_i, exec_insn_ready_o,
-                  exec_pc_valid_o, exec_pc_ready_i):
+                  fetch_pc_o_ready, fetch_pc_i_valid,
+                  fetch_insn_o_valid, fetch_insn_i_ready,
+                  pred_insn_i_valid, pred_insn_o_ready,
+                  pred_mask_o_valid, pred_mask_i_ready,
+                  exec_insn_i_valid, exec_insn_o_ready,
+                  exec_pc_o_valid, exec_pc_i_ready):
         """issue FSM
 
         decode / issue FSM.  this interacts with the "fetch" FSM
@@ -583,13 +1179,12 @@ class TestIssuerInternal(Elaboratable):
         sync = m.d.sync
         pdecode2 = self.pdecode2
         cur_state = self.cur_state
+        new_svstate = self.new_svstate
 
         # temporaries
-        dec_opcode_i = pdecode2.dec.raw_opcode_in # raw opcode
+        dec_opcode_i = pdecode2.dec.raw_opcode_in  # raw opcode
 
         # for updating svstate (things like srcstep etc.)
-        update_svstate = Signal() # set this (below) if updating
-        new_svstate = SVSTATERec("new_svstate")
         comb += new_svstate.eq(cur_state.svstate)
 
         # precalculate srcstep+1 and dststep+1
@@ -602,68 +1197,96 @@ class TestIssuerInternal(Elaboratable):
 
         # note if an exception happened.  in a pipelined or OoO design
         # this needs to be accompanied by "shadowing" (or stalling)
-        el = []
-        for exc in core.fus.excs.values():
-            el.append(exc.happened)
-        exc_happened = Signal()
-        if len(el) > 0: # at least one exception
-            comb += exc_happened.eq(Cat(*el).bool())
+        exc_happened = self.core.o.exc_happened
+        # also note instruction fetch failed
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+            flush_needed = True
+            # set to fault in decoder
+            # update (highest priority) instruction fault
+            rising_fetch_failed = rising_edge(m, fetch_failed)
+            with m.If(rising_fetch_failed):
+                sync += pdecode2.instr_fault.eq(1)
+        else:
+            fetch_failed = Const(0, 1)
+            flush_needed = False
+
+        sync += fetch_pc_i_valid.eq(0)
 
         with m.FSM(name="issue_fsm"):
 
+            with m.State("PRE_IDLE"):
+                with m.If(~dbg.core_stop_o & ~core_rst):
+                    m.next = "ISSUE_START"
+
             # sync with the "fetch" phase which is reading the instruction
             # at this point, there is no instruction running, that
             # could inadvertently update the PC.
             with m.State("ISSUE_START"):
+                # reset instruction fault
+                sync += pdecode2.instr_fault.eq(0)
                 # wait on "core stop" release, before next fetch
                 # need to do this here, in case we are in a VL==0 loop
                 with m.If(~dbg.core_stop_o & ~core_rst):
-                    comb += fetch_pc_valid_i.eq(1) # tell fetch to start
-                    with m.If(fetch_pc_ready_o):   # fetch acknowledged us
+                    sync += fetch_pc_i_valid.eq(1)  # tell fetch to start
+                    sync += cur_state.pc.eq(dbg.state.pc)
+                    sync += cur_state.svstate.eq(dbg.state.svstate)
+                    sync += cur_state.msr.eq(dbg.state.msr)
+                    with m.If(fetch_pc_o_ready):   # fetch acknowledged us
                         m.next = "INSN_WAIT"
                 with m.Else():
                     # tell core it's stopped, and acknowledge debug handshake
                     comb += dbg.core_stopped_i.eq(1)
-                    # while stopped, allow updating the PC and SVSTATE
-                    with m.If(self.pc_i.ok):
-                        comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
-                        comb += self.state_w_pc.data_i.eq(self.pc_i.data)
-                        sync += pc_changed.eq(1)
+                    # while stopped, allow updating SVSTATE
                     with m.If(self.svstate_i.ok):
                         comb += new_svstate.eq(self.svstate_i.data)
-                        comb += update_svstate.eq(1)
-                        sync += sv_changed.eq(1)
+                        comb += self.update_svstate.eq(1)
+                        sync += self.sv_changed.eq(1)
 
             # wait for an instruction to arrive from Fetch
             with m.State("INSN_WAIT"):
-                comb += fetch_insn_ready_i.eq(1)
-                with m.If(fetch_insn_valid_o):
-                    # loop into ISSUE_START if it's a SVP64 instruction
-                    # and VL == 0.  this because VL==0 is a for-loop
-                    # from 0 to 0 i.e. always, always a NOP.
-                    cur_vl = cur_state.svstate.vl
-                    with m.If(is_svp64_mode & (cur_vl == 0)):
-                        # update the PC before fetching the next instruction
-                        # since we are in a VL==0 loop, no instruction was
-                        # executed that we could be overwriting
-                        comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
-                        comb += self.state_w_pc.data_i.eq(nia)
-                        comb += self.insn_done.eq(1)
-                        m.next = "ISSUE_START"
-                    with m.Else():
-                        if self.svp64_en:
-                            m.next = "PRED_START"  # start fetching predicate
-                        else:
-                            m.next = "DECODE_SV"  # skip predication
+                # when using "single-step" mode, checking dbg.stopping_o
+                # prevents progress.  allow issue to proceed once started
+                stopping = Const(0)
+                #if self.allow_overlap:
+                #    stopping = dbg.stopping_o
+                with m.If(stopping):
+                    # stopping: jump back to idle
+                    m.next = "ISSUE_START"
+                    if flush_needed:
+                        # request the icache to stop asserting "failed"
+                        comb += core.icache.flush_in.eq(1)
+                    # stop instruction fault
+                    sync += pdecode2.instr_fault.eq(0)
+                with m.Else():
+                    comb += fetch_insn_i_ready.eq(1)
+                    with m.If(fetch_insn_o_valid):
+                        # loop into ISSUE_START if it's a SVP64 instruction
+                        # and VL == 0.  this because VL==0 is a for-loop
+                        # from 0 to 0 i.e. always, always a NOP.
+                        cur_vl = cur_state.svstate.vl
+                        with m.If(is_svp64_mode & (cur_vl == 0)):
+                            # update the PC before fetching the next instruction
+                            # since we are in a VL==0 loop, no instruction was
+                            # executed that we could be overwriting
+                            comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+                            comb += self.state_w_pc.i_data.eq(nia)
+                            comb += self.insn_done.eq(1)
+                            m.next = "ISSUE_START"
+                        with m.Else():
+                            if self.svp64_en:
+                                m.next = "PRED_START"  # fetching predicate
+                            else:
+                                m.next = "DECODE_SV"  # skip predication
 
             with m.State("PRED_START"):
-                comb += pred_insn_valid_i.eq(1)  # tell fetch_pred to start
-                with m.If(pred_insn_ready_o):  # fetch_pred acknowledged us
+                comb += pred_insn_i_valid.eq(1)  # tell fetch_pred to start
+                with m.If(pred_insn_o_ready):  # fetch_pred acknowledged us
                     m.next = "MASK_WAIT"
 
             with m.State("MASK_WAIT"):
-                comb += pred_mask_ready_i.eq(1) # ready to receive the masks
-                with m.If(pred_mask_valid_o): # predication masks are ready
+                comb += pred_mask_i_ready.eq(1)  # ready to receive the masks
+                with m.If(pred_mask_o_valid):  # predication masks are ready
                     m.next = "PRED_SKIP"
 
             # skip zeros in predicate
@@ -713,10 +1336,10 @@ class TestIssuerInternal(Elaboratable):
                                   (skip_dststep >= cur_vl)):
                             # end of VL loop. Update PC and reset src/dst step
                             comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
-                            comb += self.state_w_pc.data_i.eq(nia)
+                            comb += self.state_w_pc.i_data.eq(nia)
                             comb += new_svstate.srcstep.eq(0)
                             comb += new_svstate.dststep.eq(0)
-                            comb += update_svstate.eq(1)
+                            comb += self.update_svstate.eq(1)
                             # synchronize with the simulator
                             comb += self.insn_done.eq(1)
                             # go back to Issue
@@ -725,125 +1348,151 @@ class TestIssuerInternal(Elaboratable):
                             # update new src/dst step
                             comb += new_svstate.srcstep.eq(skip_srcstep)
                             comb += new_svstate.dststep.eq(skip_dststep)
-                            comb += update_svstate.eq(1)
+                            comb += self.update_svstate.eq(1)
                             # proceed to Decode
                             m.next = "DECODE_SV"
 
                         # pass predicate mask bits through to satellite decoders
                         # TODO: for SIMD this will be *multiple* bits
-                        sync += core.sv_pred_sm.eq(self.srcmask[0])
-                        sync += core.sv_pred_dm.eq(self.dstmask[0])
+                        sync += core.i.sv_pred_sm.eq(self.srcmask[0])
+                        sync += core.i.sv_pred_dm.eq(self.dstmask[0])
 
             # after src/dst step have been updated, we are ready
             # to decode the instruction
             with m.State("DECODE_SV"):
                 # decode the instruction
-                sync += core.e.eq(pdecode2.e)
-                sync += core.state.eq(cur_state)
-                sync += core.raw_insn_i.eq(dec_opcode_i)
-                sync += core.bigendian_i.eq(self.core_bigendian_i)
+                with m.If(~fetch_failed):
+                    sync += pdecode2.instr_fault.eq(0)
+                sync += core.i.e.eq(pdecode2.e)
+                sync += core.i.state.eq(cur_state)
+                sync += core.i.raw_insn_i.eq(dec_opcode_i)
+                sync += core.i.bigendian_i.eq(self.core_bigendian_i)
                 if self.svp64_en:
-                    sync += core.sv_rm.eq(pdecode2.sv_rm)
+                    sync += core.i.sv_rm.eq(pdecode2.sv_rm)
                     # set RA_OR_ZERO detection in satellite decoders
-                    sync += core.sv_a_nz.eq(pdecode2.sv_a_nz)
+                    sync += core.i.sv_a_nz.eq(pdecode2.sv_a_nz)
                     # and svp64 detection
-                    sync += core.is_svp64_mode.eq(is_svp64_mode)
+                    sync += core.i.is_svp64_mode.eq(is_svp64_mode)
                     # and svp64 bit-rev'd ldst mode
                     ldst_dec = pdecode2.use_svp64_ldst_dec
-                    sync += core.use_svp64_ldst_dec.eq(ldst_dec)
+                    sync += core.i.use_svp64_ldst_dec.eq(ldst_dec)
+                # after decoding, reset any previous exception condition,
+                # allowing it to be set again during the next execution
+                sync += pdecode2.ldst_exc.eq(0)
 
                 m.next = "INSN_EXECUTE"  # move to "execute"
 
             # handshake with execution FSM, move to "wait" once acknowledged
             with m.State("INSN_EXECUTE"):
-                comb += exec_insn_valid_i.eq(1) # trigger execute
-                with m.If(exec_insn_ready_o):   # execute acknowledged us
-                    m.next = "EXECUTE_WAIT"
+                # when using "single-step" mode, checking dbg.stopping_o
+                # prevents progress.  allow execute to proceed once started
+                stopping = Const(0)
+                #if self.allow_overlap:
+                #    stopping = dbg.stopping_o
+                with m.If(stopping):
+                    # stopping: jump back to idle
+                    m.next = "ISSUE_START"
+                    if flush_needed:
+                        # request the icache to stop asserting "failed"
+                        comb += core.icache.flush_in.eq(1)
+                    # stop instruction fault
+                    sync += pdecode2.instr_fault.eq(0)
+                with m.Else():
+                    comb += exec_insn_i_valid.eq(1)  # trigger execute
+                    with m.If(exec_insn_o_ready):   # execute acknowledged us
+                        m.next = "EXECUTE_WAIT"
 
             with m.State("EXECUTE_WAIT"):
-                # wait on "core stop" release, at instruction end
-                # need to do this here, in case we are in a VL>1 loop
-                with m.If(~dbg.core_stop_o & ~core_rst):
-                    comb += exec_pc_ready_i.eq(1)
-                    # see https://bugs.libre-soc.org/show_bug.cgi?id=636
-                    #with m.If(exec_pc_valid_o & exc_happened):
-                    #    probably something like this:
-                    #    sync += pdecode2.ldst_exc.eq(core.fus.get_exc("ldst0")
-                    # TODO: the exception info needs to be blatted
-                    # into pdecode.ldst_exc, and the instruction "re-run".
-                    # when ldst_exc.happened is set, the PowerDecoder2
-                    # reacts very differently: it re-writes the instruction
-                    # with a "trap" (calls PowerDecoder2.trap()) which
-                    # will *overwrite* whatever was requested and jump the
-                    # PC to the exception address, as well as alter MSR.
-                    # nothing else needs to be done other than to note
-                    # the change of PC and MSR (and, later, SVSTATE)
-                    #with m.Elif(exec_pc_valid_o):
-                    with m.If(exec_pc_valid_o): # replace with Elif (above)
-
-                        # was this the last loop iteration?
-                        is_last = Signal()
-                        cur_vl = cur_state.svstate.vl
-                        comb += is_last.eq(next_srcstep == cur_vl)
+                comb += exec_pc_i_ready.eq(1)
+                # see https://bugs.libre-soc.org/show_bug.cgi?id=636
+                # the exception info needs to be blatted into
+                # pdecode.ldst_exc, and the instruction "re-run".
+                # when ldst_exc.happened is set, the PowerDecoder2
+                # reacts very differently: it re-writes the instruction
+                # with a "trap" (calls PowerDecoder2.trap()) which
+                # will *overwrite* whatever was requested and jump the
+                # PC to the exception address, as well as alter MSR.
+                # nothing else needs to be done other than to note
+                # the change of PC and MSR (and, later, SVSTATE)
+                with m.If(exc_happened):
+                    mmu = core.fus.get_exc("mmu0")
+                    ldst = core.fus.get_exc("ldst0")
+                    if mmu is not None:
+                        with m.If(fetch_failed):
+                            # instruction fetch: exception is from MMU
+                            # reset instr_fault (highest priority)
+                            sync += pdecode2.ldst_exc.eq(mmu)
+                            sync += pdecode2.instr_fault.eq(0)
+                            if flush_needed:
+                                # request icache to stop asserting "failed"
+                                comb += core.icache.flush_in.eq(1)
+                    with m.If(~fetch_failed):
+                        # otherwise assume it was a LDST exception
+                        sync += pdecode2.ldst_exc.eq(ldst)
+
+                with m.If(exec_pc_o_valid):
+
+                    # was this the last loop iteration?
+                    is_last = Signal()
+                    cur_vl = cur_state.svstate.vl
+                    comb += is_last.eq(next_srcstep == cur_vl)
 
-                        # if either PC or SVSTATE were changed by the previous
-                        # instruction, go directly back to Fetch, without
-                        # updating either PC or SVSTATE
-                        with m.If(pc_changed | sv_changed):
-                            m.next = "ISSUE_START"
+                    with m.If(pdecode2.instr_fault):
+                        # reset instruction fault, try again
+                        sync += pdecode2.instr_fault.eq(0)
+                        m.next = "ISSUE_START"
 
-                        # also return to Fetch, when no output was a vector
-                        # (regardless of SRCSTEP and VL), or when the last
-                        # instruction was really the last one of the VL loop
-                        with m.Elif((~pdecode2.loop_continue) | is_last):
-                            # before going back to fetch, update the PC state
-                            # register with the NIA.
-                            # ok here we are not reading the branch unit.
-                            # TODO: this just blithely overwrites whatever
-                            #       pipeline updated the PC
-                            comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
-                            comb += self.state_w_pc.data_i.eq(nia)
-                            # reset SRCSTEP before returning to Fetch
-                            if self.svp64_en:
-                                with m.If(pdecode2.loop_continue):
-                                    comb += new_svstate.srcstep.eq(0)
-                                    comb += new_svstate.dststep.eq(0)
-                                    comb += update_svstate.eq(1)
-                            else:
+                    # return directly to Decode if Execute generated an
+                    # exception.
+                    with m.Elif(pdecode2.ldst_exc.happened):
+                        m.next = "DECODE_SV"
+
+                    # if MSR, PC or SVSTATE were changed by the previous
+                    # instruction, go directly back to Fetch, without
+                    # updating either MSR PC or SVSTATE
+                    with m.Elif(self.msr_changed | self.pc_changed |
+                                self.sv_changed):
+                        m.next = "ISSUE_START"
+
+                    # also return to Fetch, when no output was a vector
+                    # (regardless of SRCSTEP and VL), or when the last
+                    # instruction was really the last one of the VL loop
+                    with m.Elif((~pdecode2.loop_continue) | is_last):
+                        # before going back to fetch, update the PC state
+                        # register with the NIA.
+                        # ok here we are not reading the branch unit.
+                        # TODO: this just blithely overwrites whatever
+                        #       pipeline updated the PC
+                        comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
+                        comb += self.state_w_pc.i_data.eq(nia)
+                        # reset SRCSTEP before returning to Fetch
+                        if self.svp64_en:
+                            with m.If(pdecode2.loop_continue):
                                 comb += new_svstate.srcstep.eq(0)
                                 comb += new_svstate.dststep.eq(0)
-                                comb += update_svstate.eq(1)
-                            m.next = "ISSUE_START"
+                                comb += self.update_svstate.eq(1)
+                        else:
+                            comb += new_svstate.srcstep.eq(0)
+                            comb += new_svstate.dststep.eq(0)
+                            comb += self.update_svstate.eq(1)
+                        m.next = "ISSUE_START"
 
-                        # returning to Execute? then, first update SRCSTEP
-                        with m.Else():
-                            comb += new_svstate.srcstep.eq(next_srcstep)
-                            comb += new_svstate.dststep.eq(next_dststep)
-                            comb += update_svstate.eq(1)
-                            # return to mask skip loop
-                            m.next = "PRED_SKIP"
+                    # returning to Execute? then, first update SRCSTEP
+                    with m.Else():
+                        comb += new_svstate.srcstep.eq(next_srcstep)
+                        comb += new_svstate.dststep.eq(next_dststep)
+                        comb += self.update_svstate.eq(1)
+                        # return to mask skip loop
+                        m.next = "PRED_SKIP"
 
-                with m.Else():
-                    comb += dbg.core_stopped_i.eq(1)
-                    # while stopped, allow updating the PC and SVSTATE
-                    with m.If(self.pc_i.ok):
-                        comb += self.state_w_pc.wen.eq(1 << StateRegs.PC)
-                        comb += self.state_w_pc.data_i.eq(self.pc_i.data)
-                        sync += pc_changed.eq(1)
-                    with m.If(self.svstate_i.ok):
-                        comb += new_svstate.eq(self.svstate_i.data)
-                        comb += update_svstate.eq(1)
-                        sync += sv_changed.eq(1)
 
         # check if svstate needs updating: if so, write it to State Regfile
-        with m.If(update_svstate):
-            comb += self.state_w_sv.wen.eq(1<<StateRegs.SVSTATE)
-            comb += self.state_w_sv.data_i.eq(new_svstate)
-            sync += cur_state.svstate.eq(new_svstate) # for next clock
-
-    def execute_fsm(self, m, core, pc_changed, sv_changed,
-                    exec_insn_valid_i, exec_insn_ready_o,
-                    exec_pc_valid_o, exec_pc_ready_i):
+        with m.If(self.update_svstate):
+            sync += cur_state.svstate.eq(self.new_svstate)  # for next clock
+
+    def execute_fsm(self, m, core,
+                    exec_insn_i_valid, exec_insn_o_ready,
+                    exec_pc_o_valid, exec_pc_i_ready):
         """execute FSM
 
         execute FSM. this interacts with the "issue" FSM
@@ -854,133 +1503,78 @@ class TestIssuerInternal(Elaboratable):
 
         comb = m.d.comb
         sync = m.d.sync
+        dbg = self.dbg
         pdecode2 = self.pdecode2
+        cur_state = self.cur_state
 
         # temporaries
-        core_busy_o = core.busy_o                 # core is busy
-        core_ivalid_i = core.ivalid_i             # instruction is valid
-        core_issue_i = core.issue_i               # instruction is issued
-        insn_type = core.e.do.insn_type           # instruction MicroOp type
+        core_busy_o = core.n.o_data.busy_o  # core is busy
+        core_ivalid_i = core.p.i_valid              # instruction is valid
+
+        if hasattr(core, "icache"):
+            fetch_failed = core.icache.i_out.fetch_failed
+        else:
+            fetch_failed = Const(0, 1)
 
         with m.FSM(name="exec_fsm"):
 
             # waiting for instruction bus (stays there until not busy)
             with m.State("INSN_START"):
-                comb += exec_insn_ready_o.eq(1)
-                with m.If(exec_insn_valid_i):
-                    comb += core_ivalid_i.eq(1)  # instruction is valid
-                    comb += core_issue_i.eq(1)  # and issued
-                    sync += sv_changed.eq(0)
-                    sync += pc_changed.eq(0)
-                    m.next = "INSN_ACTIVE"  # move to "wait completion"
+                comb += exec_insn_o_ready.eq(1)
+                with m.If(exec_insn_i_valid):
+                    comb += core_ivalid_i.eq(1)  # instruction is valid/issued
+                    sync += self.sv_changed.eq(0)
+                    sync += self.pc_changed.eq(0)
+                    sync += self.msr_changed.eq(0)
+                    with m.If(core.p.o_ready):  # only move if accepted
+                        m.next = "INSN_ACTIVE"  # move to "wait completion"
 
             # instruction started: must wait till it finishes
             with m.State("INSN_ACTIVE"):
-                with m.If(insn_type != MicrOp.OP_NOP):
-                    comb += core_ivalid_i.eq(1) # instruction is valid
-                # note changes to PC and SVSTATE
-                with m.If(self.state_nia.wen & (1<<StateRegs.SVSTATE)):
-                    sync += sv_changed.eq(1)
-                with m.If(self.state_nia.wen & (1<<StateRegs.PC)):
-                    sync += pc_changed.eq(1)
-                with m.If(~core_busy_o): # instruction done!
-                    comb += exec_pc_valid_o.eq(1)
-                    with m.If(exec_pc_ready_i):
-                        comb += self.insn_done.eq(1)
+                # note changes to MSR, PC and SVSTATE
+                with m.If(self.state_nia.wen & (1 << StateRegs.SVSTATE)):
+                    sync += self.sv_changed.eq(1)
+                with m.If(self.state_nia.wen & (1 << StateRegs.MSR)):
+                    sync += self.msr_changed.eq(1)
+                with m.If(self.state_nia.wen & (1 << StateRegs.PC)):
+                    sync += self.pc_changed.eq(1)
+                # and note changes to DEC/TB, to be passed to DEC/TB FSM
+                with m.If(self.state_spr.wen & (1 << StateRegs.TB)):
+                    comb += self.pause_dec_tb.eq(1)
+                # but also zero-out the cur_state DEC so that, on
+                # the next instruction, if it is "enable interrupt"
+                # the delay between the DEC/TB FSM reading and updating
+                # cur_state.dec doesn't trigger a spurious interrupt.
+                # the DEC/TB FSM will read the regfile and update to
+                # the correct value, so having cur_state.dec set to zero
+                # for a while is no big deal.
+                with m.If(self.state_spr.wen & (1 << StateRegs.DEC)):
+                    comb += self.pause_dec_tb.eq(1)
+                    sync += cur_state.dec.eq(0) # only needs top bit clear
+                with m.If(~core_busy_o):  # instruction done!
+                    comb += exec_pc_o_valid.eq(1)
+                    with m.If(exec_pc_i_ready):
+                        # when finished, indicate "done".
+                        # however, if there was an exception, the instruction
+                        # is *not* yet done.  this is an implementation
+                        # detail: we choose to implement exceptions by
+                        # taking the exception information from the LDST
+                        # unit, putting that *back* into the PowerDecoder2,
+                        # and *re-running the entire instruction*.
+                        # if we erroneously indicate "done" here, it is as if
+                        # there were *TWO* instructions:
+                        # 1) the failed LDST 2) a TRAP.
+                        with m.If(~pdecode2.ldst_exc.happened &
+                                   ~pdecode2.instr_fault):
+                            comb += self.insn_done.eq(1)
                         m.next = "INSN_START"  # back to fetch
-
-    def setup_peripherals(self, m):
-        comb, sync = m.d.comb, m.d.sync
-
-        # okaaaay so the debug module must be in coresync clock domain
-        # but NOT its reset signal. to cope with this, set every single
-        # submodule explicitly in coresync domain, debug and JTAG
-        # in their own one but using *external* reset.
-        csd = DomainRenamer("coresync")
-        dbd = DomainRenamer(self.dbg_domain)
-
-        m.submodules.core = core = csd(self.core)
-        m.submodules.imem = imem = csd(self.imem)
-        m.submodules.dbg = dbg = dbd(self.dbg)
-        if self.jtag_en:
-            m.submodules.jtag = jtag = dbd(self.jtag)
-            # TODO: UART2GDB mux, here, from external pin
-            # see https://bugs.libre-soc.org/show_bug.cgi?id=499
-            sync += dbg.dmi.connect_to(jtag.dmi)
-
-        cur_state = self.cur_state
-
-        # 4x 4k SRAM blocks.  these simply "exist", they get routed in litex
-        if self.sram4x4k:
-            for i, sram in enumerate(self.sram4k):
-                m.submodules["sram4k_%d" % i] = csd(sram)
-                comb += sram.enable.eq(self.wb_sram_en)
-
-        # XICS interrupt handler
-        if self.xics:
-            m.submodules.xics_icp = icp = csd(self.xics_icp)
-            m.submodules.xics_ics = ics = csd(self.xics_ics)
-            comb += icp.ics_i.eq(ics.icp_o)           # connect ICS to ICP
-            sync += cur_state.eint.eq(icp.core_irq_o) # connect ICP to core
-
-        # GPIO test peripheral
-        if self.gpio:
-            m.submodules.simple_gpio = simple_gpio = csd(self.simple_gpio)
-
-        # connect one GPIO output to ICS bit 15 (like in microwatt soc.vhdl)
-        # XXX causes litex ECP5 test to get wrong idea about input and output
-        # (but works with verilator sim *sigh*)
-        #if self.gpio and self.xics:
-        #   comb += self.int_level_i[15].eq(simple_gpio.gpio_o[0])
-
-        # instruction decoder
-        pdecode = create_pdecode()
-        m.submodules.dec2 = pdecode2 = csd(self.pdecode2)
-        if self.svp64_en:
-            m.submodules.svp64 = svp64 = csd(self.svp64)
-
-        # convenience
-        dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
-        intrf = self.core.regs.rf['int']
-
-        # clock delay power-on reset
-        cd_por  = ClockDomain(reset_less=True)
-        cd_sync = ClockDomain()
-        core_sync = ClockDomain("coresync")
-        m.domains += cd_por, cd_sync, core_sync
-        if self.dbg_domain != "sync":
-            dbg_sync = ClockDomain(self.dbg_domain)
-            m.domains += dbg_sync
-
-        ti_rst = Signal(reset_less=True)
-        delay = Signal(range(4), reset=3)
-        with m.If(delay != 0):
-            m.d.por += delay.eq(delay - 1)
-        comb += cd_por.clk.eq(ClockSignal())
-
-        # power-on reset delay
-        core_rst = ResetSignal("coresync")
-        comb += ti_rst.eq(delay != 0 | dbg.core_rst_o | ResetSignal())
-        comb += core_rst.eq(ti_rst)
-
-        # debug clock is same as coresync, but reset is *main external*
-        if self.dbg_domain != "sync":
-            dbg_rst = ResetSignal(self.dbg_domain)
-            comb += dbg_rst.eq(ResetSignal())
-
-        # busy/halted signals from core
-        comb += self.busy_o.eq(core.busy_o)
-        comb += pdecode2.dec.bigendian.eq(self.core_bigendian_i)
-
-        # temporary hack: says "go" immediately for both address gen and ST
-        l0 = core.l0
-        ldst = core.fus.fus['ldst0']
-        st_go_edge = rising_edge(m, ldst.st.rel_o)
-        m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o) # link addr-go direct to rel
-        m.d.comb += ldst.st.go_i.eq(st_go_edge) # link store-go to rising rel
+                # terminate returns directly to INSN_START
+                with m.If(dbg.terminate_i):
+                    # comb += self.insn_done.eq(1) - no because it's not
+                    m.next = "INSN_START"  # back to fetch
 
     def elaborate(self, platform):
-        m = Module()
+        m = super().elaborate(platform)
         # convenience
         comb, sync = m.d.comb, m.d.sync
         cur_state = self.cur_state
@@ -990,43 +1584,17 @@ class TestIssuerInternal(Elaboratable):
 
         # set up peripherals and core
         core_rst = self.core_rst
-        self.setup_peripherals(m)
 
-        # reset current state if core reset requested
-        with m.If(core_rst):
-            m.d.sync += self.cur_state.eq(0)
-
-        # PC and instruction from I-Memory
-        comb += self.pc_o.eq(cur_state.pc)
-        pc_changed = Signal() # note write to PC
-        sv_changed = Signal() # note write to SVSTATE
-
-        # read state either from incoming override or from regfile
-        # TODO: really should be doing MSR in the same way
-        pc = state_get(m, core_rst, self.pc_i,
-                            "pc",                  # read PC
-                            self.state_r_pc, StateRegs.PC)
-        svstate = state_get(m, core_rst, self.svstate_i,
-                            "svstate",   # read SVSTATE
-                            self.state_r_sv, StateRegs.SVSTATE)
-
-        # don't write pc every cycle
-        comb += self.state_w_pc.wen.eq(0)
-        comb += self.state_w_pc.data_i.eq(0)
-
-        # don't read msr every cycle
-        comb += self.state_r_msr.ren.eq(0)
+        # indicate to outside world if any FU is still executing
+        comb += self.any_busy.eq(core.n.o_data.any_busy_o)  # any FU executing
 
         # address of the next instruction, in the absence of a branch
         # depends on the instruction size
         nia = Signal(64)
 
         # connect up debug signals
-        # TODO comb += core.icache_rst_i.eq(dbg.icache_rst_o)
-        comb += dbg.terminate_i.eq(core.core_terminate_o)
-        comb += dbg.state.pc.eq(pc)
-        comb += dbg.state.svstate.eq(svstate)
-        comb += dbg.state.msr.eq(cur_state.msr)
+        with m.If(core.o.core_terminate_o):
+            comb += dbg.terminate_i.eq(1)
 
         # pass the prefix mode from Fetch to Issue, so the latter can loop
         # on VL==0
@@ -1037,28 +1605,28 @@ class TestIssuerInternal(Elaboratable):
         # these are the handshake signals between each
 
         # fetch FSM can run as soon as the PC is valid
-        fetch_pc_valid_i = Signal() # Execute tells Fetch "start next read"
-        fetch_pc_ready_o = Signal() # Fetch Tells SVSTATE "proceed"
+        fetch_pc_i_valid = Signal()  # Execute tells Fetch "start next read"
+        fetch_pc_o_ready = Signal()  # Fetch Tells SVSTATE "proceed"
 
         # fetch FSM hands over the instruction to be decoded / issued
-        fetch_insn_valid_o = Signal()
-        fetch_insn_ready_i = Signal()
+        fetch_insn_o_valid = Signal()
+        fetch_insn_i_ready = Signal()
 
         # predicate fetch FSM decodes and fetches the predicate
-        pred_insn_valid_i = Signal()
-        pred_insn_ready_o = Signal()
+        pred_insn_i_valid = Signal()
+        pred_insn_o_ready = Signal()
 
         # predicate fetch FSM delivers the masks
-        pred_mask_valid_o = Signal()
-        pred_mask_ready_i = Signal()
+        pred_mask_o_valid = Signal()
+        pred_mask_i_ready = Signal()
 
         # issue FSM delivers the instruction to the be executed
-        exec_insn_valid_i = Signal()
-        exec_insn_ready_o = Signal()
+        exec_insn_i_valid = Signal()
+        exec_insn_o_ready = Signal()
 
         # execute FSM, hands over the PC/SVSTATE back to the issue FSM
-        exec_pc_valid_o = Signal()
-        exec_pc_ready_i = Signal()
+        exec_pc_o_valid = Signal()
+        exec_pc_i_ready = Signal()
 
         # the FSMs here are perhaps unusual in that they detect conditions
         # then "hold" information, combinatorially, for the core
@@ -1069,197 +1637,50 @@ class TestIssuerInternal(Elaboratable):
         # Issue is where the VL for-loop # lives.  the ready/valid
         # signalling is used to communicate between the four.
 
-        self.fetch_fsm(m, core, pc, svstate, nia, is_svp64_mode,
-                       fetch_pc_ready_o, fetch_pc_valid_i,
-                       fetch_insn_valid_o, fetch_insn_ready_i)
+        self.fetch_fsm(m, dbg, core, core_rst, nia, is_svp64_mode,
+                       fetch_pc_o_ready, fetch_pc_i_valid,
+                       fetch_insn_o_valid, fetch_insn_i_ready)
 
-        self.issue_fsm(m, core, pc_changed, sv_changed, nia,
+        self.issue_fsm(m, core, nia,
                        dbg, core_rst, is_svp64_mode,
-                       fetch_pc_ready_o, fetch_pc_valid_i,
-                       fetch_insn_valid_o, fetch_insn_ready_i,
-                       pred_insn_valid_i, pred_insn_ready_o,
-                       pred_mask_valid_o, pred_mask_ready_i,
-                       exec_insn_valid_i, exec_insn_ready_o,
-                       exec_pc_valid_o, exec_pc_ready_i)
+                       fetch_pc_o_ready, fetch_pc_i_valid,
+                       fetch_insn_o_valid, fetch_insn_i_ready,
+                       pred_insn_i_valid, pred_insn_o_ready,
+                       pred_mask_o_valid, pred_mask_i_ready,
+                       exec_insn_i_valid, exec_insn_o_ready,
+                       exec_pc_o_valid, exec_pc_i_ready)
 
         if self.svp64_en:
             self.fetch_predicate_fsm(m,
-                                     pred_insn_valid_i, pred_insn_ready_o,
-                                     pred_mask_valid_o, pred_mask_ready_i)
+                                     pred_insn_i_valid, pred_insn_o_ready,
+                                     pred_mask_o_valid, pred_mask_i_ready)
 
-        self.execute_fsm(m, core, pc_changed, sv_changed,
-                         exec_insn_valid_i, exec_insn_ready_o,
-                         exec_pc_valid_o, exec_pc_ready_i)
+        self.execute_fsm(m, core,
+                         exec_insn_i_valid, exec_insn_o_ready,
+                         exec_pc_o_valid, exec_pc_i_ready)
 
-        # whatever was done above, over-ride it if core reset is held
+        # whatever was done above, over-ride it if core reset is held.
+        # set NIA to pc_at_reset
         with m.If(core_rst):
-            sync += nia.eq(0)
-
-        # this bit doesn't have to be in the FSM: connect up to read
-        # regfiles on demand from DMI
-        self.do_dmi(m, dbg)
-
-        # DEC and TB inc/dec FSM.  copy of DEC is put into CoreState,
-        # (which uses that in PowerDecoder2 to raise 0x900 exception)
-        self.tb_dec_fsm(m, cur_state.dec)
-
-        return m
-
-    def do_dmi(self, m, dbg):
-        """deals with DMI debug requests
-
-        currently only provides read requests for the INT regfile, CR and XER
-        it will later also deal with *writing* to these regfiles.
-        """
-        comb = m.d.comb
-        sync = m.d.sync
-        dmi, d_reg, d_cr, d_xer, = dbg.dmi, dbg.d_gpr, dbg.d_cr, dbg.d_xer
-        intrf = self.core.regs.rf['int']
-
-        with m.If(d_reg.req): # request for regfile access being made
-            # TODO: error-check this
-            # XXX should this be combinatorial?  sync better?
-            if intrf.unary:
-                comb += self.int_r.ren.eq(1<<d_reg.addr)
-            else:
-                comb += self.int_r.addr.eq(d_reg.addr)
-                comb += self.int_r.ren.eq(1)
-        d_reg_delay  = Signal()
-        sync += d_reg_delay.eq(d_reg.req)
-        with m.If(d_reg_delay):
-            # data arrives one clock later
-            comb += d_reg.data.eq(self.int_r.data_o)
-            comb += d_reg.ack.eq(1)
-
-        # sigh same thing for CR debug
-        with m.If(d_cr.req): # request for regfile access being made
-            comb += self.cr_r.ren.eq(0b11111111) # enable all
-        d_cr_delay  = Signal()
-        sync += d_cr_delay.eq(d_cr.req)
-        with m.If(d_cr_delay):
-            # data arrives one clock later
-            comb += d_cr.data.eq(self.cr_r.data_o)
-            comb += d_cr.ack.eq(1)
-
-        # aaand XER...
-        with m.If(d_xer.req): # request for regfile access being made
-            comb += self.xer_r.ren.eq(0b111111) # enable all
-        d_xer_delay  = Signal()
-        sync += d_xer_delay.eq(d_xer.req)
-        with m.If(d_xer_delay):
-            # data arrives one clock later
-            comb += d_xer.data.eq(self.xer_r.data_o)
-            comb += d_xer.ack.eq(1)
-
-    def tb_dec_fsm(self, m, spr_dec):
-        """tb_dec_fsm
-
-        this is a FSM for updating either dec or tb.  it runs alternately
-        DEC, TB, DEC, TB.  note that SPR pipeline could have written a new
-        value to DEC, however the regfile has "passthrough" on it so this
-        *should* be ok.
-
-        see v3.0B p1097-1099 for Timeer Resource and p1065 and p1076
-        """
-
-        comb, sync = m.d.comb, m.d.sync
-        fast_rf = self.core.regs.rf['fast']
-        fast_r_dectb = fast_rf.r_ports['issue'] # DEC/TB
-        fast_w_dectb = fast_rf.w_ports['issue'] # DEC/TB
-
-        with m.FSM() as fsm:
-
-            # initiates read of current DEC
-            with m.State("DEC_READ"):
-                comb += fast_r_dectb.addr.eq(FastRegs.DEC)
-                comb += fast_r_dectb.ren.eq(1)
-                m.next = "DEC_WRITE"
-
-            # waits for DEC read to arrive (1 cycle), updates with new value
-            with m.State("DEC_WRITE"):
-                new_dec = Signal(64)
-                # TODO: MSR.LPCR 32-bit decrement mode
-                comb += new_dec.eq(fast_r_dectb.data_o - 1)
-                comb += fast_w_dectb.addr.eq(FastRegs.DEC)
-                comb += fast_w_dectb.wen.eq(1)
-                comb += fast_w_dectb.data_i.eq(new_dec)
-                sync += spr_dec.eq(new_dec) # copy into cur_state for decoder
-                m.next = "TB_READ"
-
-            # initiates read of current TB
-            with m.State("TB_READ"):
-                comb += fast_r_dectb.addr.eq(FastRegs.TB)
-                comb += fast_r_dectb.ren.eq(1)
-                m.next = "TB_WRITE"
-
-            # waits for read TB to arrive, initiates write of current TB
-            with m.State("TB_WRITE"):
-                new_tb = Signal(64)
-                comb += new_tb.eq(fast_r_dectb.data_o + 1)
-                comb += fast_w_dectb.addr.eq(FastRegs.TB)
-                comb += fast_w_dectb.wen.eq(1)
-                comb += fast_w_dectb.data_i.eq(new_tb)
-                m.next = "DEC_READ"
+            sync += nia.eq(self.core.pc_at_reset)
 
         return m
 
-    def __iter__(self):
-        yield from self.pc_i.ports()
-        yield self.pc_o
-        yield self.memerr_o
-        yield from self.core.ports()
-        yield from self.imem.ports()
-        yield self.core_bigendian_i
-        yield self.busy_o
-
-    def ports(self):
-        return list(self)
-
-    def external_ports(self):
-        ports = self.pc_i.ports()
-        ports += [self.pc_o, self.memerr_o, self.core_bigendian_i, self.busy_o,
-                ]
-
-        if self.jtag_en:
-            ports += list(self.jtag.external_ports())
-        else:
-            # don't add DMI if JTAG is enabled
-            ports += list(self.dbg.dmi.ports())
-
-        ports += list(self.imem.ibus.fields.values())
-        ports += list(self.core.l0.cmpi.wb_bus().fields.values())
-
-        if self.sram4x4k:
-            for sram in self.sram4k:
-                ports += list(sram.bus.fields.values())
-
-        if self.xics:
-            ports += list(self.xics_icp.bus.fields.values())
-            ports += list(self.xics_ics.bus.fields.values())
-            ports.append(self.int_level_i)
-
-        if self.gpio:
-            ports += list(self.simple_gpio.bus.fields.values())
-            ports.append(self.gpio_o)
-
-        return ports
-
-    def ports(self):
-        return list(self)
-
 
 class TestIssuer(Elaboratable):
     def __init__(self, pspec):
         self.ti = TestIssuerInternal(pspec)
         self.pll = DummyPLL(instance=True)
 
+        self.dbg_rst_i = Signal(reset_less=True)
+
         # PLL direct clock or not
         self.pll_en = hasattr(pspec, "use_pll") and pspec.use_pll
         if self.pll_en:
             self.pll_test_o = Signal(reset_less=True)
             self.pll_vco_o = Signal(reset_less=True)
             self.clk_sel_i = Signal(2, reset_less=True)
-            self.ref_clk =  ClockSignal() # can't rename it but that's ok
+            self.ref_clk = ClockSignal()  # can't rename it but that's ok
             self.pllclk_clk = ClockSignal("pllclk")
 
     def elaborate(self, platform):
@@ -1299,29 +1720,30 @@ class TestIssuer(Elaboratable):
         # internal clock is set to selector clock-out.  has the side-effect of
         # running TestIssuer at this speed (see DomainRenamer("intclk") above)
         # debug clock runs at coresync internal clock
-        cd_coresync = ClockDomain("coresync")
-        #m.domains += cd_coresync
         if self.ti.dbg_domain != 'sync':
             cd_dbgsync = ClockDomain("dbgsync")
-            #m.domains += cd_dbgsync
-        intclk = ClockSignal("coresync")
+        intclk = ClockSignal(self.ti.core_domain)
         dbgclk = ClockSignal(self.ti.dbg_domain)
         # XXX BYPASS PLL XXX
         # XXX BYPASS PLL XXX
         # XXX BYPASS PLL XXX
         if self.pll_en:
             comb += intclk.eq(self.ref_clk)
+            assert self.ti.core_domain != 'sync', \
+                "cannot set core_domain to sync and use pll at the same time"
         else:
-            comb += intclk.eq(ClockSignal())
+            if self.ti.core_domain != 'sync':
+                comb += intclk.eq(ClockSignal())
         if self.ti.dbg_domain != 'sync':
             dbgclk = ClockSignal(self.ti.dbg_domain)
             comb += dbgclk.eq(intclk)
+        comb += self.ti.dbg_rst_i.eq(self.dbg_rst_i)
 
         return m
 
     def ports(self):
         return list(self.ti.ports()) + list(self.pll.ports()) + \
-               [ClockSignal(), ResetSignal()]
+            [ClockSignal(), ResetSignal()]
 
     def external_ports(self):
         ports = self.ti.external_ports()
@@ -1343,10 +1765,10 @@ if __name__ == '__main__':
              'div': 1,
              'mul': 1,
              'shiftrot': 1
-            }
+             }
     pspec = TestMemPspec(ldst_ifacetype='bare_wb',
                          imem_ifacetype='bare_wb',
-                         addr_wid=48,
+                         addr_wid=64,
                          mask_wid=8,
                          reg_wid=64,
                          units=units)
index 8c0f8e1f5b8cc6a1a3d3e4f5947350e880c428e5..d56c140d39791dabb42b01c76380746905db9d4a 100644 (file)
@@ -4,8 +4,9 @@
 import argparse
 from nmigen.cli import verilog
 
+from openpower.consts import MSR
 from soc.config.test.test_loadstore import TestMemPspec
-from soc.simple.issuer import TestIssuer
+from soc.simple.issuer import TestIssuer, TestIssuerInternal
 
 
 if __name__ == '__main__':
@@ -58,9 +59,69 @@ if __name__ == '__main__':
     parser.add_argument("--disable-svp64", dest='svp64', action="store_false",
                         help="disable SVP64",
                         default=False)
+    parser.add_argument("--pc-reset", default="0",
+                        help="Set PC at reset (default 0)")
+    parser.add_argument("--xlen", default=64, type=int,
+                        help="Set register width [default 64]")
+    # create a module that's directly compatible as a drop-in replacement
+    # in microwatt.v
+    parser.add_argument("--microwatt-compat", dest='mwcompat',
+                        action="store_true",
+                        help="generate microwatt-compatible interface",
+                        default=False)
+    parser.add_argument("--microwatt-compat-svp64", dest='mwcompatsvp64',
+                        action="store_true",
+                        help="generate microwatt-compatible interface + SVP64",
+                        default=False)
+    parser.add_argument("--old-microwatt-compat", dest='old_mwcompat',
+                        action="store_true",
+                        help="generate old microwatt-compatible interface",
+                        default=True)
+    parser.add_argument("--microwatt-debug", dest='mwdebug',
+                        action="store_true",
+                        help="generate old microwatt-compatible interface",
+                        default=False)
+    # create a module with Fabric compatibility
+    parser.add_argument("--fabric-compat", dest='fabriccompat',
+                        action="store_true",
+                        help="generate Fabric-compatible interface",
+                        default=False)
+    # small cache option
+    parser.add_argument("--small-cache", dest='smallcache',
+                        action="store_true",
+                        help="generate small caches",
+                        default=False)
+
+    # allow overlaps in TestIssuer
+    parser.add_argument("--allow-overlap", dest='allow_overlap',
+                        action="store_true",
+                        help="allow overlap in TestIssuer",
+                        default=False)
 
     args = parser.parse_args()
 
+    # convenience: set some defaults
+    if args.mwcompat:
+        args.pll = False
+        args.debug = 'dmi'
+        args.core = True
+        args.xics = False
+        args.gpio = False
+        args.sram4x4kblock = False
+        args.svp64 = False
+
+    # Yes, this is duplicating mwcompat, but for the sake of simplicity
+    # adding support for svp64 like this
+    if args.mwcompatsvp64:
+        args.pll = False
+        args.debug = 'dmi'
+        args.core = True
+        args.xics = False
+        args.gpio = False
+        args.sram4x4kblock = False
+        args.svp64 = True
+        args.mwcompat = True # Ensures TestMemPspec gets the expected value
+
     print(args)
 
     units = {'alu': 1,
@@ -77,14 +138,26 @@ if __name__ == '__main__':
     # decide which memory type to configure
     if args.mmu:
         ldst_ifacetype = 'mmu_cache_wb'
+        imem_ifacetype = 'mmu_cache_wb'
     else:
         ldst_ifacetype = 'bare_wb'
-    imem_ifacetype = 'bare_wb'
+        imem_ifacetype = 'bare_wb'
+
+    # default MSR
+    msr_reset = (1<<MSR.LE) | (1<<MSR.SF) # 64-bit, little-endian default
+
+    # default PC
+    if args.pc_reset.startswith("0x"):
+        pc_reset = int(args.pc_reset, 16)
+    else:
+        pc_reset = int(args.pc_reset)
 
     pspec = TestMemPspec(ldst_ifacetype=ldst_ifacetype,
                          imem_ifacetype=imem_ifacetype,
-                         addr_wid=48,
+                         addr_wid=64,
                          mask_wid=8,
+                         # pipeline and integer register file width
+                         XLEN=args.xlen,
                          # must leave at 64
                          reg_wid=64,
                          # set to 32 for instruction-memory width=32
@@ -99,10 +172,20 @@ if __name__ == '__main__':
                          sram4x4kblock=args.enable_sram4x4kblock, # add SRAMs
                          debug=args.debug,      # set to jtag or dmi
                          svp64=args.svp64,      # enable SVP64
-                         mmu=args.mmu,          # enable MMU
-                         units=units)
+                         microwatt_mmu=args.mmu,         # enable MMU
+                         microwatt_compat=args.mwcompat, # microwatt compatible
+                         microwatt_old=args.old_mwcompat, # old microwatt api
+                         microwatt_debug=args.mwdebug, # microwatt debug signals
+                         fabric_compat=args.fabriccompat, # fabric compatible (overlaps with microwatt compat)
+                         small_cache=args.smallcache, # small cache/TLB sizes
+                         allow_overlap=args.allow_overlap, # allow overlap
+                         units=units,
+                         msr_reset=msr_reset,
+                         pc_reset=pc_reset)
+    #if args.mwcompat:
+    #    pspec.core_domain = 'sync'
 
-    print("mmu", pspec.__dict__["mmu"])
+    print("mmu", pspec.__dict__["microwatt_mmu"])
     print("nocore", pspec.__dict__["nocore"])
     print("regreduce", pspec.__dict__["regreduce"])
     print("gpio", pspec.__dict__["gpio"])
@@ -111,9 +194,22 @@ if __name__ == '__main__':
     print("use_pll", pspec.__dict__["use_pll"])
     print("debug", pspec.__dict__["debug"])
     print("SVP64", pspec.__dict__["svp64"])
+    print("XLEN", pspec.__dict__["XLEN"])
+    print("MSR@reset", hex(pspec.__dict__["msr_reset"]))
+    print("PC@reset", hex(pspec.__dict__["pc_reset"]))
+    print("Microwatt compatibility", pspec.__dict__["microwatt_compat"])
+    print("Old Microwatt compatibility", pspec.__dict__["microwatt_old"])
+    print("Microwatt debug", pspec.__dict__["microwatt_debug"])
+    print("Fabric compatibility", pspec.__dict__["fabric_compat"])
+    print("Small Cache/TLB", pspec.__dict__["small_cache"])
 
-    dut = TestIssuer(pspec)
+    if args.mwcompat:
+        dut = TestIssuerInternal(pspec)
+        name = "external_core_top"
+    else:
+        dut = TestIssuer(pspec)
+        name = "test_issuer"
 
-    vl = verilog.convert(dut, ports=dut.external_ports(), name="test_issuer")
+    vl = verilog.convert(dut, ports=dut.external_ports(), name=name)
     with open(args.output_filename, "w") as f:
         f.write(vl)
index 2f588f2600e71fc23c3f5b39e335cd259390a6fb..5d6bebc58d82c643ea42b6f882fd8193b199631b 100644 (file)
@@ -3,17 +3,24 @@
 related bugs:
 
  * https://bugs.libre-soc.org/show_bug.cgi?id=363
+ * https://bugs.libre-soc.org/show_bug.cgi?id=686
 """
+
 from nmigen import Module, Signal, Cat
 from nmigen.back.pysim import Simulator, Delay, Settle
 from nmutil.formaltest import FHDLTestCase
 from nmigen.cli import rtlil
 import unittest
+from openpower.test.state import (SimState, teststate_check_regs,
+                                  teststate_check_mem)
+from soc.simple.test.teststate import HDLState
 from openpower.decoder.isa.caller import special_sprs
 from openpower.decoder.power_decoder import create_pdecode
 from openpower.decoder.power_decoder2 import PowerDecode2
 from openpower.decoder.selectable_int import SelectableInt
 from openpower.decoder.isa.all import ISA
+from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
+from openpower.state import CoreState
 
 # note that using SPRreduced has to be done to match the
 # PowerDecoder2 SPR map
@@ -21,11 +28,12 @@ from openpower.decoder.power_enums import SPRreduced as SPR
 from openpower.decoder.power_enums import spr_dict, Function, XER_bits
 from soc.config.test.test_loadstore import TestMemPspec
 from openpower.endian import bigendian
+from soc.regfile.regfiles import StateRegs
 
 from soc.simple.core import NonProductionCore
 from soc.experiment.compalu_multi import find_ok  # hack
 
-from soc.fu.compunits.test.test_compunit import (setup_test_memory,
+from soc.fu.compunits.test.test_compunit import (setup_tst_memory,
                                                  check_sim_memory)
 
 # test with ALU data and Logical data
@@ -35,19 +43,48 @@ from soc.fu.shift_rot.test.test_pipe_caller import ShiftRotTestCase
 from soc.fu.cr.test.test_pipe_caller import CRTestCase
 from soc.fu.branch.test.test_pipe_caller import BranchTestCase
 from soc.fu.ldst.test.test_pipe_caller import LDSTTestCase
-from openpower.util import spr_to_fast_reg
+from openpower.test.general.overlap_hazards import (HazardTestCase,
+                                                    RandomHazardTestCase)
+from openpower.util import spr_to_fast_reg, spr_to_state_reg
+
+from openpower.consts import StateRegsEnum
 
 # list of SPRs that are controlled and managed by the MMU
-mmu_sprs = ["PRTBL", "DSISR", "DAR", "PIDR"]
+mmu_sprs = ["PRTBL", "PIDR"]
+ldst_sprs = ["DAR", "DSISR"]
+
 
-def set_mmu_spr(name, i, val, core): #important keep pep8 formatting
-        fsm = core.fus.get_fu("mmu0").alu
-        yield fsm.mmu.l_in.mtspr.eq(1)
-        yield fsm.mmu.l_in.sprn.eq(i)
-        yield fsm.mmu.l_in.rs.eq(val)
+def set_mmu_spr(name, i, val, core):  # important keep pep8 formatting
+    fsm = core.fus.get_fu("mmu0").alu
+    yield fsm.mmu.l_in.mtspr.eq(1)
+    yield fsm.mmu.l_in.sprn.eq(i)
+    yield fsm.mmu.l_in.rs.eq(val)
+    yield
+    yield fsm.mmu.l_in.mtspr.eq(0)
+    while True:
+        done = yield fsm.mmu.l_out.done
+        if done:
+            break
         yield
-        yield fsm.mmu.l_in.mtspr.eq(0)
-        print("mmu_spr was updated")
+    yield
+    print("mmu_spr %s %d was updated %x" % (name, i, val))
+
+
+def set_ldst_spr(name, i, val, core):  # important keep pep8 formatting
+    ldst = core.fus.get_fu("mmu0").alu.ldst # awkward to get at but it works
+    yield ldst.sprval_in.eq(val)
+    yield ldst.mmu_set_spr.eq(1)
+    if name == 'DAR':
+        yield ldst.mmu_set_dar.eq(1)
+        yield
+        yield ldst.mmu_set_dar.eq(0)
+    else:
+        yield ldst.mmu_set_dsisr.eq(1)
+        yield
+        yield ldst.mmu_set_dsisr.eq(0)
+    yield ldst.mmu_set_spr.eq(0)
+    print("ldst_spr %s %d was updated %x" % (name, i, val))
+
 
 def setup_regs(pdecode2, core, test):
 
@@ -60,6 +97,10 @@ def setup_regs(pdecode2, core, test):
             yield intregs.memory._array[i].eq(test.regs[i])
     yield Settle()
 
+    # set up MSR in STATE regfile, "direct" write (bypass rd/write ports)
+    stateregs = core.regs.state
+    yield stateregs.regs[StateRegsEnum.MSR].reg.eq(test.msr)
+
     # set up CR regfile, "direct" write across all CRs
     cr = test.cr
     crregs = core.regs.cr
@@ -67,7 +108,7 @@ def setup_regs(pdecode2, core, test):
     print("setup cr reg", hex(cr))
     for i in range(8):
         #j = 7-i
-        cri = (cr >> (i*4)) & 0xf
+        cri = (cr >> (i * 4)) & 0xf
         #cri = int('{:04b}'.format(cri)[::-1], 2)
         print("setup cr reg", hex(cri), i,
               crregs.regs[i].reg.shape())
@@ -102,6 +143,7 @@ def setup_regs(pdecode2, core, test):
     # setting both fast and slow SPRs from test data
 
     fregs = core.regs.fast
+    stateregs = core.regs.state
     sregs = core.regs.spr
     for sprname, val in test.sprs.items():
         if isinstance(val, SelectableInt):
@@ -110,17 +152,31 @@ def setup_regs(pdecode2, core, test):
             sprname = spr_dict[sprname].SPR
         if sprname == 'XER':
             continue
+        print ('set spr %s val %x' % (sprname, val))
+
         fast = spr_to_fast_reg(sprname)
-        if fast is None:
+        state = spr_to_state_reg(sprname)
+
+        if fast is None and state is None:
             # match behaviour of SPRMap in power_decoder2.py
             for i, x in enumerate(SPR):
                 if sprname == x.name:
-                    print("setting slow SPR %d (%s) to %x" %
-                          (i, sprname, val))
-                    if not sprname in mmu_sprs:
-                        yield sregs.memory._array[i].eq(val)
+                    print("setting slow SPR %d (%s/%d) to %x" %
+                          (i, sprname, x.value, val))
+                    if sprname in mmu_sprs:
+                        yield from set_mmu_spr(sprname, x.value, val, core)
+                    elif sprname in ldst_sprs:
+                        yield from set_ldst_spr(sprname, x.value, val, core)
                     else:
-                        yield from set_mmu_spr(sprname, i, val, core)
+                        yield sregs.memory._array[i].eq(val)
+        elif state is not None:
+            print("setting state reg %d (%s) to %x" %
+                  (state, sprname, val))
+            if stateregs.unary:
+                rval = stateregs.regs[state].reg
+            else:
+                rval = stateregs.memory._array[state]
+            yield rval.eq(val)
         else:
             print("setting fast reg %d (%s) to %x" %
                   (fast, sprname, val))
@@ -145,61 +201,15 @@ def setup_regs(pdecode2, core, test):
 
 
 def check_regs(dut, sim, core, test, code):
-    # int regs
-    intregs = []
-    for i in range(32):
-        if core.regs.int.unary:
-            rval = yield core.regs.int.regs[i].reg
-        else:
-            rval = yield core.regs.int.memory._array[i]
-        intregs.append(rval)
-    print("int regs", list(map(hex, intregs)))
-    for i in range(32):
-        simregval = sim.gpr[i].asint()
-        dut.assertEqual(simregval, intregs[i],
-                        "int reg %d not equal %s. got %x expected %x" % \
-                            (i, repr(code), simregval, intregs[i]))
-
-    # CRs
-    crregs = []
-    for i in range(8):
-        rval = yield core.regs.cr.regs[i].reg
-        crregs.append(rval)
-    print("cr regs", list(map(hex, crregs)))
-    for i in range(8):
-        rval = crregs[i]
-        cri = sim.crl[7-i].get_range().value
-        print("cr reg", i, hex(cri), i, hex(rval))
-        # XXX https://bugs.libre-soc.org/show_bug.cgi?id=363
-        dut.assertEqual(cri, rval,
-                        "cr reg %d not equal %s" % (i, repr(code)))
-
-    # XER
-    xregs = core.regs.xer
-    so = yield xregs.regs[xregs.SO].reg
-    ov = yield xregs.regs[xregs.OV].reg
-    ca = yield xregs.regs[xregs.CA].reg
-
-    print("sim SO", sim.spr['XER'][XER_bits['SO']])
-    e_so = sim.spr['XER'][XER_bits['SO']].value
-    e_ov = sim.spr['XER'][XER_bits['OV']].value
-    e_ov32 = sim.spr['XER'][XER_bits['OV32']].value
-    e_ca = sim.spr['XER'][XER_bits['CA']].value
-    e_ca32 = sim.spr['XER'][XER_bits['CA32']].value
-
-    e_ov = e_ov | (e_ov32 << 1)
-    e_ca = e_ca | (e_ca32 << 1)
+    # create the two states and compare
+    testdic = {'sim': sim, 'hdl': core}
+    yield from teststate_check_regs(dut, testdic, test, code)
 
-    print("after: so/ov-32/ca-32", so, bin(ov), bin(ca))
-    dut.assertEqual(e_so, so, "so mismatch %s" % (repr(code)))
-    dut.assertEqual(e_ov, ov, "ov mismatch %s" % (repr(code)))
-    dut.assertEqual(e_ca, ca, "ca mismatch %s" % (repr(code)))
 
-    # Check the PC as well
-    state = core.regs.state
-    pc = yield state.r_ports['cia'].data_o
-    e_pc = sim.pc.CIA.value
-    dut.assertEqual(e_pc, pc)
+def check_mem(dut, sim, core, test, code):
+    # create the two states and compare mem
+    testdic = {'sim': sim, 'hdl': core}
+    yield from teststate_check_mem(dut, testdic, test, code)
 
 
 def wait_for_busy_hi(cu):
@@ -222,8 +232,8 @@ def set_issue(core, dec2, sim):
 
 def wait_for_busy_clear(cu):
     while True:
-        busy_o = yield cu.busy_o
-        terminate_o = yield cu.core_terminate_o
+        busy_o = yield cu.o.busy_o
+        terminate_o = yield cu.o.core_terminate_o
         if not busy_o:
             print("busy/terminate:", busy_o, terminate_o)
             break
@@ -240,82 +250,136 @@ class TestRunner(FHDLTestCase):
         m = Module()
         comb = m.d.comb
         instruction = Signal(32)
-        ivalid_i = Signal()
+
+        units = {'alu': 3, 'cr': 1, 'branch': 1, 'trap': 1,
+                 'spr': 1,
+                 'logical': 1,
+                 'mul': 3,
+                 'div': 1, 'shiftrot': 1}
 
         pspec = TestMemPspec(ldst_ifacetype='testpi',
                              imem_ifacetype='',
                              addr_wid=48,
                              mask_wid=8,
+                             units=units,
+                             allow_overlap=True,
                              reg_wid=64)
 
+        cur_state = CoreState("cur") # current state (MSR/PC/SVSTATE)
+        pdecode2 = PowerDecode2(None, state=cur_state,
+                                     #opkls=IssuerDecode2ToOperand,
+                                     svp64_en=True, # self.svp64_en,
+                                     regreduce_en=False, #self.regreduce_en
+                                    )
+
         m.submodules.core = core = NonProductionCore(pspec)
-        pdecode2 = core.pdecode2
+        m.submodules.pdecode2 = pdecode2
+        core.pdecode2 = pdecode2
         l0 = core.l0
 
-        comb += core.raw_opcode_i.eq(instruction)
-        comb += core.ivalid_i.eq(ivalid_i)
+        comb += pdecode2.dec.raw_opcode_in.eq(instruction)
+        comb += pdecode2.dec.bigendian.eq(bigendian)  # little / big?
+        comb += core.i.e.eq(pdecode2.e)
+        comb += core.i.state.eq(cur_state)
+        comb += core.i.raw_insn_i.eq(instruction)
+        comb += core.i.bigendian_i.eq(bigendian)
+
+        # set the PC StateRegs read port to always send back the PC
+        stateregs = core.regs.state
+        pc_regnum = StateRegs.PC
+        comb += stateregs.r_ports['cia'].ren.eq(1<<pc_regnum)
 
         # temporary hack: says "go" immediately for both address gen and ST
         ldst = core.fus.fus['ldst0']
-        m.d.comb += ldst.ad.go.eq(ldst.ad.rel)  # link addr-go direct to rel
-        m.d.comb += ldst.st.go.eq(ldst.st.rel)  # link store-go direct to rel
+        m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o)  # link addr-go to rel
+        m.d.comb += ldst.st.go_i.eq(ldst.st.rel_o)  # link store-go to rel
 
         # nmigen Simulation
         sim = Simulator(m)
         sim.add_clock(1e-6)
 
         def process():
-            yield core.issue_i.eq(0)
             yield
 
             for test in self.test_data:
                 print(test.name)
                 program = test.program
-                self.subTest(test.name)
-                sim = ISA(pdecode2, test.regs, test.sprs, test.cr, test.mem,
-                          test.msr,
-                          bigendian=bigendian)
-                gen = program.generate_instructions()
-                instructions = list(zip(gen, program.assembly.splitlines()))
-
-                yield from setup_test_memory(l0, sim)
-                yield from setup_regs(core, test)
-
-                index = sim.pc.CIA.value//4
-                while index < len(instructions):
-                    ins, code = instructions[index]
-
-                    print("instruction: 0x{:X}".format(ins & 0xffffffff))
-                    print(code)
-
-                    # ask the decoder to decode this binary data (endian'd)
-                    yield core.bigendian_i.eq(bigendian)  # little / big?
-                    yield instruction.eq(ins)          # raw binary instr.
-                    yield ivalid_i.eq(1)
-                    yield Settle()
-                    # fn_unit = yield pdecode2.e.fn_unit
-                    #fuval = self.funit.value
-                    #self.assertEqual(fn_unit & fuval, fuval)
-
-                    # set operand and get inputs
-                    yield from set_issue(core, pdecode2, sim)
-                    yield Settle()
-
-                    yield from wait_for_busy_clear(core)
-                    yield ivalid_i.eq(0)
-                    yield
-
-                    print("sim", code)
-                    # call simulated operation
-                    opname = code.split(' ')[0]
-                    yield from sim.call(opname)
-                    index = sim.pc.CIA.value//4
-
-                    # register check
-                    yield from check_regs(self, sim, core, test, code)
-
-                    # Memory check
-                    yield from check_sim_memory(self, l0, sim, code)
+                with self.subTest(test.name):
+                    sim = ISA(pdecode2, test.regs, test.sprs, test.cr,
+                              test.mem,
+                              test.msr,
+                              bigendian=bigendian)
+                    gen = program.generate_instructions()
+                    instructions = list(zip(gen, program.assembly.splitlines()))
+
+                    yield from setup_tst_memory(l0, test.mem)
+                    yield from setup_regs(pdecode2, core, test)
+
+                    index = sim.pc.CIA.value // 4
+                    while index < len(instructions):
+                        ins, code = instructions[index]
+
+                        print("instruction: 0x{:X}".format(ins & 0xffffffff))
+                        print(code)
+
+                        # ask the decoder to decode this binary data (endian'd)
+                        yield instruction.eq(ins)          # raw binary instr.
+                        yield Settle()
+
+                        print("sim", code)
+                        # call simulated operation
+                        opname = code.split(' ')[0]
+                        yield from sim.call(opname)
+                        pc = sim.pc.CIA.value
+                        nia = sim.pc.NIA.value
+                        index = pc // 4
+
+                        # set the PC to the same simulated value
+                        # (core is not able to do this itself, except
+                        # for branch / TRAP)
+                        print ("after call, pc nia", pc, nia)
+                        yield stateregs.regs[pc_regnum].reg.eq(pc)
+                        yield Settle()
+
+                        yield core.p.i_valid.eq(1)
+                        yield
+                        o_ready = yield core.p.o_ready
+                        while True:
+                            if o_ready:
+                                break
+                            yield
+                            o_ready = yield core.p.o_ready
+                        yield core.p.i_valid.eq(0)
+
+                        # set operand and get inputs
+                        yield from wait_for_busy_clear(core)
+
+                        # synchronised (non-overlap) is fine to check
+                        if not core.allow_overlap:
+                            # register check
+                            yield from check_regs(self, sim, core, test, code)
+
+                            # Memory check
+                            yield from check_mem(self, sim, core, test, code)
+
+                    # non-overlap mode is only fine to check right at the end
+                    if core.allow_overlap:
+                        # wait until all settled
+                        # XXX really this should be in DMI, which should in turn
+                        # use issuer.any_busy to not send back "stopped" signal
+                        while (yield core.o.any_busy_o):
+                            yield
+                        yield Settle()
+
+                        # register check
+                        yield from check_regs(self, sim, core, test, code)
+
+                        # Memory check
+                        yield from check_mem(self, sim, core, test, code)
+
+            # give a couple extra clock cycles for gtkwave display to be happy
+            yield
+            yield
 
         sim.add_sync_process(process)
         with sim.write_vcd("core_simulator.vcd", "core_simulator.gtkw",
@@ -326,12 +390,14 @@ class TestRunner(FHDLTestCase):
 if __name__ == "__main__":
     unittest.main(exit=False)
     suite = unittest.TestSuite()
-    suite.addTest(TestRunner(LDSTTestCase().test_data))
-    suite.addTest(TestRunner(CRTestCase().test_data))
-    suite.addTest(TestRunner(ShiftRotTestCase().test_data))
-    suite.addTest(TestRunner(LogicalTestCase().test_data))
-    suite.addTest(TestRunner(ALUTestCase().test_data))
-    suite.addTest(TestRunner(BranchTestCase().test_data))
+    suite.addTest(TestRunner(HazardTestCase().test_data))
+    suite.addTest(TestRunner(RandomHazardTestCase().test_data))
+    #suite.addTest(TestRunner(LDSTTestCase().test_data))
+    #suite.addTest(TestRunner(CRTestCase().test_data))
+    #suite.addTest(TestRunner(ShiftRotTestCase().test_data))
+    #suite.addTest(TestRunner(LogicalTestCase().test_data))
+    #suite.addTest(TestRunner(ALUTestCase().test_data))
+    #suite.addTest(TestRunner(BranchTestCase().test_data))
 
     runner = unittest.TextTestRunner()
     runner.run(suite)
index ebc529fcada9f0a3b60d2fee463890f477955f21..b86e162efd2e2cfdbbf2763fbabaefc7e0f999d0 100644 (file)
@@ -20,39 +20,79 @@ from soc.simple.test.test_runner import TestRunner
 
 # test with ALU data and Logical data
 from openpower.test.alu.alu_cases import ALUTestCase
+from openpower.test.general.overlap_hazards import HazardTestCase
 from openpower.test.div.div_cases import DivTestCases
+from openpower.test.mul.mul_cases import MulTestCases2Arg
 from openpower.test.logical.logical_cases import LogicalTestCase
 from openpower.test.shift_rot.shift_rot_cases import ShiftRotTestCase
+from openpower.test.shift_rot.shift_rot_cases2 import ShiftRotTestCase2
 from openpower.test.cr.cr_cases import CRTestCase
 from openpower.test.branch.branch_cases import BranchTestCase
-from soc.fu.spr.test.test_pipe_caller import SPRTestCase
+from soc.fu.spr.test.test_pipe_caller import SPRTestCase
 from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.trap.trap_cases import TrapTestCase
 from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
-from openpower.simulator.test_helloworld_sim import HelloTestCases
+from openpower.simulator.test_helloworld_sim import HelloTestCases
 
 
 if __name__ == "__main__":
     svp64 = True
-    if len(sys.argv) == 2:
-        if sys.argv[1] == 'nosvp64':
-            svp64 = False
-        sys.argv.pop()
+    if len(sys.argv) > 1 and sys.argv[1] == 'nosvp64':
+        svp64 = False
+        del sys.argv[1]
 
-    print ("SVP64 test mode enabled", svp64)
+    # detect overlap case
+    allow_overlap = False
+    if len(sys.argv) >= 2 and sys.argv[1] == '--allow-overlap':
+        allow_overlap = True
+        del sys.argv[1]
+
+    # use in-order issuer, instead of the original FSM based one
+    inorder = False
+    if len(sys.argv) >= 2 and sys.argv[1] == '--inorder':
+        inorder = True
+        del sys.argv[1]
+
+    # allow list of testing to be selected by command-line
+    testing = []
+    for i in reversed(range(1, len(sys.argv))):
+        if not sys.argv[i].startswith('-'):
+            testing.append(sys.argv.pop(i))
+
+    if not testing:
+        testing = ['general', 'ldst', 'cr', 'shiftrot', 'shiftrot2',
+                   'logical', 'alu',
+                   'branch', 'div', 'mul', 'hazard']
+
+    print("SVP64 test mode enabled", svp64, "overlap",
+          allow_overlap, "in-order", inorder, "testing", testing)
 
     unittest.main(exit=False)
     suite = unittest.TestSuite()
-    # suite.addTest(TestRunner(HelloTestCases.test_data, svp64=svp64))
-    suite.addTest(TestRunner(DivTestCases().test_data, svp64=svp64))
-    # suite.addTest(TestRunner(AttnTestCase.test_data, svp64=svp64))
-    suite.addTest(TestRunner(GeneralTestCases.test_data, svp64=svp64))
-    suite.addTest(TestRunner(LDSTTestCase().test_data, svp64=svp64))
-    suite.addTest(TestRunner(CRTestCase().test_data, svp64=svp64))
-    suite.addTest(TestRunner(ShiftRotTestCase().test_data, svp64=svp64))
-    suite.addTest(TestRunner(LogicalTestCase().test_data, svp64=svp64))
-    suite.addTest(TestRunner(ALUTestCase().test_data, svp64=svp64))
-    suite.addTest(TestRunner(BranchTestCase().test_data, svp64=svp64))
-    # suite.addTest(TestRunner(SPRTestCase.test_data, svp64=svp64))
+
+    # dictionary  of data for tests
+    tests = {'hello': HelloTestCases.test_data,
+             'div': DivTestCases().test_data,
+             'mul': MulTestCases2Arg().test_data,
+             'attn': AttnTestCase.test_data,
+             'general': GeneralTestCases.test_data,
+             'ldst': LDSTTestCase().test_data,
+             'cr': CRTestCase().test_data,
+             'shiftrot': ShiftRotTestCase().test_data,
+             'shiftrot2': ShiftRotTestCase2().test_data,
+             'logical': LogicalTestCase().test_data,
+             'hazard': HazardTestCase().test_data,
+             'alu': ALUTestCase().test_data,
+             'branch': BranchTestCase().test_data,
+             'trap': TrapTestCase().test_data,
+             'spr': SPRTestCase().test_data
+             }
+
+    # walk through all tests, those requested get added
+    for tname, data in tests.items():
+        if tname in testing:
+            suite.addTest(TestRunner(data, svp64=svp64, inorder=inorder,
+                                     allow_overlap=allow_overlap))
 
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/src/soc/simple/test/test_issuer_dcache.py b/src/soc/simple/test/test_issuer_dcache.py
new file mode 100644 (file)
index 0000000..593f5a3
--- /dev/null
@@ -0,0 +1,53 @@
+"""dcbz test case
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=51
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator.  it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+##########
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+class DCBZTestCase(TestAccumulatorBase):
+
+    def case_1_dcbz(self):
+        lst = ["dcbz 1, 2"]
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x0004
+        initial_regs[2] = 0x0008
+        initial_mem = {0x0000: (0x5432123412345678, 8),
+                       0x0008: (0xabcdef0187654321, 8),
+                       0x0020: (0x1828384822324252, 8),
+                        }
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem)
+##########
+
+
+if __name__ == "__main__":
+    svp64 = False
+
+    unittest.main(exit=False)
+    suite = unittest.TestSuite()
+
+    # add other test cases later
+    suite.addTest(TestRunner(DCBZTestCase().test_data, svp64=svp64,
+                              microwatt_mmu=True))
+
+    runner = unittest.TextTestRunner()
+    runner.run(suite)
diff --git a/src/soc/simple/test/test_issuer_linux_5_7.py b/src/soc/simple/test/test_issuer_linux_5_7.py
new file mode 100644 (file)
index 0000000..00a0949
--- /dev/null
@@ -0,0 +1,126 @@
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator.  it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+from soc.experiment.test import pagetables
+
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+    def case_first_vm_enabled(self):
+        lst = [
+               "std 6,0(2)",
+              ]
+
+        # set up regs
+        initial_regs = [0] * 32
+        initial_regs[2] = 0xc0000000005fc190
+        initial_regs[6] = 0x0101
+
+        # memory same as microwatt test
+        initial_mem = pagetables.microwatt_linux_5_7_boot
+
+        # set virtual and non-privileged
+        # msr: 8000000000000011
+        initial_msr = 0 << MSR.PR # must set "problem" state
+        initial_msr |= 1 << MSR.LE # little-endian
+        initial_msr |= 1 << MSR.SF # 64-bit
+        initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+        # set PRTBL to 0xe000000
+        initial_sprs = {720: 0xe000000, # PRTBL
+                        48: 1       # PIDR
+                        } 
+
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem,
+                             initial_sprs=initial_sprs,
+                             initial_msr=initial_msr)
+
+
+    def case_first_vm_enabled_2(self):
+        lst = [
+               "std 6,0(2)",
+              ]
+
+        # set up regs
+        initial_regs = [0] * 32
+        initial_regs[2] = 0xc000000000598000
+        initial_regs[6] = 0x0101
+
+        # memory same as microwatt test
+        initial_mem = pagetables.microwatt_linux_5_7_boot
+
+        # set virtual and non-privileged
+        # msr: 8000000000000011
+        initial_msr = 0 << MSR.PR # must set "problem" state
+        initial_msr |= 1 << MSR.LE # little-endian
+        initial_msr |= 1 << MSR.SF # 64-bit
+        initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+        # set PRTBL to 0xe000000
+        initial_sprs = {720: 0xe00000c, # PRTBL
+                        48: 1       # PIDR
+                        }
+
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem,
+                             initial_sprs=initial_sprs,
+                             initial_msr=initial_msr)
+
+
+if __name__ == "__main__":
+    svp64 = True
+    if len(sys.argv) == 2:
+        if sys.argv[1] == 'nosvp64':
+            svp64 = False
+        sys.argv.pop()
+
+    print ("SVP64 test mode enabled", svp64)
+
+    unittest.main(exit=False)
+    suite = unittest.TestSuite()
+
+    # MMU/DCache integration tests
+    suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+                              microwatt_mmu=True,
+                              rom=pagetables.microwatt_linux_5_7_boot))
+
+    runner = unittest.TextTestRunner()
+    runner.run(suite)
index 4bf4d37a05792e5f04465a9801d7a2516d270457..e979c25f5f05873fb4cf8b4aeb19469a172f2155 100644 (file)
@@ -18,6 +18,9 @@ import sys
 # step and comparison.
 from soc.simple.test.test_runner import TestRunner
 
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
 # test with MMU
 from openpower.test.mmu.mmu_cases import MMUTestCase
 from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
@@ -25,6 +28,109 @@ from openpower.test.ldst.ldst_cases import LDSTTestCase
 from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
 #from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
 
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+class MMUTestCase(TestAccumulatorBase):
+
+    # now working correctly
+    def case_1_dcbz(self):
+        lst = ["dcbz 1, 2",  # MMUTEST.DCBZ: EA from adder 12
+               "dcbz 1, 3"]  # MMUTEST.DCBZ: EA from adder 11
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x0004
+        initial_regs[2] = 0x0008
+        initial_regs[3] = 0x0007
+        initial_mem = {}
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem)
+
+    # MMUTEST: OP_TLBIE: insn_bits=39
+    def case_2_tlbie(self):
+        lst = ["tlbie 1,1,1,1,1"] # tlbie   RB,RS,RIC,PRS,R
+        initial_regs = [0] * 32
+        initial_mem = {}
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem)
+
+    # OP_MTSPR: spr=720
+    def case_3_mtspr(self):
+        lst = ["mtspr 720,1"] # mtspr PRTBL,r1
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x1234
+        initial_mem = {}
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem)
+
+    # OP_MFSPR: spr=18/19
+    def case_4_mfspr(self):
+        lst = ["mfspr 1,18", # mtspr r1,DSISR
+               "mfspr 2,19"] # mtspr r2,DAR
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x1234
+        initial_regs[2] = 0x3456
+        initial_mem = {}
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem)
+
+    # new testcase for all sprs
+    def case_5_allsprs(self):
+        lst =  ["mtspr 720,1",   #MMUTEST: OP_MTSPR: spr=720
+                "mtspr 48,2",    #MMUTEST: OP_MTSPR: spr=48
+                "mtspr 18,3",    #MMUTEST: OP_MTSPR: spr=18
+                "mtspr 19,4",    #MMUTEST: OP_MTSPR: spr=19
+                "mfspr 5,720",   #MMUTEST: OP_MFSPR: spr=720 returns=4660
+                "mfspr 6,48",    #MMUTEST: OP_MFSPR: spr=48 returns=13398
+                "mfspr 7,18",    #MMUTEST: OP_MFSPR: spr=18 returns=17185
+                "mfspr 8,19"     #MMUTEST: OP_MFSPR: spr=19 returns=25923
+                ]
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x1234
+        initial_regs[2] = 0x3456
+        initial_regs[3] = 0x4321
+        initial_regs[4] = 0x6543
+        initial_mem = {}
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem)
+
+    # MMUTEST: initial_msr= 16384
+    # msr 16384
+    # ISACaller initial_msr 16384
+    # FIXME msr does not get passed to LoadStore1
+    def case_5_ldst_exception(self):
+        lst = ["stb 10,0(2)"]
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x1234
+        initial_regs[2] = 0x3456
+        initial_regs[3] = 0x4321
+        initial_regs[4] = 0x6543
+        initial_regs[10] = 0xfe
+        initial_mem = {}
+        #enable virtmode
+        initial_msr = 1 << MSR.PR # must set "problem" state for virtual memory
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem,initial_msr=initial_msr)
+
+    # deliberately misalign 
+    def case_6_ldst_misalign(self):
+        lst = ["std 10,0(2)"]
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x1234
+        initial_regs[2] = 0x3456
+        initial_regs[3] = 0x4321
+        initial_regs[4] = 0x6543
+        initial_regs[10] = 0x0123456789abcdef
+        initial_mem = {}
+        #enable virtmode
+        initial_msr = 1 << MSR.PR # must set "problem" state for virtual memory
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem,initial_msr=initial_msr)
+
 if __name__ == "__main__":
     svp64 = True
     if len(sys.argv) == 2:
@@ -36,22 +142,10 @@ if __name__ == "__main__":
 
     unittest.main(exit=False)
     suite = unittest.TestSuite()
-    #suite.addTest(TestRunner(GeneralTestCases.test_data, svp64=svp64,
-    #                          microwatt_mmu=True))
-    #suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
-    #                          microwatt_mmu=True))
-
-    # without ROM set
-    #suite.addTest(TestRunner(MMUTestCaseROM().test_data, svp64=svp64,
-    #                          microwatt_mmu=True))
 
-    # LD/ST tests should all still work
-    suite.addTest(TestRunner(LDSTTestCase().test_data, svp64=svp64,
+    # MMU/DCache integration tests
+    suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
                               microwatt_mmu=True))
 
-    # LD/ST exception cases
-    #suite.addTest(TestRunner(LDSTExceptionTestCase().test_data, svp64=svp64,
-    #                          microwatt_mmu=True))
-
     runner = unittest.TextTestRunner()
     runner.run(suite)
diff --git a/src/soc/simple/test/test_issuer_mmu_ifetch.py b/src/soc/simple/test/test_issuer_mmu_ifetch.py
new file mode 100644 (file)
index 0000000..81f1b32
--- /dev/null
@@ -0,0 +1,114 @@
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator.  it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+    def case_virtual_ld_st(self):
+        lst = ["stb 10,0(2)",
+               "addi 10,0, -4",
+               "stb 10,0(5)",
+               "lhz 6,0(2)",
+              ]
+
+        # set up regs
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x1000000 # hm, was going to do mtspr 720,1 with this
+        initial_regs[2] = 0x3456
+        initial_regs[3] = 0x4321
+        initial_regs[4] = 0x6543
+        initial_regs[5] = 0x3457
+        initial_regs[10] = 0xfe
+
+        # no pre-loaded memory here
+        initial_mem = {}
+
+        # set virtual and non-privileged
+        initial_msr = 0 << MSR.PR # must set "problem" state
+        #initial_msr |= 1 << MSR.DR # set "virtual" state for data
+        initial_msr |= 1 << MSR.IR # set "virtual" state for instructions
+        initial_msr |= 1 << MSR.LE # set little-endian
+
+        # set PRTBL to 0x1000000
+        initial_sprs = {720: 0x1000000} # PRTBL
+
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem,
+                             initial_sprs=initial_sprs,
+                             initial_msr=initial_msr)
+
+    def case_virtual_invalid_no_prtbl(self):
+        """virtual memory test but with no PRTBL set it is expected
+        to throw an "invalid" exception
+        """
+        lst = ["stb 10,0(2)",
+              ]
+
+        # set up regs
+        initial_regs = [0] * 32
+
+        # set virtual and non-privileged
+        initial_msr = 1 << MSR.PR # must set "problem" state
+        initial_msr |= 1 << MSR.DR # set "virtual" state for data
+        initial_msr |= 1 << MSR.IR # set "virtual" state for instructions
+
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_msr=initial_msr,
+                             stop_at_pc=0x400) # stop at this exception addr
+
+if __name__ == "__main__":
+    svp64 = True
+    if len(sys.argv) == 2:
+        if sys.argv[1] == 'nosvp64':
+            svp64 = False
+        sys.argv.pop()
+
+    print ("SVP64 test mode enabled", svp64)
+
+    unittest.main(exit=False)
+    suite = unittest.TestSuite()
+
+    # MMU/DCache integration tests
+    suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+                              microwatt_mmu=True,
+                              rom=pagetables.test1))
+
+    runner = unittest.TextTestRunner()
+    runner.run(suite)
diff --git a/src/soc/simple/test/test_issuer_mmu_microwatt.py b/src/soc/simple/test/test_issuer_mmu_microwatt.py
new file mode 100644 (file)
index 0000000..69fe9ff
--- /dev/null
@@ -0,0 +1,93 @@
+"""simple core test, runs instructions from a TestMemory
+
+related bugs:
+
+ * https://bugs.libre-soc.org/show_bug.cgi?id=363
+"""
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+
+import unittest
+import sys
+
+# here is the logic which takes test cases and "executes" them.
+# in this instance (TestRunner) its job is to instantiate both
+# a Libre-SOC nmigen-based HDL instance and an ISACaller python
+# simulator.  it's also responsible for performing the single
+# step and comparison.
+from soc.simple.test.test_runner import TestRunner
+
+#@platen:bookmarks
+#src/openpower/test/runner.py:class TestRunnerBase(FHDLTestCase):
+
+# test with MMU
+from openpower.test.mmu.mmu_cases import MMUTestCase
+from openpower.test.mmu.mmu_rom_cases import MMUTestCaseROM, default_mem
+from openpower.test.ldst.ldst_cases import LDSTTestCase
+from openpower.test.ldst.ldst_exc_cases import LDSTExceptionTestCase
+#from openpower.simulator.test_sim import (GeneralTestCases, AttnTestCase)
+from soc.experiment.test import pagetables
+
+
+from openpower.simulator.program import Program
+from openpower.endian import bigendian
+from openpower.test.common import TestAccumulatorBase
+
+from openpower.consts import MSR
+
+from soc.experiment.test import pagetables
+
+
+class MMUTestCase(TestAccumulatorBase):
+
+    def case_microwatt_test_3_mmu_ld(self):
+        lst = [
+               "ld 6,0(2)",
+              ]
+
+        # set up regs
+        initial_regs = [0] * 32
+        initial_regs[2] = 0x124108
+
+        # memory same as microwatt test
+        initial_mem = pagetables.microwatt_test2
+
+        # set virtual and non-privileged
+        # msr: 8000000000000011
+        initial_msr = 0 << MSR.PR # must set "problem" state
+        initial_msr |= 1 << MSR.LE # little-endian
+        initial_msr |= 1 << MSR.SF # 64-bit
+        initial_msr |= 1 << MSR.DR # set "virtual" state for data
+
+        # set PRTBL to 0x12000
+        initial_sprs = {720: 0x12000, # PRTBL
+                        48: 1       # PIDR
+                        } 
+
+        print("MMUTEST: initial_msr=",initial_msr)
+        self.add_case(Program(lst, bigendian), initial_regs,
+                             initial_mem=initial_mem,
+                             initial_sprs=initial_sprs,
+                             initial_msr=initial_msr)
+
+
+if __name__ == "__main__":
+    svp64 = True
+    if len(sys.argv) == 2:
+        if sys.argv[1] == 'nosvp64':
+            svp64 = False
+        sys.argv.pop()
+
+    print ("SVP64 test mode enabled", svp64)
+
+    unittest.main(exit=False)
+    suite = unittest.TestSuite()
+
+    # MMU/DCache integration tests
+    suite.addTest(TestRunner(MMUTestCase().test_data, svp64=svp64,
+                              microwatt_mmu=True,
+                              rom=pagetables.microwatt_test2))
+
+    runner = unittest.TextTestRunner()
+    runner.run(suite)
index b89867e480c06afcb83acbefd451db32e08653de..29d6acf63a05e1713c5b1c6b086c85b76af46970 100644 (file)
@@ -15,11 +15,12 @@ from soc.config.test.test_loadstore import TestMemPspec
 from soc.simple.test.test_core import (setup_regs, check_regs,
                                        wait_for_busy_clear,
                                        wait_for_busy_hi)
-from soc.fu.compunits.test.test_compunit import (setup_test_memory,
-                                                 check_sim_memory,
+from soc.fu.compunits.test.test_compunit import (check_sim_memory,
                                                  get_l0_mem)
 
-from soc.simple.test.test_issuer import setup_i_memory
+from soc.simple.test.test_runner import setup_i_memory
+
+from pathlib import Path
 
 import sys
 sys.setrecursionlimit(10**6)
@@ -37,6 +38,8 @@ class BinaryTestCase(FHDLTestCase):
         with Program("1.bin", bigendian) as program:
             self.run_tst_program(program)
 
+    @unittest.skipUnless(Path("hello_world.bin").exists(),
+                         "missing hello_world.bin")
     def test_binary(self):
         with Program("hello_world.bin", bigendian) as program:
             self.run_tst_program(program)
@@ -63,7 +66,7 @@ class TestRunner(FHDLTestCase):
 
         pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
                              imem_ifacetype='test_bare_wb',
-                             addr_wid=48,
+                             addr_wid=64,
                              mask_wid=8,
                              reg_wid=64,
                              imem_test_depth=32768,
@@ -112,7 +115,6 @@ class TestRunner(FHDLTestCase):
                 # blech!  put the same listing into the data memory
                 data_mem = get_l0_mem(l0)
                 yield from setup_i_memory(data_mem, pc, instructions)
-                # yield from setup_test_memory(l0, sim)
                 yield from setup_regs(core, test)
 
                 yield pc_i.eq(pc)
index cb4ca4589719d3fb09a214dd2e1cc15711f0c4cd..5d6e57da2b1e82025a1604b52aa918927dac4d8a 100644 (file)
@@ -3,47 +3,84 @@
 related bugs:
 
  * https://bugs.libre-soc.org/show_bug.cgi?id=363
+ * https://bugs.libre-soc.org/show_bug.cgi?id=686#c51
 """
-from nmigen import Module, Signal, Cat, ClockSignal
+from nmigen import Module, Signal
 from nmigen.hdl.xfrm import ResetInserter
+from copy import copy
+from pprint import pprint
 
 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
 # Also, check out the cxxsim nmigen branch, and latest yosys from git
 from nmutil.sim_tmp_alternative import Simulator, Settle
 
-from nmutil.formaltest import FHDLTestCase
-from nmutil.gtkw import write_gtkw
-from nmigen.cli import rtlil
-from openpower.decoder.isa.caller import special_sprs, SVP64State
+from openpower.decoder.isa.caller import SVP64State
 from openpower.decoder.isa.all import ISA
 from openpower.endian import bigendian
 
-from openpower.decoder.power_decoder import create_pdecode
-from openpower.decoder.power_decoder2 import PowerDecode2
-from soc.regfile.regfiles import StateRegs
-
 from soc.simple.issuer import TestIssuerInternal
+from soc.simple.inorder import TestIssuerInternalInOrder
 
-from soc.config.test.test_loadstore import TestMemPspec
-from soc.simple.test.test_core import (setup_regs, check_regs,
+from soc.simple.test.test_core import (setup_regs, check_regs, check_mem,
                                        wait_for_busy_clear,
                                        wait_for_busy_hi)
-from soc.fu.compunits.test.test_compunit import (setup_test_memory,
+from soc.fu.compunits.test.test_compunit import (setup_tst_memory,
                                                  check_sim_memory)
 from soc.debug.dmi import DBGCore, DBGCtrl, DBGStat
 from nmutil.util import wrap
-from soc.experiment.test.test_mmu_dcache import wb_get
+from openpower.test.state import TestState, StateRunner
+from openpower.test.runner import TestRunnerBase
+
+
+def insert_into_rom(startaddr, instructions, rom):
+    print("insn before, init rom", len(instructions))
+    pprint(rom)
+
+    startaddr //= 4  # instructions are 32-bit
+
+    # 64 bit
+    mask = ((1 << 64)-1)
+    for ins in instructions:
+        if isinstance(ins, tuple):
+            insn, code = ins
+        else:
+            insn, code = ins, ''
+        insn = insn & 0xffffffff
+        msbs = (startaddr >> 1) & mask
+        lsb = 1 if (startaddr & 1) else 0
+        print ("insn", hex(insn), hex(msbs), hex(lsb))
+
+        val = rom.get(msbs<<3, 0)
+        if insn != 0:
+            print("before set", hex(4*startaddr),
+                  hex(msbs), hex(val), hex(insn))
+        val = (val | (insn << (lsb*32)))
+        val = val & mask
+        rom[msbs<<3] = val
+        if insn != 0:
+            print("after  set", hex(4*startaddr), hex(msbs), hex(val))
+            print("instr: %06x 0x%x %s %08x" % (4*startaddr, insn, code, val))
+        startaddr += 1
+        startaddr = startaddr & mask
 
+    print ("after insn insert")
+    pprint(rom)
 
-def setup_i_memory(imem, startaddr, instructions):
+
+def setup_i_memory(imem, startaddr, instructions, rom):
     mem = imem
     print("insn before, init mem", mem.depth, mem.width, mem,
           len(instructions))
-    for i in range(mem.depth):
-        yield mem._array[i].eq(0)
-    yield Settle()
+
+    if not rom:
+        # initialise mem array to zero
+        for i in range(mem.depth):
+            yield mem._array[i].eq(0)
+        yield Settle()
+
     startaddr //= 4  # instructions are 32-bit
     if mem.width == 32:
+        assert rom is None, "cannot do 32-bit from wb_get ROM yet"
         mask = ((1 << 32)-1)
         for ins in instructions:
             if isinstance(ins, tuple):
@@ -68,15 +105,22 @@ def setup_i_memory(imem, startaddr, instructions):
             insn, code = ins, ''
         insn = insn & 0xffffffff
         msbs = (startaddr >> 1) & mask
-        val = yield mem._array[msbs]
+        lsb = 1 if (startaddr & 1) else 0
+
+        if rom: # must put the value into the wb_get area
+            val = rom[msbs<<1]
+        else:
+            val = yield mem._array[msbs]
         if insn != 0:
             print("before set", hex(4*startaddr),
                   hex(msbs), hex(val), hex(insn))
-        lsb = 1 if (startaddr & 1) else 0
         val = (val | (insn << (lsb*32)))
         val = val & mask
-        yield mem._array[msbs].eq(val)
-        yield Settle()
+        if rom: # must put the value into the wb_get area
+            rom[msbs<<1] = val
+        else:
+            yield mem._array[msbs].eq(val)
+            yield Settle()
         if insn != 0:
             print("after  set", hex(4*startaddr), hex(msbs), hex(val))
             print("instr: %06x 0x%x %s %08x" % (4*startaddr, insn, code, val))
@@ -121,330 +165,238 @@ def get_dmi(dmi, addr):
     return data
 
 
-class TestRunner(FHDLTestCase):
-    def __init__(self, tst_data, microwatt_mmu=False, rom=None,
-                        svp64=True):
-        super().__init__("run_all")
-        self.test_data = tst_data
-        self.microwatt_mmu = microwatt_mmu
-        self.rom = rom
-        self.svp64 = svp64
-
-    def run_all(self):
-        m = Module()
-        comb = m.d.comb
-        pc_i = Signal(32)
-        svstate_i = Signal(32)
+class HDLRunner(StateRunner):
+    """HDLRunner:  Implements methods for the setup, preparation, and
+    running of tests using nmigen HDL simulation.
+    """
+
+    def __init__(self, dut, m, pspec):
+        super().__init__("hdl", HDLRunner)
+
+        self.dut = dut
+        self.pspec = pspec
+        self.pc_i = Signal(32)
+        self.svstate_i = Signal(64)
 
-        if self.microwatt_mmu:
-            ldst_ifacetype = 'test_mmu_cache_wb'
-        else:
-            ldst_ifacetype = 'test_bare_wb'
-        imem_ifacetype = 'test_bare_wb'
-
-        pspec = TestMemPspec(ldst_ifacetype=ldst_ifacetype,
-                             imem_ifacetype=imem_ifacetype,
-                             addr_wid=48,
-                             mask_wid=8,
-                             imem_reg_wid=64,
-                             # wb_data_width=32,
-                             use_pll=False,
-                             nocore=False,
-                             xics=False,
-                             gpio=False,
-                             regreduce=True,
-                             svp64=self.svp64,
-                             mmu=self.microwatt_mmu,
-                             reg_wid=64)
         #hard_reset = Signal(reset_less=True)
-        issuer = TestIssuerInternal(pspec)
+        if pspec.inorder:
+            self.issuer = TestIssuerInternalInOrder(pspec)
+        else:
+            self.issuer = TestIssuerInternal(pspec)
         # use DMI RESET command instead, this does actually work though
-        #issuer = ResetInserter({'coresync': hard_reset,
+        # issuer = ResetInserter({'coresync': hard_reset,
         #                        'sync': hard_reset})(issuer)
-        m.submodules.issuer = issuer
-        imem = issuer.imem._get_memory()
-        core = issuer.core
-        dmi = issuer.dbg.dmi
-        pdecode2 = issuer.pdecode2
-        l0 = core.l0
-        regreduce_en = pspec.regreduce_en == True
+        m.submodules.issuer = self.issuer
+        self.dmi = self.issuer.dbg.dmi
 
-        # copy of the decoder for simulator
-        simdec = create_pdecode()
-        simdec2 = PowerDecode2(simdec, regreduce_en=regreduce_en)
-        m.submodules.simdec2 = simdec2  # pain in the neck
+        comb = m.d.comb
+        comb += self.issuer.pc_i.data.eq(self.pc_i)
+        comb += self.issuer.svstate_i.data.eq(self.svstate_i)
 
-        # run core clock at same rate as test clock
-        intclk = ClockSignal("coresync")
-        comb += intclk.eq(ClockSignal())
+    def prepare_for_test(self, test):
+        self.test = test
+        #print ("preparing for test name", test.name)
 
-        comb += issuer.pc_i.data.eq(pc_i)
-        comb += issuer.svstate_i.data.eq(svstate_i)
+        # set up bigendian (TODO: don't do this, use MSR)
+        yield self.issuer.core_bigendian_i.eq(bigendian)
+        yield Settle()
 
-        # nmigen Simulation
-        sim = Simulator(m)
-        sim.add_clock(1e-6)
+        yield
+        yield
+        yield
+        yield
+        #print ("end of test preparation", test.name)
+
+    def setup_during_test(self):
+        # first run a manual hard-reset of the debug interface.
+        # core is counting down on a 3-clock delay at this point
+        yield self.issuer.dbg_rst_i.eq(1)
+        yield
+        yield self.issuer.dbg_rst_i.eq(0)
 
-        def process():
+        # now run a DMI-interface reset.  because DMI is running
+        # in dbgsync domain its reset is *NOT* connected to
+        # core reset (hence the dbg_rst_i blip, above)
+        yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
+        yield
+        #print("test setup")
+
+    def run_test(self, instructions):
+        """run_hdl_state - runs a TestIssuer nmigen HDL simulation
+        """
+
+        #print("starting test")
+
+        if self.dut.rom is None:
+            imem = self.issuer.imem._get_memory()
+            #print("got memory", imem)
+        else:
+            print("skipping memory get due to rom")
+            pprint(self.dut.rom)
+        core = self.issuer.core
+        dmi = self.issuer.dbg.dmi
+        pdecode2 = self.issuer.pdecode2
+        l0 = core.l0
+        hdl_states = []
+
+        # establish the TestIssuer context (mem, regs etc)
+
+        pc = 0  # start address
+        counter = 0  # test to pause/start
+
+        # XXX for now, when ROM (run under wb_get) is detected,
+        # skip setup of memories.  must be done a different way
+        if self.dut.rom is None:
+            yield from setup_i_memory(imem, pc, instructions, self.dut.rom)
+            yield from setup_tst_memory(l0, self.test.mem)
+        else:
+            insert_into_rom(pc, instructions, self.dut.default_mem)
+        print("about to setup regs")
+        yield from setup_regs(pdecode2, core, self.test)
+        #print("setup mem and regs done")
+
+        # set PC and SVSTATE
+        yield self.pc_i.eq(pc)
+        yield self.issuer.pc_i.ok.eq(1)
+
+        # copy initial SVSTATE
+        initial_svstate = copy(self.test.svstate)
+        if isinstance(initial_svstate, int):
+            initial_svstate = SVP64State(initial_svstate)
+        yield self.svstate_i.eq(initial_svstate.value)
+        yield self.issuer.svstate_i.ok.eq(1)
+        yield
 
-            # start in stopped
-            yield from set_dmi(dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
-            yield
+        print("instructions", instructions)
 
-            # get each test, completely reset the core, and run it
+        # before starting the simulation, set the core stop address to be
+        # just after the last instruction. if a load of an instruction is
+        # requested at this address, the core is immediately put into "halt"
+        # XXX: keep an eye out for in-order problems
+        hard_stop_addr = self.test.stop_at_pc
+        if hard_stop_addr is None:
+            hard_stop_addr = len(instructions)*4
+        yield from set_dmi(dmi, DBGCore.STOPADDR, hard_stop_addr)
 
-            for test in self.test_data:
+        # run the loop of the instructions on the current test
+        index = (yield self.issuer.cur_state.pc) // 4
+        while index < len(instructions):
+            ins, code = instructions[index]
 
-                # set up bigendian (TODO: don't do this, use MSR)
-                yield issuer.core_bigendian_i.eq(bigendian)
-                yield Settle()
+            print("hdl instr: 0x{:X}".format(ins & 0xffffffff))
+            print(index, code)
 
+            if counter == 0:
+                # start the core
                 yield
+                yield from set_dmi(dmi, DBGCore.CTRL,
+                                   1 << DBGCtrl.START)
+                yield self.issuer.pc_i.ok.eq(0)  # no change PC after this
+                yield self.issuer.svstate_i.ok.eq(0)  # ditto
                 yield
                 yield
+
+            counter = counter + 1
+
+            # wait until executed
+            while not ((yield self.issuer.insn_done) or
+                       (yield self.issuer.dbg.terminated_o)):
                 yield
 
-                print(test.name)
-                program = test.program
-                with self.subTest(test.name):
-                    print("regs", test.regs)
-                    print("sprs", test.sprs)
-                    print("cr", test.cr)
-                    print("mem", test.mem)
-                    print("msr", test.msr)
-                    print("assem", program.assembly)
-                    gen = list(program.generate_instructions())
-                    insncode = program.assembly.splitlines()
-                    instructions = list(zip(gen, insncode))
-
-                    # set up the Simulator (which must track TestIssuer exactly)
-                    sim = ISA(simdec2, test.regs, test.sprs, test.cr, test.mem,
-                              test.msr,
-                              initial_insns=gen, respect_pc=True,
-                              disassembly=insncode,
-                              bigendian=bigendian,
-                              initial_svstate=test.svstate)
-
-                    # establish the TestIssuer context (mem, regs etc)
-
-                    pc = 0  # start address
-                    counter = 0  # test to pause/start
-
-                    yield from setup_i_memory(imem, pc, instructions)
-                    yield from setup_test_memory(l0, sim)
-                    yield from setup_regs(pdecode2, core, test)
-
-                    # set PC and SVSTATE
-                    yield pc_i.eq(pc)
-                    yield issuer.pc_i.ok.eq(1)
-
-                    initial_svstate = test.svstate
-                    if isinstance(initial_svstate, int):
-                        initial_svstate = SVP64State(initial_svstate)
-                    yield svstate_i.eq(initial_svstate.spr.value)
-                    yield issuer.svstate_i.ok.eq(1)
-                    yield
+            # okaaay long story: in overlap mode, PC is updated one cycle
+            # late.
+            if self.dut.allow_overlap:
+                yield
+            yield Settle()
 
-                    print("instructions", instructions)
-
-                    # run the loop of the instructions on the current test
-                    index = sim.pc.CIA.value//4
-                    while index < len(instructions):
-                        ins, code = instructions[index]
-
-                        print("instruction: 0x{:X}".format(ins & 0xffffffff))
-                        print(index, code)
-
-                        if counter == 0:
-                            # start the core
-                            yield
-                            yield from set_dmi(dmi, DBGCore.CTRL,
-                                               1<<DBGCtrl.START)
-                            yield issuer.pc_i.ok.eq(0) # no change PC after this
-                            yield issuer.svstate_i.ok.eq(0) # ditto
-                            yield
-                            yield
-
-                        counter = counter + 1
-
-                        # wait until executed
-                        while not (yield issuer.insn_done):
-                            yield
-
-                        # set up simulated instruction (in simdec2)
-                        try:
-                            yield from sim.setup_one()
-                        except KeyError:  # instruction not in imem: stop
-                            break
-                        yield Settle()
-
-                        # call simulated operation
-                        print("sim", code)
-                        yield from sim.execute_one()
-                        yield Settle()
-                        index = sim.pc.CIA.value//4
-
-                        terminated = yield issuer.dbg.terminated_o
-                        print("terminated", terminated)
-
-                        if index >= len(instructions):
-                            print ("index over, send dmi stop")
-                            # stop at end
-                            yield from set_dmi(dmi, DBGCore.CTRL,
-                                               1<<DBGCtrl.STOP)
-                            yield
-                            yield
-
-                        # register check
-                        yield from check_regs(self, sim, core, test, code)
-
-                        # Memory check
-                        yield from check_sim_memory(self, l0, sim, code)
-
-                        terminated = yield issuer.dbg.terminated_o
-                        print("terminated(2)", terminated)
-                        if terminated:
-                            break
+            index = (yield self.issuer.cur_state.pc) // 4
 
+            terminated = yield self.issuer.dbg.terminated_o
+            print("terminated", terminated, index, len(instructions))
+
+            if index < len(instructions):
+                # Get HDL mem and state
+                state = yield from TestState("hdl", core, self.dut,
+                                             code)
+                hdl_states.append(state)
+
+            if index >= len(instructions):
+                print("index over, send dmi stop")
                 # stop at end
-                yield from set_dmi(dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
+                yield from set_dmi(dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
                 yield
                 yield
+                # hmm really should use DMI status check here but hey it's quick
+                while True:
+                    stopped = yield self.issuer.dbg.core_stop_o
+                    if stopped:
+                        break
+                    yield
+                break
+
+            terminated = yield self.issuer.dbg.terminated_o
+            print("terminated(2)", terminated)
+            if terminated:
+                break
+
+        if self.dut.allow_overlap: # or not self.dut.rom: ??
+            # wait until all settled
+            # XXX really this should be in DMI, which should in turn
+            # use issuer.any_busy to not send back "stopped" signal
+            while (yield self.issuer.any_busy):
+                yield
 
-                # get CR
-                cr = yield from get_dmi(dmi, DBGCore.CR)
-                print("after test %s cr value %x" % (test.name, cr))
+        if self.dut.allow_overlap:
+            # get last state, at end of run
+            state = yield from TestState("hdl", core, self.dut,
+                                         code)
+            hdl_states.append(state)
 
-                # get XER
-                xer = yield from get_dmi(dmi, DBGCore.XER)
-                print("after test %s XER value %x" % (test.name, xer))
+        return hdl_states
 
-                # test of dmi reg get
-                for int_reg in range(32):
-                    yield from set_dmi(dmi, DBGCore.GSPR_IDX, int_reg)
-                    value = yield from get_dmi(dmi, DBGCore.GSPR_DATA)
+    def end_test(self):
+        yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.STOP)
+        yield
+        yield
 
-                    print("after test %s reg %2d value %x" %
-                          (test.name, int_reg, value))
+        # TODO, here is where the static (expected) results
+        # can be checked: register check (TODO, memory check)
+        # see https://bugs.libre-soc.org/show_bug.cgi?id=686#c51
+        # yield from check_regs(self, sim, core, test, code,
+        #                       >>>expected_data<<<)
 
-                # pull a reset
-                yield from set_dmi(dmi, DBGCore.CTRL, 1<<DBGCtrl.RESET)
-                yield
+        # get CR
+        cr = yield from get_dmi(self.dmi, DBGCore.CR)
+        print("after test %s cr value %x" % (self.test.name, cr))
+
+        # get XER
+        xer = yield from get_dmi(self.dmi, DBGCore.XER)
+        print("after test %s XER value %x" % (self.test.name, xer))
+
+        # get MSR
+        msr = yield from get_dmi(self.dmi, DBGCore.MSR)
+        print("after test %s MSR value %x" % (self.test.name, msr))
+
+        # test of dmi reg get
+        for int_reg in range(32):
+            yield from set_dmi(self.dmi, DBGCore.GSPR_IDX, int_reg)
+            value = yield from get_dmi(self.dmi, DBGCore.GSPR_DATA)
 
-        styles = {
-            'dec': {'base': 'dec'},
-            'bin': {'base': 'bin'},
-            'closed': {'closed': True}
-        }
-
-        traces = [
-            'clk',
-            ('state machines', 'closed', [
-                'fetch_pc_valid_i', 'fetch_pc_ready_o',
-                'fetch_fsm_state',
-                'fetch_insn_valid_o', 'fetch_insn_ready_i',
-                'pred_insn_valid_i', 'pred_insn_ready_o',
-                'fetch_predicate_state',
-                'pred_mask_valid_o', 'pred_mask_ready_i',
-                'issue_fsm_state',
-                'exec_insn_valid_i', 'exec_insn_ready_o',
-                'exec_fsm_state',
-                'exec_pc_valid_o', 'exec_pc_ready_i',
-                'insn_done', 'core_stop_o', 'pc_i_ok', 'pc_changed',
-                'is_last', 'dec2.no_out_vec']),
-            {'comment': 'fetch and decode'},
-            (None, 'dec', [
-                'cia[63:0]', 'nia[63:0]', 'pc[63:0]',
-                'cur_pc[63:0]', 'core_core_cia[63:0]']),
-            'raw_insn_i[31:0]',
-            'raw_opcode_in[31:0]', 'insn_type',
-            ('svp64 decoding', 'closed', [
-                'svp64_rm[23:0]', ('dec2.extra[8:0]', 'bin'),
-                'dec2.sv_rm_dec.mode', 'dec2.sv_rm_dec.predmode',
-                'dec2.sv_rm_dec.ptype_in',
-                'dec2.sv_rm_dec.dstpred[2:0]', 'dec2.sv_rm_dec.srcpred[2:0]',
-                'dstmask[63:0]', 'srcmask[63:0]',
-                'dregread[4:0]', 'dinvert',
-                'sregread[4:0]', 'sinvert',
-                'core.int.pred__addr[4:0]', 'core.int.pred__data_o[63:0]',
-                'core.int.pred__ren']),
-            ('register augmentation', 'dec', 'closed', [
-                {'comment': 'v3.0b registers'},
-                'dec2.dec_o.RT[4:0]',
-                'dec2.dec_a.RA[4:0]',
-                'dec2.dec_b.RB[4:0]',
-                ('Rdest', [
-                    'dec2.o_svdec.reg_in[4:0]',
-                    ('dec2.o_svdec.spec[2:0]', 'bin'),
-                    'dec2.o_svdec.reg_out[6:0]']),
-                ('Rsrc1', [
-                    'dec2.in1_svdec.reg_in[4:0]',
-                    ('dec2.in1_svdec.spec[2:0]', 'bin'),
-                    'dec2.in1_svdec.reg_out[6:0]']),
-                ('Rsrc1', [
-                    'dec2.in2_svdec.reg_in[4:0]',
-                    ('dec2.in2_svdec.spec[2:0]', 'bin'),
-                    'dec2.in2_svdec.reg_out[6:0]']),
-                {'comment': 'SVP64 registers'},
-                'dec2.rego[6:0]', 'dec2.reg1[6:0]', 'dec2.reg2[6:0]'
-            ]),
-            {'comment': 'svp64 context'},
-            'core_core_vl[6:0]', 'core_core_maxvl[6:0]',
-            'core_core_srcstep[6:0]', 'next_srcstep[6:0]',
-            'core_core_dststep[6:0]',
-            {'comment': 'issue and execute'},
-            'core.core_core_insn_type',
-            (None, 'dec', [
-                'core_rego[6:0]', 'core_reg1[6:0]', 'core_reg2[6:0]']),
-            'issue_i', 'busy_o',
-            {'comment': 'dmi'},
-            'dbg.dmi_req_i', 'dbg.dmi_ack_o',
-            {'comment': 'instruction memory'},
-            'imem.sram.rdport.memory(0)[63:0]',
-            {'comment': 'registers'},
-            # match with soc.regfile.regfiles.IntRegs port names
-            'core.int.rp_src1.memory(0)[63:0]',
-            'core.int.rp_src1.memory(1)[63:0]',
-            'core.int.rp_src1.memory(2)[63:0]',
-            'core.int.rp_src1.memory(3)[63:0]',
-            'core.int.rp_src1.memory(4)[63:0]',
-            'core.int.rp_src1.memory(5)[63:0]',
-            'core.int.rp_src1.memory(6)[63:0]',
-            'core.int.rp_src1.memory(7)[63:0]',
-            'core.int.rp_src1.memory(9)[63:0]',
-            'core.int.rp_src1.memory(10)[63:0]',
-            'core.int.rp_src1.memory(13)[63:0]',
-        ]
-
-        if self.microwatt_mmu:
-            traces += [
-                {'comment': 'microwatt_mmu'},
-                'core.fus.mmu0.alu_mmu0.illegal',
-                'core.fus.mmu0.alu_mmu0.debug0[3:0]',
-                'core.fus.mmu0.alu_mmu0.mmu.state',
-                'core.fus.mmu0.alu_mmu0.mmu.pid[31:0]',
-                'core.fus.mmu0.alu_mmu0.mmu.prtbl[63:0]',
-                {'comment': 'wishbone_memory'},
-                'core.fus.mmu0.alu_mmu0.dcache.stb',
-                'core.fus.mmu0.alu_mmu0.dcache.cyc',
-                'core.fus.mmu0.alu_mmu0.dcache.we',
-                'core.fus.mmu0.alu_mmu0.dcache.ack',
-                'core.fus.mmu0.alu_mmu0.dcache.stall,'
-            ]
-
-        write_gtkw("issuer_simulator.gtkw",
-                   "issuer_simulator.vcd",
-                   traces, styles, module='top.issuer')
-
-        # add run of instructions
-        sim.add_sync_process(process)
-
-        # optionally, if a wishbone-based ROM is passed in, run that as an
-        # extra emulated process
-        if self.rom is not None:
-            dcache = core.fus.fus["mmu0"].alu.dcache
-            default_mem = self.rom
-            sim.add_sync_process(wrap(wb_get(dcache, default_mem, "DCACHE")))
-
-        with sim.write_vcd("issuer_simulator.vcd"):
-            sim.run()
+            print("after test %s reg %2d value %x" %
+                  (self.test.name, int_reg, value))
+
+        # pull a reset
+        yield from set_dmi(self.dmi, DBGCore.CTRL, 1 << DBGCtrl.RESET)
+        yield
+
+
+class TestRunner(TestRunnerBase):
+    def __init__(self, tst_data, microwatt_mmu=False, rom=None,
+                 svp64=True, inorder=False, run_hdl=True, run_sim=True,
+                 allow_overlap=False):
+        if run_hdl:
+            run_hdl = HDLRunner
+        super().__init__(tst_data, microwatt_mmu=microwatt_mmu,
+                         rom=rom, inorder=inorder,
+                         svp64=svp64, run_hdl=run_hdl, run_sim=run_sim,
+                         allow_overlap=allow_overlap)
diff --git a/src/soc/simple/test/teststate.py b/src/soc/simple/test/teststate.py
new file mode 100644 (file)
index 0000000..cc62c5d
--- /dev/null
@@ -0,0 +1,76 @@
+""" Power ISA test API
+
+This module implements the creation, inspection and comparison
+of test states for TestIssuer HDL
+
+"""
+
+from openpower.decoder.power_enums import XER_bits
+from openpower.util import log
+from openpower.test.state import (State, state_add, state_factory,
+                                  TestState,)
+from soc.fu.compunits.test.test_compunit import get_l0_mem
+
+class HDLState(State):
+    """HDLState: Obtains registers and memory from an nmigen simulator
+    object by implementing State class methods.
+    """
+    def __init__(self, core):
+        super().__init__()
+        self.core = core
+
+    def get_fpregs(self):
+        if False:
+            yield
+        self.fpregs = []
+        for i in range(32):
+            self.fpregs.append(0)
+
+    def get_intregs(self):
+        self.intregs = []
+        for i in range(32):
+            if self.core.regs.int.unary:
+                rval = yield self.core.regs.int.regs[i].reg
+            else:
+                rval = yield self.core.regs.int.memory._array[i]
+            self.intregs.append(rval)
+        log("class hdl int regs", list(map(hex, self.intregs)))
+
+    def get_crregs(self):
+        self.crregs = []
+        for i in range(8):
+            rval = yield self.core.regs.cr.regs[7-i].reg
+            self.crregs.append(rval)
+        log("class hdl cr regs", list(map(hex, self.crregs)))
+
+    def get_xregs(self):
+        self.xregs = []
+        self.xr = self.core.regs.xer
+        self.so = yield self.xr.regs[self.xr.SO].reg
+        self.ov = yield self.xr.regs[self.xr.OV].reg
+        self.ca = yield self.xr.regs[self.xr.CA].reg
+        self.xregs.extend((self.so, self.ov, self.ca))
+        log("class hdl xregs", list(map(hex, self.xregs)))
+
+    def get_pc(self):
+        self.pcl = []
+        self.state = self.core.regs.state
+        # relies on the state.r_port being permanently held as PC
+        self.pc = yield self.state.r_ports['cia'].o_data
+        self.pcl.append(self.pc)
+        log("class hdl pc", hex(self.pc))
+
+    def get_mem(self):
+        self.mem = {}
+        # get the underlying HDL-simulated memory from the L0CacheBuffer
+        if hasattr(self.core, "icache"):
+            # err temporarily ignore memory
+            return # XXX have to work out how to deal with wb_get
+        hdlmem = get_l0_mem(self.core.l0)
+        for i in range(hdlmem.depth):
+            value = yield hdlmem._array[i] # should not really do this
+            self.mem[i*8] = value
+
+
+# add to State Factory
+state_add('hdl', HDLState)
diff --git a/src/unused/TLB/.gitignore b/src/unused/TLB/.gitignore
deleted file mode 100644 (file)
index 3324664..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-*.wpr
-__pycache__
diff --git a/src/unused/TLB/AddressEncoder.py b/src/unused/TLB/AddressEncoder.py
deleted file mode 100644 (file)
index 49da819..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-from nmigen import Module, Signal, Elaboratable
-from nmigen.lib.coding import Encoder, PriorityEncoder
-
-
-class AddressEncoder(Elaboratable):
-    """Address Encoder
-
-       The purpose of this module is to take in a vector and
-       encode the bits that are one hot into an address. This module
-       combines both nmigen's Encoder and PriorityEncoder and will state
-       whether the input line has a single bit hot, multiple bits hot,
-       or no bits hot. The output line will always have the lowest value
-       address output.
-
-       Usage:
-       The output is valid when either single or multiple match is high.
-       Otherwise output is 0.
-    """
-
-    def __init__(self, width):
-        """ Arguments:
-            * width: The desired length of the input vector
-        """
-        # Internal
-        self.encoder = Encoder(width)
-        self.p_encoder = PriorityEncoder(width)
-
-        # Input
-        self.i = Signal(width)
-
-        # Output
-        self.single_match = Signal(1)
-        self.multiple_match = Signal(1)
-        self.o = Signal(range(width))
-
-    def elaborate(self, platform=None):
-        m = Module()
-
-        # Add internal submodules
-        m.submodules.encoder = self.encoder
-        m.submodules.p_encoder = self.p_encoder
-
-        m.d.comb += [
-            self.encoder.i.eq(self.i),
-            self.p_encoder.i.eq(self.i)
-        ]
-
-        # Steps:
-        # 1. check if the input vector is non-zero
-        # 2. if non-zero, check if single match or multiple match
-        # 3. set output line to be lowest value address output
-
-        # If the priority encoder recieves an input of 0
-        # If n is 1 then the output is not valid
-        with m.If(self.p_encoder.n):
-            m.d.comb += [
-                self.single_match.eq(0),
-                self.multiple_match.eq(0),
-                self.o.eq(0)
-            ]
-        # If the priority encoder recieves an input > 0
-        with m.Else():
-            # Multiple Match if encoder n is invalid
-            with m.If(self.encoder.n):
-                m.d.comb += [
-                    self.single_match.eq(0),
-                    self.multiple_match.eq(1)
-                ]
-            # Single Match if encoder n is valid
-            with m.Else():
-                m.d.comb += [
-                    self.single_match.eq(1),
-                    self.multiple_match.eq(0)
-                ]
-            # Always set output based on priority encoder output
-            m.d.comb += self.o.eq(self.p_encoder.o)
-        return m
diff --git a/src/unused/TLB/Cam.py b/src/unused/TLB/Cam.py
deleted file mode 100644 (file)
index c5fd069..0000000
+++ /dev/null
@@ -1,126 +0,0 @@
-from nmigen import Array, Cat, Module, Signal, Elaboratable
-from nmigen.lib.coding import Decoder
-from nmigen.cli import main  # , verilog
-
-from .CamEntry import CamEntry
-from .AddressEncoder import AddressEncoder
-
-
-class Cam(Elaboratable):
-    """ Content Addressable Memory (CAM)
-
-        The purpose of this module is to quickly look up whether an
-        entry exists given a data key.
-        This module will search for the given data in all internal entries
-        and output whether a  single or multiple match was found.
-        If an single entry is found the address be returned and single_match
-        is set HIGH. If multiple entries are found the lowest address is
-        returned and multiple_match is set HIGH. If neither single_match or
-        multiple_match are HIGH this implies no match was found. To write
-        to the CAM set the address bus to the desired entry and set write_enable
-        HIGH. Entry managment should be performed one level above this block
-        as lookup is performed within.
-
-        Notes:
-        The read and write operations take one clock cycle to complete.
-        Currently the read_warning line is present for interfacing but
-        is not necessary for this design. This module is capable of writing
-        in the first cycle, reading on the second, and output the correct
-        address on the third.
-    """
-
-    def __init__(self, data_size, cam_size):
-        """ Arguments:
-            * data_size: (bits) The bit size of the data
-            * cam_size: (number) The number of entries in the CAM
-        """
-
-        # Internal
-        self.cam_size = cam_size
-        self.encoder = AddressEncoder(cam_size)
-        self.decoder = Decoder(cam_size)
-        self.entry_array = Array(CamEntry(data_size) for x in range(cam_size))
-
-        # Input
-        self.enable = Signal(1)
-        self.write_enable = Signal(1)
-        self.data_in = Signal(data_size)  # The data to be written
-        self.data_mask = Signal(data_size)  # mask for ternary writes
-        # address of CAM Entry to write
-        self.address_in = Signal(range(cam_size))
-
-        # Output
-        self.read_warning = Signal(1)  # High when a read interrupts a write
-        self.single_match = Signal(1)  # High when there is only one match
-        self.multiple_match = Signal(1)  # High when there at least two matches
-        # The lowest address matched
-        self.match_address = Signal(range(cam_size))
-
-    def elaborate(self, platform=None):
-        m = Module()
-        # AddressEncoder for match types and output address
-        m.submodules.AddressEncoder = self.encoder
-        # Decoder is used to select which entry will be written to
-        m.submodules.Decoder = self.decoder
-        # CamEntry Array Submodules
-        # Note these area added anonymously
-        entry_array = self.entry_array
-        m.submodules += entry_array
-
-        # Decoder logic
-        m.d.comb += [
-            self.decoder.i.eq(self.address_in),
-            self.decoder.n.eq(0)
-        ]
-
-        encoder_vector = []
-        with m.If(self.enable):
-            # Set the key value for every CamEntry
-            for index in range(self.cam_size):
-
-                # Write Operation
-                with m.If(self.write_enable):
-                    with m.If(self.decoder.o[index]):
-                        m.d.comb += entry_array[index].command.eq(2)
-                    with m.Else():
-                        m.d.comb += entry_array[index].command.eq(0)
-
-                # Read Operation
-                with m.Else():
-                    m.d.comb += entry_array[index].command.eq(1)
-
-                # Send data input to all entries
-                m.d.comb += entry_array[index].data_in.eq(self.data_in)
-                # Send all entry matches to encoder
-                ematch = entry_array[index].match
-                encoder_vector.append(ematch)
-
-            # Give input to and accept output from encoder module
-            m.d.comb += [
-                self.encoder.i.eq(Cat(*encoder_vector)),
-                self.single_match.eq(self.encoder.single_match),
-                self.multiple_match.eq(self.encoder.multiple_match),
-                self.match_address.eq(self.encoder.o)
-            ]
-
-        # If the CAM is not enabled set all outputs to 0
-        with m.Else():
-            m.d.comb += [
-                self.read_warning.eq(0),
-                self.single_match.eq(0),
-                self.multiple_match.eq(0),
-                self.match_address.eq(0)
-            ]
-
-        return m
-
-    def ports(self):
-        return [self.enable, self.write_enable,
-                self.data_in, self.data_mask,
-                self.read_warning, self.single_match,
-                self.multiple_match, self.match_address]
-
-
-if __name__ == '__main__':
-    cam = Cam(4, 4)
-    main(cam, ports=cam.ports())
diff --git a/src/unused/TLB/CamEntry.py b/src/unused/TLB/CamEntry.py
deleted file mode 100644 (file)
index b1d9308..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-from nmigen import Module, Signal, Elaboratable
-
-
-class CamEntry(Elaboratable):
-    """ Content Addressable Memory (CAM) Entry
-
-        The purpose of this module is to represent an entry within a CAM.
-        This module when given a read command will compare  the given data
-        and output whether a match was found or not. When given a write
-        command it will write the given data into internal registers.
-    """
-
-    def __init__(self, data_size):
-        """ Arguments:
-            * data_size: (bit count) The size of the data
-        """
-        # Input
-        self.command = Signal(2) # 00 => NA 01 => Read 10 => Write 11 => Reset
-        self.data_in = Signal(data_size) # Data input when writing
-
-        # Output
-        self.match = Signal(1) # Result of the internal/input key comparison
-        self.data = Signal(data_size)
-
-    def elaborate(self, platform=None):
-        m = Module()
-        with m.Switch(self.command):
-            with m.Case("00"):
-                m.d.sync += self.match.eq(0)
-            with m.Case("01"):
-                with m.If(self.data == self.data_in):
-                    m.d.sync += self.match.eq(1)
-                with m.Else():
-                    m.d.sync += self.match.eq(0)
-            with m.Case("10"):
-                m.d.sync += [
-                    self.data.eq(self.data_in),
-                    self.match.eq(0)
-                ]
-            with m.Case():
-                m.d.sync += [
-                    self.match.eq(0),
-                    self.data.eq(0)
-                ]
-
-        return m
diff --git a/src/unused/TLB/LFSR.py b/src/unused/TLB/LFSR.py
deleted file mode 100644 (file)
index d8b606e..0000000
+++ /dev/null
@@ -1,109 +0,0 @@
-# SPDX-License-Identifier: LGPL-2.1-or-later
-# See Notices.txt for copyright information
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-from nmigen.cli import verilog, rtlil
-
-
-class LFSRPolynomial(set):
-    """ implements a polynomial for use in LFSR
-    """
-    def __init__(self, exponents=()):
-        for e in exponents:
-            assert isinstance(e, int), TypeError("%s must be an int" % repr(e))
-            assert (e >= 0), ValueError("%d must not be negative" % e)
-        set.__init__(self, set(exponents).union({0})) # must contain zero
-
-    @property
-    def max_exponent(self):
-        return max(self) # derived from set, so this returns the max exponent
-
-    @property
-    def exponents(self):
-        exponents = list(self) # get elements of set as a list
-        exponents.sort(reverse=True)
-        return exponents
-
-    def __str__(self):
-        expd = {0: "1", 1: 'x', 2: "x^{}"} # case 2 isn't 2, it's min(i,2)
-        retval = map(lambda i: expd[min(i,2)].format(i), self.exponents)
-        return " + ".join(retval)
-
-    def __repr__(self):
-        return "LFSRPolynomial(%s)" % self.exponents
-
-
-# list of selected polynomials from https://web.archive.org/web/20190418121923/https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Some_polynomials_for_maximal_LFSRs  # noqa
-LFSR_POLY_2 = LFSRPolynomial([2, 1, 0])
-LFSR_POLY_3 = LFSRPolynomial([3, 2, 0])
-LFSR_POLY_4 = LFSRPolynomial([4, 3, 0])
-LFSR_POLY_5 = LFSRPolynomial([5, 3, 0])
-LFSR_POLY_6 = LFSRPolynomial([6, 5, 0])
-LFSR_POLY_7 = LFSRPolynomial([7, 6, 0])
-LFSR_POLY_8 = LFSRPolynomial([8, 6, 5, 4, 0])
-LFSR_POLY_9 = LFSRPolynomial([9, 5, 0])
-LFSR_POLY_10 = LFSRPolynomial([10, 7, 0])
-LFSR_POLY_11 = LFSRPolynomial([11, 9, 0])
-LFSR_POLY_12 = LFSRPolynomial([12, 11, 10, 4, 0])
-LFSR_POLY_13 = LFSRPolynomial([13, 12, 11, 8, 0])
-LFSR_POLY_14 = LFSRPolynomial([14, 13, 12, 2, 0])
-LFSR_POLY_15 = LFSRPolynomial([15, 14, 0])
-LFSR_POLY_16 = LFSRPolynomial([16, 15, 13, 4, 0])
-LFSR_POLY_17 = LFSRPolynomial([17, 14, 0])
-LFSR_POLY_18 = LFSRPolynomial([18, 11, 0])
-LFSR_POLY_19 = LFSRPolynomial([19, 18, 17, 14, 0])
-LFSR_POLY_20 = LFSRPolynomial([20, 17, 0])
-LFSR_POLY_21 = LFSRPolynomial([21, 19, 0])
-LFSR_POLY_22 = LFSRPolynomial([22, 21, 0])
-LFSR_POLY_23 = LFSRPolynomial([23, 18, 0])
-LFSR_POLY_24 = LFSRPolynomial([24, 23, 22, 17, 0])
-
-
-class LFSR(LFSRPolynomial, Elaboratable):
-    """ implements a Linear Feedback Shift Register
-    """
-    def __init__(self, polynomial):
-        """ Inputs:
-            ------
-            :polynomial: the polynomial to feedback on.  may be a LFSRPolynomial
-                         instance or an iterable of ints (list/tuple/generator)
-            :enable:     enable (set LO to disable.  NOTE: defaults to HI)
-
-            Outputs:
-            -------
-            :state: the LFSR state.  bitwidth is taken from the polynomial
-                    maximum exponent.
-
-            Note: if an LFSRPolynomial is passed in as the input, because
-            LFSRPolynomial is derived from set() it's ok:
-            LFSRPolynomial(LFSRPolynomial(p)) == LFSRPolynomial(p)
-        """
-        LFSRPolynomial.__init__(self, polynomial)
-        self.state = Signal(self.max_exponent, reset=1)
-        self.enable = Signal(reset=1)
-
-    def elaborate(self, platform):
-        m = Module()
-        # do absolutely nothing if the polynomial is empty (always has a zero)
-        if self.max_exponent <= 1:
-            return m
-
-        # create XOR-bunch, select bits from state based on exponent
-        feedback = Const(0) # doesn't do any harm starting from 0b0 (xor chain)
-        for exponent in self:
-            if exponent > 0: # don't have to skip, saves CPU cycles though
-                feedback ^= self.state[exponent - 1]
-
-        # if enabled, shift-and-feedback
-        with m.If(self.enable):
-            # shift up lower bits by Cat'ing in a new bit zero (feedback)
-            newstate = Cat(feedback, self.state[:-1])
-            m.d.sync += self.state.eq(newstate)
-
-        return m
-
-
-# example: Poly24
-if __name__ == '__main__':
-    p24 = rtlil.convert(LFSR(LFSR_POLY_24))
-    with open("lfsr2_p24.il", "w") as f:
-        f.write(p24)
diff --git a/src/unused/TLB/LFSR.pyi b/src/unused/TLB/LFSR.pyi
deleted file mode 100644 (file)
index 64eb911..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-License-Identifier: LGPL-2.1-or-later
-# See Notices.txt for copyright information
-from nmigen import Module
-from typing import Iterable, Optional, Iterator, Any, Union
-from typing_extensions import final
-
-
-@final
-class LFSRPolynomial(set):
-    def __init__(self, exponents: Iterable[int] = ()):
-        def elements() -> Iterable[int]: ...
-    @property
-    def exponents(self) -> list[int]: ...
-    def __str__(self) -> str: ...
-    def __repr__(self) -> str: ...
-
-
-@final
-class LFSR:
-    def __init__(self, polynomial: Union[Iterable[int], LFSRPolynomial]): ...
-    @property
-    def width(self) -> int: ...
-    def elaborate(self, platform: Any) -> Module: ...
diff --git a/src/unused/TLB/Makefile b/src/unused/TLB/Makefile
deleted file mode 100644 (file)
index 1eb67ac..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-verilog:
-       python3 Cam.py generate -t v > Cam.v
diff --git a/src/unused/TLB/MemorySet.py b/src/unused/TLB/MemorySet.py
deleted file mode 100644 (file)
index 11890ed..0000000
+++ /dev/null
@@ -1,66 +0,0 @@
-from nmigen import Cat, Memory, Module, Signal, Elaboratable
-from nmigen.cli import main
-from nmigen.cli import verilog, rtlil
-
-
-class MemorySet(Elaboratable):
-    def __init__(self, data_size, tag_size, set_count, active):
-        self.active = active
-        input_size = tag_size + data_size  # Size of the input data
-        memory_width = input_size + 1  # The width of the cache memory
-        self.active = active
-        self.data_size = data_size
-        self.tag_size = tag_size
-
-        # XXX TODO, use rd-enable and wr-enable?
-        self.mem = Memory(width=memory_width, depth=set_count)
-        self.r = self.mem.read_port()
-        self.w = self.mem.write_port()
-
-        # inputs (address)
-        self.cset = Signal(range(set_count))  # The set to be checked
-        self.tag = Signal(tag_size)        # The tag to find
-        self.data_i = Signal(data_size)    # Incoming data
-
-        # outputs
-        self.valid = Signal()
-        self.data_o = Signal(data_size)    # Outgoing data (excludes tag)
-
-    def elaborate(self, platform):
-        m = Module()
-        m.submodules.mem = self.mem
-        m.submodules.r = self.r
-        m.submodules.w = self.w
-
-        # temporaries
-        active_bit = Signal()
-        tag_valid = Signal()
-        data_start = self.active + 1
-        data_end = data_start + self.data_size
-        tag_start = data_end
-        tag_end = tag_start + self.tag_size
-
-        # connect the read port address to the set/entry
-        read_port = self.r
-        m.d.comb += read_port.addr.eq(self.cset)
-        # Pull out active bit from data
-        data = read_port.data
-        m.d.comb += active_bit.eq(data[self.active])
-        # Validate given tag vs stored tag
-        tag = data[tag_start:tag_end]
-        m.d.comb += tag_valid.eq(self.tag == tag)
-        # An entry is only valid if the tags match AND
-        # is marked as a valid entry
-        m.d.comb += self.valid.eq(tag_valid & active_bit)
-
-        # output data: TODO, check rd-enable?
-        m.d.comb += self.data_o.eq(data[data_start:data_end])
-
-        # connect the write port addr to the set/entry (only if write enabled)
-        # (which is only done on a match, see SAC.write_entry below)
-        write_port = self.w
-        with m.If(write_port.en):
-            m.d.comb += write_port.addr.eq(self.cset)
-            m.d.comb += write_port.data.eq(Cat(1, self.data_i, self.tag))
-
-        return m
diff --git a/src/unused/TLB/PermissionValidator.py b/src/unused/TLB/PermissionValidator.py
deleted file mode 100644 (file)
index 5bc90b2..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-from nmigen import Module, Signal, Elaboratable
-from nmigen.cli import main
-
-from soc.TLB.PteEntry import PteEntry
-
-
-class PermissionValidator(Elaboratable):
-    """ The purpose of this Module is to check the Permissions of a given PTE
-        against the requested access permissions.
-
-        This module will either validate (by setting the valid bit HIGH)
-        the request or find a permission fault and invalidate (by setting
-        the valid bit LOW) the request
-    """
-
-    def __init__(self, asid_size, pte_size):
-        """ Arguments:
-            * asid_size: (bit count) The size of the asid to be processed
-            * pte_size: (bit count) The size of the pte to be processed
-
-            Return:
-            * valid HIGH when permissions are correct
-        """
-        # Internal
-        self.pte_entry = PteEntry(asid_size, pte_size)
-
-        # Input
-        self.data = Signal(asid_size + pte_size)
-        self.xwr = Signal(3)  # Execute, Write, Read
-        self.super_mode = Signal(1)  # Supervisor Mode
-        self.super_access = Signal(1)  # Supervisor Access
-        self.asid = Signal(15)  # Address Space IDentifier (ASID)
-
-        # Output
-        self.valid = Signal(1)  # Denotes if the permissions are correct
-
-    def elaborate(self, platform=None):
-        m = Module()
-
-        m.submodules.pte_entry = self.pte_entry
-
-        m.d.comb += self.pte_entry.i.eq(self.data)
-
-        # Check if the entry is valid
-        with m.If(self.pte_entry.v):
-            # ASID match or Global Permission
-            # Note that the MSB bound is exclusive
-            with m.If((self.pte_entry.asid == self.asid) | self.pte_entry.g):
-                # Check Execute, Write, Read (XWR) Permissions
-                with m.If(self.pte_entry.xwr == self.xwr):
-                    # Supervisor Logic
-                    with m.If(self.super_mode):
-                        # Valid if entry is not in user mode or supervisor
-                        # has Supervisor User Memory (SUM) access via the
-                        # SUM bit in the sstatus register
-                        m.d.comb += self.valid.eq((~self.pte_entry.u)
-                                                  | self.super_access)
-                    # User logic
-                    with m.Else():
-                        # Valid if the entry is in user mode only
-                        m.d.comb += self.valid.eq(self.pte_entry.u)
-                with m.Else():
-                    m.d.comb += self.valid.eq(0)
-            with m.Else():
-                m.d.comb += self.valid.eq(0)
-        with m.Else():
-            m.d.comb += self.valid.eq(0)
-        return m
diff --git a/src/unused/TLB/PteEntry.py b/src/unused/TLB/PteEntry.py
deleted file mode 100644 (file)
index 73ea922..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-from nmigen import Module, Signal, Elaboratable
-from nmigen.cli import main
-
-
-class PteEntry(Elaboratable):
-    """ The purpose of this Module is to  centralize the parsing of Page
-        Table Entries (PTE) into one module to prevent common mistakes
-        and duplication of code. The control bits are parsed out for
-        ease of use.
-
-        This module parses according to the standard PTE given by the
-        Volume II: RISC-V Privileged Architectures V1.10 Pg 60.
-        The Address Space IDentifier (ASID) is appended to the MSB of the input
-        and is parsed out as such.
-
-        An valid input Signal would be:
-              ASID   PTE
-        Bits:[78-64][63-0]
-
-        The output PTE value will include the control bits.
-    """
-    def __init__(self, asid_size, pte_size):
-        """ Arguments:
-            * asid_size: (bit count) The size of the asid to be processed
-            * pte_size: (bit count) The size of the pte to be processed
-
-            Return:
-            * d The Dirty bit from the PTE portion of i
-            * a The Accessed bit from the PTE portion of i
-            * g The Global bit from the PTE portion of i
-            * u The User Mode bit from the PTE portion of i
-            * xwr The Execute/Write/Read bit from the PTE portion of i
-            * v The Valid bit from the PTE portion of i
-            * asid The asid portion of i
-            * pte The pte portion of i
-        """
-        # Internal
-        self.asid_start = pte_size
-        self.asid_end = pte_size + asid_size
-
-        # Input
-        self.i = Signal(asid_size + pte_size)
-
-        # Output
-        self.d = Signal(1) # Dirty bit (From pte)
-        self.a = Signal(1) # Accessed bit (From pte)
-        self.g = Signal(1) # Global Access (From pte)
-        self.u = Signal(1) # User Mode (From pte)
-        self.xwr = Signal(3) # Execute Read Write (From pte)
-        self.v = Signal(1) # Valid (From pte)
-        self.asid = Signal(asid_size) # Associated Address Space IDentifier
-        self.pte = Signal(pte_size) # Full Page Table Entry
-
-    def elaborate(self, platform=None):
-        m = Module()
-        # Pull out all control bites from PTE
-        m.d.comb += [
-            self.d.eq(self.i[7]),
-            self.a.eq(self.i[6]),
-            self.g.eq(self.i[5]),
-            self.u.eq(self.i[4]),
-            self.xwr.eq(self.i[1:4]),
-            self.v.eq(self.i[0])
-        ]
-        m.d.comb += self.asid.eq(self.i[self.asid_start:self.asid_end])
-        m.d.comb += self.pte.eq(self.i[0:self.asid_start])
-        return m
diff --git a/src/unused/TLB/SetAssociativeCache.py b/src/unused/TLB/SetAssociativeCache.py
deleted file mode 100644 (file)
index 30ad809..0000000
+++ /dev/null
@@ -1,274 +0,0 @@
-"""
-
-Online simulator of 4-way set-associative cache:
-http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/sa4.html
-
-Python simulator of a N-way set-associative cache:
-https://github.com/vaskevich/CacheSim/blob/master/cachesim.py
-"""
-
-from nmigen import Array, Cat, Memory, Module, Signal, Mux, Elaboratable
-from nmigen.compat.genlib import fsm
-from nmigen.cli import main
-from nmigen.cli import verilog, rtlil
-
-from .AddressEncoder import AddressEncoder
-from .MemorySet import MemorySet
-
-# TODO: use a LFSR that advances continuously and picking the bottom
-# few bits from it to select which cache line to replace, instead of PLRU
-# http://bugs.libre-riscv.org/show_bug.cgi?id=71
-from .ariane.plru import PLRU
-from .LFSR import LFSR, LFSR_POLY_24
-
-SA_NA = "00"  # no action (none)
-SA_RD = "01"  # read
-SA_WR = "10"  # write
-
-
-class SetAssociativeCache(Elaboratable):
-    """ Set Associative Cache Memory
-
-        The purpose of this module is to generate a memory cache given the
-        constraints passed in. This will create a n-way set associative cache.
-        It is expected for the SV TLB that the VMA will provide the set number
-        while the ASID provides the tag (still to be decided).
-
-    """
-
-    def __init__(self, tag_size, data_size, set_count, way_count, lfsr=False):
-        """ Arguments
-            * tag_size (bits): The bit count of the tag
-            * data_size (bits): The bit count of the data to be stored
-            * set_count (number): The number of sets/entries in the cache
-            * way_count (number): The number of slots a data can be stored
-                                  in one set
-            * lfsr: if set, use an LFSR for (pseudo-randomly) selecting
-                    set/entry to write to.  otherwise, use a PLRU
-        """
-        # Internals
-        self.lfsr_mode = lfsr
-        self.way_count = way_count  # The number of slots in one set
-        self.tag_size = tag_size    # The bit count of the tag
-        self.data_size = data_size  # The bit count of the data to be stored
-
-        # set up Memory array
-        self.mem_array = Array()  # memory array
-        for i in range(way_count):
-            ms = MemorySet(data_size, tag_size, set_count, active=0)
-            self.mem_array.append(ms)
-
-        # Finds valid entries
-        self.encoder = AddressEncoder(way_count)
-
-        # setup PLRU or LFSR
-        if lfsr:
-            # LFSR mode
-            self.lfsr = LFSR(LFSR_POLY_24)
-        else:
-            # PLRU mode
-            # One block to handle plru calculations
-            self.plru = PLRU(way_count)
-            self.plru_array = Array()  # PLRU data on each set
-            for i in range(set_count):
-                name = "plru%d" % i
-                self.plru_array.append(Signal(self.plru.TLBSZ, name=name))
-
-        # Input
-        self.enable = Signal(1)   # Whether the cache is enabled
-        self.command = Signal(2)  # 00=None, 01=Read, 10=Write (see SA_XX)
-        self.cset = Signal(range(set_count))  # The set to be checked
-        self.tag = Signal(tag_size)        # The tag to find
-        self.data_i = Signal(data_size)    # The input data
-
-        # Output
-        self.ready = Signal(1)  # 0 => Processing 1 => Ready for commands
-        self.hit = Signal(1)            # Tag matched one way in the given set
-        # Tag matched many ways in the given set
-        self.multiple_hit = Signal(1)
-        self.data_o = Signal(data_size)  # The data linked to the matched tag
-
-    def check_tags(self, m):
-        """ Validate the tags in the selected set. If one and only one
-            tag matches set its state to zero and increment all others
-            by one. We only advance to next state if a single hit is found.
-        """
-        # Vector to store way valid results
-        # A zero denotes a way is invalid
-        valid_vector = []
-        # Loop through memory to prep read/write ports and set valid_vector
-        for i in range(self.way_count):
-            valid_vector.append(self.mem_array[i].valid)
-
-        # Pass encoder the valid vector
-        m.d.comb += self.encoder.i.eq(Cat(*valid_vector))
-
-        # Only one entry should be marked
-        # This is due to already verifying the tags
-        # matched and the valid bit is high
-        with m.If(self.hit):
-            m.next = "FINISHED_READ"
-            # Pull out data from the read port
-            data = self.mem_array[self.encoder.o].data_o
-            m.d.comb += self.data_o.eq(data)
-            if not self.lfsr_mode:
-                self.access_plru(m)
-
-        # Oh no! Seal the gates! Multiple tags matched?!? kasd;ljkafdsj;k
-        with m.Elif(self.multiple_hit):
-            # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
-            m.d.comb += self.data_o.eq(0)
-
-        # No tag matches means no data
-        with m.Else():
-            # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
-            m.d.comb += self.data_o.eq(0)
-
-    def access_plru(self, m):
-        """ An entry was accessed and the plru tree must now be updated
-        """
-        # Pull out the set's entry being edited
-        plru_entry = self.plru_array[self.cset]
-        m.d.comb += [
-            # Set the plru data to the current state
-            self.plru.plru_tree.eq(plru_entry),
-            # Set that the cache was accessed
-            self.plru.lu_access_i.eq(1)
-        ]
-
-    def read(self, m):
-        """ Go through the read process of the cache.
-            This takes two cycles to complete. First it checks for a valid tag
-            and secondly it updates the LRU values.
-        """
-        with m.FSM() as fsm_read:
-            with m.State("READY"):
-                m.d.comb += self.ready.eq(0)
-                # check_tags will set the state if the conditions are met
-                self.check_tags(m)
-            with m.State("FINISHED_READ"):
-                m.next = "READY"
-                m.d.comb += self.ready.eq(1)
-                if not self.lfsr_mode:
-                    plru_tree_o = self.plru.plru_tree_o
-                    m.d.sync += self.plru_array[self.cset].eq(plru_tree_o)
-
-    def write_entry(self, m):
-        if not self.lfsr_mode:
-            m.d.comb += [  # set cset (mem address) into PLRU
-                self.plru.plru_tree.eq(self.plru_array[self.cset]),
-                # and connect plru to encoder for write
-                self.encoder.i.eq(self.plru.replace_en_o)
-            ]
-            write_port = self.mem_array[self.encoder.o].w
-        else:
-            # use the LFSR to generate a random(ish) one of the mem array
-            lfsr_output = Signal(range(self.way_count))
-            lfsr_random = Signal(range(self.way_count))
-            m.d.comb += lfsr_output.eq(self.lfsr.state)  # lose some bits
-            # address too big, limit to range of array
-            m.d.comb += lfsr_random.eq(Mux(lfsr_output > self.way_count,
-                                           lfsr_output - self.way_count,
-                                           lfsr_output))
-            write_port = self.mem_array[lfsr_random].w
-
-        # then if there is a match from the encoder, enable the selected write
-        with m.If(self.encoder.single_match):
-            m.d.comb += write_port.en.eq(1)
-
-    def write(self, m):
-        """ Go through the write process of the cache.
-            This takes two cycles to complete. First it writes the entry,
-            and secondly it updates the PLRU (in plru mode)
-        """
-        with m.FSM() as fsm_write:
-            with m.State("READY"):
-                m.d.comb += self.ready.eq(0)
-                self.write_entry(m)
-                m.next = "FINISHED_WRITE"
-            with m.State("FINISHED_WRITE"):
-                m.d.comb += self.ready.eq(1)
-                if not self.lfsr_mode:
-                    plru_entry = self.plru_array[self.cset]
-                    m.d.sync += plru_entry.eq(self.plru.plru_tree_o)
-                m.next = "READY"
-
-    def elaborate(self, platform=None):
-        m = Module()
-
-        # ----
-        # set up Modules: AddressEncoder, LFSR/PLRU, Mem Array
-        # ----
-
-        m.submodules.AddressEncoder = self.encoder
-        if self.lfsr_mode:
-            m.submodules.LFSR = self.lfsr
-        else:
-            m.submodules.PLRU = self.plru
-
-        for i, mem in enumerate(self.mem_array):
-            setattr(m.submodules, "mem%d" % i, mem)
-
-        # ----
-        # select mode: PLRU connect to encoder, LFSR do... something
-        # ----
-
-        if not self.lfsr_mode:
-            # Set what entry was hit
-            m.d.comb += self.plru.lu_hit.eq(self.encoder.o)
-        else:
-            # enable LFSR
-            m.d.comb += self.lfsr.enable.eq(self.enable)
-
-        # ----
-        # connect hit/multiple hit to encoder output
-        # ----
-
-        m.d.comb += [
-            self.hit.eq(self.encoder.single_match),
-            self.multiple_hit.eq(self.encoder.multiple_match),
-        ]
-
-        # ----
-        # connect incoming data/tag/cset(addr) to mem_array
-        # ----
-
-        for mem in self.mem_array:
-            write_port = mem.w
-            m.d.comb += [mem.cset.eq(self.cset),
-                         mem.tag.eq(self.tag),
-                         mem.data_i.eq(self.data_i),
-                         write_port.en.eq(0),  # default: disable write
-                         ]
-        # ----
-        # Commands: READ/WRITE/TODO
-        # ----
-
-        with m.If(self.enable):
-            with m.Switch(self.command):
-                # Search all sets at a particular tag
-                with m.Case(SA_RD):
-                    self.read(m)
-                with m.Case(SA_WR):
-                    self.write(m)
-                    # Maybe catch multiple tags write here?
-                    # TODO
-                # TODO: invalidate/flush, flush-all?
-
-        return m
-
-    def ports(self):
-        return [self.enable, self.command, self.cset, self.tag, self.data_i,
-                self.ready, self.hit, self.multiple_hit, self.data_o]
-
-
-if __name__ == '__main__':
-    sac = SetAssociativeCache(4, 8, 4, 6)
-    vl = rtlil.convert(sac, ports=sac.ports())
-    with open("SetAssociativeCache.il", "w") as f:
-        f.write(vl)
-
-    sac_lfsr = SetAssociativeCache(4, 8, 4, 6, True)
-    vl = rtlil.convert(sac_lfsr, ports=sac_lfsr.ports())
-    with open("SetAssociativeCacheLFSR.il", "w") as f:
-        f.write(vl)
diff --git a/src/unused/TLB/TLB.py b/src/unused/TLB/TLB.py
deleted file mode 100644 (file)
index a3c0224..0000000
+++ /dev/null
@@ -1,177 +0,0 @@
-""" TLB Module
-
-    The expected form of the data is:
-    * Item (Bits)
-    * Tag (N - 79) / ASID (78 - 64) / PTE (63 - 0)
-"""
-
-from nmigen import Memory, Module, Signal, Cat, Elaboratable
-from nmigen.cli import main
-
-from .PermissionValidator import PermissionValidator
-from .Cam import Cam
-
-
-class TLB(Elaboratable):
-    def __init__(self, asid_size, vma_size, pte_size, L1_size):
-        """ Arguments
-            * asid_size: Address Space IDentifier (ASID) typically 15 bits
-            * vma_size: Virtual Memory Address (VMA) typically 36 bits
-            * pte_size: Page Table Entry (PTE) typically 64 bits
-
-            Notes:
-            These arguments should represent the largest possible size
-            defined by the MODE settings. See
-            Volume II: RISC-V Privileged Architectures V1.10 Page 57
-        """
-
-        # Internal
-        self.state = 0
-        # L1 Cache Modules
-        self.cam_L1 = Cam(vma_size, L1_size)
-        self.mem_L1 = Memory(width=asid_size + pte_size, depth=L1_size)
-
-        # Permission Validator
-        self.perm_validator = PermissionValidator(asid_size, pte_size)
-
-        # Inputs
-        self.supermode = Signal(1)  # Supervisor Mode
-        self.super_access = Signal(1)  # Supervisor Access
-        # 00=None, 01=Search, 10=Write L1, 11=Write L2
-        self.command = Signal(2)
-        self.xwr = Signal(3)  # Execute, Write, Read
-        self.mode = Signal(4)  # 4 bits for access to Sv48 on Rv64
-        self.address_L1 = Signal(range(L1_size))
-        self.asid = Signal(asid_size)  # Address Space IDentifier (ASID)
-        self.vma = Signal(vma_size)  # Virtual Memory Address (VMA)
-        self.pte_in = Signal(pte_size)  # To be saved Page Table Entry (PTE)
-
-        # Outputs
-        self.hit = Signal(1)  # Denotes if the VMA had a mapped PTE
-        self.perm_valid = Signal(1)  # Denotes if the permissions are correct
-        self.pte_out = Signal(pte_size)  # PTE that was mapped to by the VMA
-
-    def search(self, m, read_L1, write_L1):
-        """ searches the TLB
-        """
-        m.d.comb += [
-            write_L1.en.eq(0),
-            self.cam_L1.write_enable.eq(0),
-            self.cam_L1.data_in.eq(self.vma)
-        ]
-        # Match found in L1 CAM
-        match_found = Signal(reset_less=True)
-        m.d.comb += match_found.eq(self.cam_L1.single_match
-                                   | self.cam_L1.multiple_match)
-        with m.If(match_found):
-            # Memory shortcut variables
-            mem_address = self.cam_L1.match_address
-            # Memory Logic
-            m.d.comb += read_L1.addr.eq(mem_address)
-            # Permission Validator Logic
-            m.d.comb += [
-                self.hit.eq(1),
-                # Set permission validator data to the correct
-                # register file data according to CAM match
-                # address
-                self.perm_validator.data.eq(read_L1.data),
-                # Execute, Read, Write
-                self.perm_validator.xwr.eq(self.xwr),
-                # Supervisor Mode
-                self.perm_validator.super_mode.eq(self.supermode),
-                # Supverisor Access
-                self.perm_validator.super_access.eq(self.super_access),
-                # Address Space IDentifier (ASID)
-                self.perm_validator.asid.eq(self.asid),
-                # Output result of permission validation
-                self.perm_valid.eq(self.perm_validator.valid)
-            ]
-            # Only output PTE if permissions are valid
-            with m.If(self.perm_validator.valid):
-                # XXX TODO - dummy for now
-                reg_data = Signal.like(self.pte_out)
-                m.d.comb += [
-                    self.pte_out.eq(reg_data)
-                ]
-            with m.Else():
-                m.d.comb += [
-                    self.pte_out.eq(0)
-                ]
-        # Miss Logic
-        with m.Else():
-            m.d.comb += [
-                self.hit.eq(0),
-                self.perm_valid.eq(0),
-                self.pte_out.eq(0)
-            ]
-
-    def write_l1(self, m, read_L1, write_L1):
-        """ writes to the L1 cache
-        """
-        # Memory_L1 Logic
-        m.d.comb += [
-            write_L1.en.eq(1),
-            write_L1.addr.eq(self.address_L1),
-            # The Cat places arguments from LSB -> MSB
-            write_L1.data.eq(Cat(self.pte_in, self.asid))
-        ]
-        # CAM_L1 Logic
-        m.d.comb += [
-            self.cam_L1.write_enable.eq(1),
-            self.cam_L1.data_in.eq(self.vma),  # data_in is sent to all entries
-            # self.cam_L1.address_in.eq(todo) # a CAM entry needs to be selected
-
-        ]
-
-    def elaborate(self, platform):
-        m = Module()
-        # Add submodules
-        # Submodules for L1 Cache
-        m.submodules.cam_L1 = self.cam_L1
-        m.submodules.read_L1 = read_L1 = self.mem_L1.read_port()
-        m.submodules.write_L1 = write_L1 = self.mem_L1.write_port()
-
-        # Permission Validator Submodule
-        m.submodules.perm_valididator = self.perm_validator
-
-        # When MODE specifies translation
-        # TODO add in different bit length handling ie prefix 0s
-        tlb_enable = Signal(reset_less=True)
-        m.d.comb += tlb_enable.eq(self.mode != 0)
-
-        with m.If(tlb_enable):
-            m.d.comb += [
-                self.cam_L1.enable.eq(1)
-            ]
-            with m.Switch(self.command):
-                # Search
-                with m.Case("01"):
-                    self.search(m, read_L1, write_L1)
-
-                # Write L1
-                # Expected that the miss will be handled in software
-                with m.Case("10"):
-                    self.write_l1(m, read_L1, write_L1)
-
-                # TODO
-                # with m.Case("11"):
-
-        # When disabled
-        with m.Else():
-            m.d.comb += [
-                self.cam_L1.enable.eq(0),
-                # XXX TODO - self.reg_file.enable.eq(0),
-                self.hit.eq(0),
-                self.perm_valid.eq(0),  # XXX TODO, check this
-                self.pte_out.eq(0)
-            ]
-        return m
-
-
-if __name__ == '__main__':
-    tlb = TLB(15, 36, 64, 4)
-    main(tlb, ports=[tlb.supermode, tlb.super_access, tlb.command,
-                     tlb.xwr, tlb.mode, tlb.address_L1, tlb.asid,
-                     tlb.vma, tlb.pte_in,
-                     tlb.hit, tlb.perm_valid, tlb.pte_out,
-                     ] + tlb.cam_L1.ports())
diff --git a/src/unused/TLB/__init__.py b/src/unused/TLB/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/src/unused/TLB/ariane/TreePLRU.cpp b/src/unused/TLB/ariane/TreePLRU.cpp
deleted file mode 100644 (file)
index 2f6aeea..0000000
+++ /dev/null
@@ -1,211 +0,0 @@
-#include <cstdint>
-#include <iostream>
-#include <cmath>
-
-
-#define NWAY 4
-#define NLINE 256
-#define HIT 0
-#define MISS 1
-#define MS 1000
-/*
-Detailed TreePLRU inference see here: https://docs.google.com/spreadsheets/d/14zQpPYPwDAbCCjBT_a3KLaE5FEk-RNhI8Z7Qm_biW8g/edit?usp=sharing
-Ref: https://people.cs.clemson.edu/~mark/464/p_lru.txt
-four-way set associative - three bits
-   each bit represents one branch point in a binary decision tree; let 1
-   represent that the left side has been referenced more recently than the
-   right side, and 0 vice-versa
-              are all 4 lines valid?
-                   /       \
-                 yes        no, use an invalid line
-                  |
-                  |
-                  |
-             bit_0 == 0?            state | replace      ref to | next state
-              /       \             ------+--------      -------+-----------
-             y         n             00x  |  line_0      line_0 |    11_
-            /           \            01x  |  line_1      line_1 |    10_
-     bit_1 == 0?    bit_2 == 0?      1x0  |  line_2      line_2 |    0_1
-       /    \          /    \        1x1  |  line_3      line_3 |    0_0
-      y      n        y      n
-     /        \      /        \        ('x' means       ('_' means unchanged)
-   line_0  line_1  line_2  line_3      don't care)
- 8-way set associative - 7  = 1+2+4 bits
-16-way set associative - 15 = 1+2+4+8 bits
-32-way set associative - 31 = 1+2+4+8+16 bits
-64-way set associative - 63 = 1+2+4+8+16+32 bits
-*/
-using namespace std;
-struct AddressField {
-    uint64_t wd_idx : 2;//Unused
-    uint64_t offset : 4;//Unused
-    uint64_t index  : 8;//NLINE = 256 = 2^8
-    uint64_t tag    : 50;
-};
-
-union Address {
-    uint32_t* p;
-    AddressField fields;
-};
-
-struct Cell {
-    bool v;
-    uint64_t tag;
-
-    Cell() : v(false), tag(0) {}
-
-    bool isHit(uint64_t tag) {
-        return v && (tag == this->tag);
-    }
-
-    void fetch(uint32_t* address) {
-        Address addr;
-        addr.p = address;
-        addr.fields.offset = 0;
-        addr.fields.wd_idx = 0;
-        tag = addr.fields.tag;
-        v = true;
-    }
-};
-
-ostream& operator<<(ostream & out, const Cell& cell) {
-    out << " v:" << cell.v << " tag:" << hex << cell.tag;
-    return out;
-}
-
-struct Block {
-    Cell cell[NWAY];
-    uint32_t state;
-    uint64_t *mask;//Mask the state to get accurate value for specified 1 bit.
-    uint64_t *value;
-    uint64_t *next_value;
-
-    Block() : state(0) {
-        switch (NWAY) {
-            case 4:
-                mask = new uint64_t[4]{0b110, 0b110, 0b101, 0b101};
-                value = new uint64_t[4]{0b000, 0b010, 0b100, 0b101};
-                next_value = new uint64_t[4]{0b110, 0b100, 0b001, 0b000};
-                break;
-            case 8:
-                mask = new uint64_t[8]{0b1101000, 0b1101000, 0b1100100, 0b1100100, 0b1010010, 0b1010010, 0b1010001,
-                                       0b1010001};
-                value = new uint64_t[8]{0b0000000, 0b0001000, 0b0100000, 0b0100100, 0b1000000, 0b1000010, 0b1010000,
-                                        0b1010001};
-                next_value = new uint64_t[8]{0b1101000, 0b1100000, 0b1000100, 0b1000000, 0b0010010, 0b0010000,
-                                             0b0000001, 0b0000000};
-                break;
-                //TODO - more NWAY goes here.
-            default:
-                std::cout << "Error definition NWAY = " << NWAY << std::endl;
-        }
-    }
-
-    uint32_t *getByTag(uint64_t tag, uint32_t *pway) {
-        for (int i = 0; i < NWAY; ++i) {
-            if (cell[i].isHit(tag)) {
-                *pway = i;
-                return pway;
-            }
-        }
-        return NULL;
-    }
-
-    void setLRU(uint32_t *address) {
-        int way = 0;
-        uint32_t st = state;
-        for (int i = 0; i < NWAY; ++i) {
-            if ((state & mask[i]) == value[i]) {
-                state ^= mask[i];
-                way = i;
-                break;
-            }
-        }
-        cell[way].fetch(address);
-        cout << "MISS: way:" << way << " address:" << address << " state:" << st << "->" << state << endl;
-    }
-
-    uint32_t *get(uint32_t *address, uint32_t *pway) {
-        Address addr;
-        addr.p = address;
-        uint32_t *d = getByTag(addr.fields.tag, pway);
-        if (d != NULL) {
-            return &d[addr.fields.offset];
-        }
-        return d;
-    }
-
-    int set(uint32_t *address) {
-        uint32_t way = 0;
-        uint32_t *p = get(address, &way);
-        if (p != NULL) {
-            printf("HIT: address:%p ref_to way:%d state %X --> ", address, way, state);
-            state &= ~mask[way];
-            printf("%X --> ", state);
-            state |= next_value[way];
-            printf("%X\n", state);
-            // *p = *address; //skip since address is fake.
-            return HIT;
-        } else {
-            setLRU(address);
-            return MISS;
-        }
-    }
-};
-
-ostream& operator<<(ostream & out, const Block& block) {
-    out << "state:" << block.state << " ";
-    for (int i = 0; i<NWAY; i++) {
-        out << block.cell[i];
-    }
-    return out;
-}
-
-struct Cache {
-    Block block[NLINE];
-    uint32_t count[2];
-    Cache() { count[HIT] = 0; count[MISS] = 0; }
-
-    void access(uint32_t* address) {
-        Address addr;
-        addr.p = address;
-        Block& b = block[addr.fields.index];
-        ++count[b.set(address)];
-    }
-
-};
-ostream& operator<<(ostream & out, const Cache& cache) {
-    out << "\n==Summary==\n\tHit: " << cache.count[HIT] <<  " Miss: " << cache.count[MISS] << std::endl;
-    for (int i = 0; i < NLINE; i++) {
-        out << cache.block[i] << endl;
-    }
-    return out;
-}
-
-Cache cache;
-void multiply(uint32_t* m1, uint32_t* m2, uint32_t* res)
-{
-    int x, i, j;
-    for (i = 0; i < MS; i++) {
-        for (j = 0; j < MS; j++) {
-            cache.access(res + i*MS +j);
-            for (x = 0; x < MS; x++) {
-                cache.access(m1 + i*MS + x);
-                cache.access(m2 + x*MS + j);
-                cache.access(res + i*MS +j);
-                // res[i][j] += m1[i][x] * m2[x][j];
-                cache.access(res + i*MS +j);
-            }
-        }
-    }
-}
-
-int main()
-{
-    uint32_t* m1 = (uint32_t*) 0xFACE00A000000000LL;  // fake virtual address; don’t access it
-    uint32_t* m2 = (uint32_t*) 0xFACE00B000000000LL;  // fake virtual address; don’t access it
-    uint32_t* res =  (uint32_t*) 0xFACE00C000000000LL; // fake virtual address; don’t access it
-    multiply(m1, m2, res);
-    cout << cache << endl;
-    return 0;
-}
diff --git a/src/unused/TLB/ariane/__init__.py b/src/unused/TLB/ariane/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/src/unused/TLB/ariane/exceptcause.py b/src/unused/TLB/ariane/exceptcause.py
deleted file mode 100644 (file)
index 4c5cb2d..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-from nmigen import Const
-
-INSTR_ADDR_MISALIGNED = Const(0, 64)
-INSTR_ACCESS_FAULT    = Const(1, 64)
-ILLEGAL_INSTR         = Const(2, 64)
-BREAKPOINT            = Const(3, 64)
-LD_ADDR_MISALIGNED    = Const(4, 64)
-LD_ACCESS_FAULT       = Const(5, 64)
-ST_ADDR_MISALIGNED    = Const(6, 64)
-ST_ACCESS_FAULT       = Const(7, 64)
-ENV_CALL_UMODE        = Const(8, 64)  # environment call from user mode
-ENV_CALL_SMODE        = Const(9, 64)  # environment call from supervisor mode
-ENV_CALL_MMODE        = Const(11, 64) # environment call from machine mode
-INSTR_PAGE_FAULT      = Const(12, 64) # Instruction page fault
-LOAD_PAGE_FAULT       = Const(13, 64) # Load page fault
-STORE_PAGE_FAULT      = Const(15, 64) # Store page fault
diff --git a/src/unused/TLB/ariane/miss_handler.py b/src/unused/TLB/ariane/miss_handler.py
deleted file mode 100644 (file)
index 5ddc725..0000000
+++ /dev/null
@@ -1,786 +0,0 @@
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License.  You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: Florian Zaruba, ETH Zurich
-# Date: 12.11.2017
-# Description: Handles cache misses.
-from nmigen.lib.coding import Encoder, PriorityEncoder
-
-
-# --------------
-# MISS Handler
-# --------------
-import ariane_pkg::*;
-import std_cache_pkg::*;
-
-unsigned NR_PORTS         = 3
-
-class MissReq(RecordObject):
-    def __init__(self, name=None):
-        Record.__init__(self, name)
-        self.valid = Signal()
-        self.addr = Signal(64)
-        self.be = Signal(8)
-        self.size = Signal(2)
-        self.we = Signal()
-        self.wdata = Signal(64)
-        bypass = Signal()
-
-class CacheLine:
-    def __init__(self):
-        self.tag = Signal(DCACHE_TAG_WIDTH) # tag array
-        self.data = Signal(DCACHE_LINE_WIDTH) # data array
-        self.valid = Signal() # state array
-        self.dirty = Signal() # state array
-
-# cache line byte enable
-class CLBE:
-    def __init__(self):
-        self.tag = Signal(DCACHE_TAG_WIDTH+7)//8) # byte enable into tag array
-        self.data = Signal(DCACHE_LINE_WIDTH+7)//8) # byte enable data array
-        # bit enable into state array (valid for a pair of dirty/valid bits)
-        self.vldrty = Signal(DCACHE_SET_ASSOC)
-    } cl_be_t;
-
-
-
-    # FSM states
-"""
-    enum logic [3:0] {
-        IDLE,               # 0
-        FLUSHING,           # 1
-        FLUSH,              # 2
-        WB_CACHELINE_FLUSH, # 3
-        FLUSH_REQ_STATUS,   # 4
-        WB_CACHELINE_MISS,  # 5
-        WAIT_GNT_SRAM,      # 6
-        MISS,               # 7
-        REQ_CACHELINE,      # 8
-        MISS_REPL,          # 9
-        SAVE_CACHELINE,     # A
-        INIT,               # B
-        AMO_LOAD,           # C
-        AMO_SAVE_LOAD,      # D
-        AMO_STORE           # E
-    } state_d, state_q;
-"""
-
-class MissHandler(Elaboratable):
-    def __init__(self, NR_PORTS):
-        self.NR_PORTS = NR_PORTS
-        self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
-        self.flush_i = Signal()      # flush request
-        self.flush_ack_o = Signal()  # acknowledge successful flush
-        self.miss_o = Signal()
-        self.busy_i = Signal()       # dcache is busy with something
-
-        # Bypass or miss
-        self.miss_req_i = Array(MissReq(name="missreq") for i in range(NR_PORTS))
-        # Bypass handling
-        self.bypass_gnt_o = Signal(NR_PORTS)
-        self.bypass_valid_o = Signal(NR_PORTS)
-        self.bypass_data_o = Array(Signal(name="bdata_o", 64) \
-                                    for i in range(NR_PORTS))
-
-        # AXI port
-        output ariane_axi::req_t                            axi_bypass_o,
-        input  ariane_axi::resp_t                           axi_bypass_i,
-
-        # Miss handling (~> cacheline refill)
-        self.miss_gnt_o = Signal(NR_PORTS)
-        self.active_serving_o = Signal(NR_PORTS)
-
-        self.critical_word_o = Signal(64)
-        self.critical_word_valid_o = Signal()
-        output ariane_axi::req_t                            axi_data_o,
-        input  ariane_axi::resp_t                           axi_data_i,
-
-        self.mshr_addr_i = Array(Signal(name="bdata_o", 56) \
-                                    for i in range(NR_PORTS))
-        self.mshr_addr_matches_o = Signal(NR_PORTS)
-        self.mshr_index_matches_o = Signal(NR_PORTS)
-
-        # AMO
-        self.amo_req_i = AMOReq()
-        self.amo_resp_o = AMOResp()
-        # Port to SRAMs, for refill and eviction
-        self.req_o = Signal(DCACHE_SET_ASSOC)
-        self.addr_o = Signal(DCACHE_INDEX_WIDTH) # address into cache array
-        self.data_o = CacheLine()
-        self.be_o = CLBE()
-        self.data_i = Array(CacheLine() \
-                                    for i in range(DCACHE_SET_ASSOC))
-        self.we_o = Signal()
-
-    def elaborate(self, platform):
-        # Registers
-        mshr_t                                  mshr_d, mshr_q;
-        logic [DCACHE_INDEX_WIDTH-1:0]          cnt_d, cnt_q;
-        logic [DCACHE_SET_ASSOC-1:0]            evict_way_d, evict_way_q;
-        # cache line to evict
-        cache_line_t                            evict_cl_d, evict_cl_q;
-
-        logic serve_amo_d, serve_amo_q;
-        # Request from one FSM
-        miss_req_valid = Signal(self.NR_PORTS)
-        miss_req_bypass = Signal(self.NR_PORTS)
-        miss_req_addr = Array(Signal(name="miss_req_addr", 64) \
-                                    for i in range(NR_PORTS))
-        miss_req_wdata = Array(Signal(name="miss_req_wdata", 64) \
-                                    for i in range(NR_PORTS))
-        miss_req_we = Signal(self.NR_PORTS)
-        miss_req_be = Array(Signal(name="miss_req_be", 8) \
-                                    for i in range(NR_PORTS))
-        miss_req_size = Array(Signal(name="miss_req_size", 2) \
-                                    for i in range(NR_PORTS))
-
-        # Cache Line Refill <-> AXI
-        req_fsm_miss_valid = Signal()
-        req_fsm_miss_addr = Signal(64)
-        req_fsm_miss_wdata = Signal(DCACHE_LINE_WIDTH)
-        req_fsm_miss_we = Signal()
-        req_fsm_miss_be = Signal(DCACHE_LINE_WIDTH//8)
-        ariane_axi::ad_req_t                     req_fsm_miss_req;
-        req_fsm_miss_size = Signal(2)
-
-        gnt_miss_fsm = Signal()
-        valid_miss_fsm = Signal()
-        nmiss = DCACHE_LINE_WIDTH//64
-        data_miss_fsm = Array(Signal(name="data_miss_fsm", 64) \
-                                    for i in range(nmiss))
-
-        # Cache Management <-> LFSR
-        lfsr_enable = Signal()
-        lfsr_oh = Signal(DCACHE_SET_ASSOC)
-        lfsr_bin = Signal($clog2(DCACHE_SET_ASSOC-1))
-        # AMOs
-        ariane_pkg::amo_t amo_op;
-        amo_operand_a = Signal(64)
-        amo_operand_b = Signal(64)
-        amo_result_o = Signal(64)
-
-        struct packed {
-            logic [63:3] address;
-            logic        valid;
-        } reservation_d, reservation_q;
-
-        # ------------------------------
-        # Cache Management
-        # ------------------------------
-        evict_way = Signal(DCACHE_SET_ASSOC)
-        valid_way = Signal(DCACHE_SET_ASSOC)
-
-        for (i in range(DCACHE_SET_ASSOC):
-            comb += evict_way[i].eq(data_i[i].valid & data_i[i].dirty)
-            comb += valid_way[i].eq(data_i[i].valid)
-
-        # ----------------------
-        # Default Assignments
-        # ----------------------
-        # to AXI refill
-        req_fsm_miss_req    = ariane_axi::CACHE_LINE_REQ;
-        req_fsm_miss_size   = Const(0b11, 2)
-        # core
-        serve_amo_d         = serve_amo_q;
-        # --------------------------------
-        # Flush and Miss operation
-        # --------------------------------
-        state_d      = state_q;
-        cnt_d        = cnt_q;
-        evict_way_d  = evict_way_q;
-        evict_cl_d   = evict_cl_q;
-        mshr_d       = mshr_q;
-        # communicate to the requester which unit we are currently serving
-        active_serving_o[mshr_q.id] = mshr_q.valid;
-        # AMOs
-        # silence the unit when not used
-        amo_op = amo_req_i.amo_op;
-
-        reservation_d = reservation_q;
-        with m.FSM() as state_q:
-
-            with m.Case("IDLE"):
-                # lowest priority are AMOs, wait until everything else
-                # is served before going for the AMOs
-                with m.If (amo_req_i.req & ~busy_i):
-                    # 1. Flush the cache
-                    with m.If(~serve_amo_q):
-                        m.next = "FLUSH_REQ_STATUS"
-                        serve_amo_d.eq(0b1
-                        cnt_d.eq(0
-                    # 2. Do the AMO
-                    with m.Else():
-                        m.next = "AMO_LOAD"
-                        serve_amo_d.eq(0b0
-
-                # check if we want to flush and can flush
-                # e.g.: we are not busy anymore
-                # TODO: Check that the busy flag is indeed needed
-                with m.If (flush_i & ~busy_i):
-                    m.next = "FLUSH_REQ_STATUS"
-                    cnt_d = 0
-
-                # check if one of the state machines missed
-                for i in range(NR_PORTS):
-                    # here comes the refill portion of code
-                    with m.If (miss_req_valid[i] & ~miss_req_bypass[i]):
-                        m.next = "MISS"
-                        # we are taking another request so don't
-                        # take the AMO
-                        serve_amo_d  = 0b0;
-                        # save to MSHR
-                        wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
-                        comb += [ mshr_d.valid.eq(0b1),
-                                  mshr_d.we.eq(miss_req_we[i]),
-                                  mshr_d.id.eq(i),
-                                  mshr_d.addr.eq(miss_req_addr[i][0:wid]),
-                                  mshr_d.wdata.eq(miss_req_wdata[i]),
-                                  mshr_d.be.eq(miss_req_be[i]),
-                                ]
-                        break
-
-            #  ~> we missed on the cache
-            with m.Case("MISS"):
-                # 1. Check if there is an empty cache-line
-                # 2. If not -> evict one
-                comb += req_o.eq(1)
-                sync += addr_o.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH]
-                m.next = "MISS_REPL"
-                comb += miss_o.eq(1)
-
-            # ~> second miss cycle
-            with m.Case("MISS_REPL"):
-                # if all are valid we need to evict one, 
-                # pseudo random from LFSR
-                with m.If(~(~valid_way).bool()):
-                    comb += lfsr_enable.eq(0b1)
-                    comb += evict_way_d.eq(lfsr_oh)
-                    # do we need to write back the cache line?
-                    with m.If(data_i[lfsr_bin].dirty):
-                        state_d = WB_CACHELINE_MISS;
-                        comb += evict_cl_d.tag.eq(data_i[lfsr_bin].tag)
-                        comb += evict_cl_d.data.eq(data_i[lfsr_bin].data)
-                        comb += cnt_d.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
-                    # no - we can request a cache line now
-                    with m.Else():
-                        m.next = "REQ_CACHELINE"
-                # we have at least one free way
-                with m.Else():
-                    # get victim cache-line by looking for the
-                    # first non-valid bit
-                    comb += evict_way_d.eq(get_victim_cl(~valid_way)
-                    m.next = "REQ_CACHELINE"
-
-            # ~> we can just load the cache-line,
-            # the way is store in evict_way_q
-            with m.Case("REQ_CACHELINE"):
-                comb += req_fsm_miss_valid .eq(1)
-                sync += req_fsm_miss_addr  .eq(mshr_q.addr)
-
-                with m.If (gnt_miss_fsm):
-                    m.next = "SAVE_CACHELINE"
-                    comb += miss_gnt_o[mshr_q.id].eq(1)
-
-            # ~> replace the cacheline
-            with m.Case("SAVE_CACHELINE"):
-                # calculate cacheline offset
-                automatic logic [$clog2(DCACHE_LINE_WIDTH)-1:0] cl_offset;
-                sync += cl_offset.eq(mshr_q.addr[3:DCACHE_BYTE_OFFSET] << 6)
-                # we've got a valid response from refill unit
-                with m.If (valid_miss_fsm):
-                    wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
-                    sync += addr_o      .eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
-                    sync += req_o       .eq(evict_way_q)
-                    comb += we_o        .eq(1)
-                    comb += be_o        .eq(1)
-                    sync += be_o.vldrty .eq(evict_way_q)
-                    sync += data_o.tag  .eq(mshr_q.addr[DCACHE_INDEX_WIDTH:wid]
-                    comb += data_o.data .eq(data_miss_fsm)
-                    comb += data_o.valid.eq(1)
-                    comb += data_o.dirty.eq(0)
-
-                    # is this a write?
-                    with m.If (mshr_q.we):
-                        # Yes, so safe the updated data now
-                        for i in range(8):
-                            # check if we really want to write
-                            # the corresponding byte
-                            with m.If (mshr_q.be[i]):
-                                sync += data_o.data[(cl_offset + i*8) +: 8].eq(mshr_q.wdata[i];
-                        # it's immediately dirty if we write
-                        comb += data_o.dirty.eq(1)
-
-                    # reset MSHR
-                    comb += mshr_d.valid.eq(0)
-                    # go back to idle
-                    m.next = 'IDLE'
-
-            # ------------------------------
-            # Write Back Operation
-            # ------------------------------
-            # ~> evict a cache line from way saved in evict_way_q
-            with m.Case("WB_CACHELINE_FLUSH"):
-            with m.Case("WB_CACHELINE_MISS"):
-
-                comb += req_fsm_miss_valid .eq(0b1)
-                sync += req_fsm_miss_addr  .eq({evict_cl_q.tag, cnt_q[DCACHE_INDEX_WIDTH-1:DCACHE_BYTE_OFFSET], {{DCACHE_BYTE_OFFSET}{0b0}}};
-                comb += req_fsm_miss_be    .eq(1)
-                comb += req_fsm_miss_we    .eq(0b1)
-                sync += req_fsm_miss_wdata .eq(evict_cl_q.data;
-
-                # we've got a grant --> this is timing critical, think about it
-                if (gnt_miss_fsm) begin
-                    # write status array
-                    sync += addr_o    .eq(cnt_q)
-                    comb += req_o     .eq(0b1)
-                    comb += we_o      .eq(0b1)
-                    comb += data_o.valid.eq(INVALIDATE_ON_FLUSH ? 0b0 : 0b1)
-                    # invalidate
-                    sync += be_o.vldrty.eq(evict_way_q)
-                    # go back to handling the miss or flushing,
-                    # depending on where we came from
-                    with m.If(state_q == WB_CACHELINE_MISS):
-                        m.next = "MISS"
-                    with m.Else():
-                        m.next = "FLUSH_REQ_STATUS"
-
-            # ------------------------------
-            # Flushing & Initialization
-            # ------------------------------
-            # ~> make another request to check the same
-            # cache-line if there are still some valid entries
-            with m.Case("FLUSH_REQ_STATUS"):
-                comb += req_o  .eq(1)
-                sync += addr_o .eq(cnt_q)
-                m.next = "FLUSHING"
-
-            with m.Case("FLUSHING"):
-                # this has priority
-                # at least one of the cache lines is dirty
-                with m.If(~evict_way):
-                    # evict cache line, look for the first
-                    # cache-line which is dirty
-                    comb += evict_way_d.eq(get_victim_cl(evict_way))
-                    comb += evict_cl_d .eq(data_i[one_hot_to_bin(evict_way)])
-                    state_d     = WB_CACHELINE_FLUSH;
-                # not dirty ~> increment and continue
-                with m.Else():
-                    # increment and re-request
-                    sync += cnt_d.eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
-                    m.next = "FLUSH_REQ_STATUS"
-                    sync += addr_o     .eq(cnt_q)
-                    comb += req_o      .eq(1)
-                    comb += be_o.vldrty.eq(INVALIDATE_ON_FLUSH ? 1 : 0)
-                    comb += we_o       .eq(1)
-                    # finished with flushing operation, go back to idle
-                    with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
-                               == DCACHE_NUM_WORDS-1):
-                        # only acknowledge if the flush wasn't
-                        # triggered by an atomic
-                        sync += flush_ack_o.eq(~serve_amo_q)
-                        m.next = "IDLE"
-
-            # ~> only called after reset
-            with m.Case("INIT"):
-                # initialize status array
-                sync += addr_o.eq(cnt_q)
-                comb += req_o .eq(1)
-                comb += we_o  .eq(1)
-                # only write the dirty array
-                comb += be_o.vldrty.eq(1)
-                sync += cnt_d      .eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
-                # finished initialization
-                with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
-                            == DCACHE_NUM_WORDS-1)
-                    m.next = "IDLE"
-
-            # ----------------------
-            # AMOs
-            # ----------------------
-            # TODO(zarubaf) Move this closer to memory
-            # ~> we are here because we need to do the AMO,
-            # the cache is clean at this point
-            # start by executing the load
-            with m.Case("AMO_LOAD"):
-                comb += req_fsm_miss_valid.eq(1)
-                # address is in operand a
-                comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
-                comb += req_fsm_miss_req.eq(ariane_axi::SINGLE_REQ)
-                comb += req_fsm_miss_size.eq(amo_req_i.size)
-                # the request has been granted
-                with m.If(gnt_miss_fsm):
-                    m.next = "AMO_SAVE_LOAD"
-            # save the load value
-            with m.Case("AMO_SAVE_LOAD"):
-                with m.If (valid_miss_fsm):
-                    # we are only concerned about the lower 64-bit
-                    comb += mshr_d.wdata.eq(data_miss_fsm[0])
-                    m.next = "AMO_STORE"
-            # and do the store
-            with m.Case("AMO_STORE"):
-                load_data = Signal(64)
-                # re-align load data
-                comb += load_data.eq(data_align(amo_req_i.operand_a[:3],
-                                                mshr_q.wdata))
-                # Sign-extend for word operation
-                with m.If (amo_req_i.size == 0b10):
-                    comb += amo_operand_a.eq(sext32(load_data[:32]))
-                    comb += amo_operand_b.eq(sext32(amo_req_i.operand_b[:32]))
-                with m.Else():
-                    comb += amo_operand_a.eq(load_data)
-                    comb += amo_operand_b.eq(amo_req_i.operand_b)
-
-                #  we do not need a store request for load reserved
-                # or a failing store conditional
-                #  we can bail-out without making any further requests
-                with m.If ((amo_req_i.amo_op == AMO_LR) | \
-                           ((amo_req_i.amo_op == AMO_SC) & \
-                           ((reservation_q.valid & \
-                            (reservation_q.address != \
-                             amo_req_i.operand_a[3:64])) | \
-                             ~reservation_q.valid))):
-                    comb += req_fsm_miss_valid.eq(0)
-                    m.next = "IDLE"
-                    comb += amo_resp_o.ack.eq(1)
-                    # write-back the result
-                    comb += amo_resp_o.result.eq(amo_operand_a)
-                    # we know that the SC failed
-                    with m.If (amo_req_i.amo_op == AMO_SC):
-                        comb += amo_resp_o.result.eq(1)
-                        # also clear the reservation
-                        comb += reservation_d.valid.eq(0)
-                with m.Else():
-                    comb += req_fsm_miss_valid.eq(1)
-
-                comb += req_fsm_miss_we  .eq(1)
-                comb += req_fsm_miss_req .eq(ariane_axi::SINGLE_REQ)
-                comb += req_fsm_miss_size.eq(amo_req_i.size)
-                comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
-
-                comb += req_fsm_miss_wdata.eq(
-                    data_align(amo_req_i.operand_a[0:3], amo_result_o))
-                comb += req_fsm_miss_be.eq(
-                    be_gen(amo_req_i.operand_a[0:3], amo_req_i.size))
-
-                # place a reservation on the memory
-                with m.If (amo_req_i.amo_op == AMO_LR):
-                    comb += reservation_d.address.eq(amo_req_i.operand_a[3:64])
-                    comb += reservation_d.valid.eq(1)
-
-                # the request is valid or we didn't need to go for another store
-                with m.If (valid_miss_fsm):
-                    m.next = "IDLE"
-                    comb += amo_resp_o.ack.eq(1)
-                    # write-back the result
-                    comb += amo_resp_o.result.eq(amo_operand_a;
-
-                    if (amo_req_i.amo_op == AMO_SC) begin
-                        comb += amo_resp_o.result.eq(0)
-                        # An SC must fail if there is another SC
-                        # (to any address) between the LR and the SC in
-                        # program order (even to the same address).
-                        # in any case destroy the reservation
-                        comb += reservation_d.valid.eq(0)
-
-        # check MSHR for aliasing
-
-        comb += mshr_addr_matches_o .eq(0)
-        comb += mshr_index_matches_o.eq()
-
-        for i in range(NR_PORTS):
-            # check mshr for potential matching of other units,
-            # exclude the unit currently being served
-            with m.If (mshr_q.valid & \
-                    (mshr_addr_i[i][DCACHE_BYTE_OFFSET:56] == \
-                     mshr_q.addr[DCACHE_BYTE_OFFSET:56])):
-                comb += mshr_addr_matches_o[i].eq(1)
-
-            # same as previous, but checking only the index
-            with m.If (mshr_q.valid & \
-                    (mshr_addr_i[i][DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] == \
-                     mshr_q.addr[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH])):
-                mshr_index_matches_o[i].eq(1)
-
-        # --------------------
-        # Sequential Process
-        # --------------------
-
-        """
-        #pragma translate_off
-        `ifndef VERILATOR
-        # assert that cache only hits on one way
-        assert property (
-          @(posedge clk_i) $onehot0(evict_way_q)) else $warning("Evict-way should be one-hot encoded");
-        `endif
-        #pragma translate_on
-        """
-
-        # ----------------------
-        # Bypass Arbiter
-        # ----------------------
-        # Connection Arbiter <-> AXI
-        req_fsm_bypass_valid = Signal()
-        req_fsm_bypass_addr = Signal(64)
-        req_fsm_bypass_wdata = Signal(64)
-        req_fsm_bypass_we = Signal()
-        req_fsm_bypass_be = Signal(8)
-        req_fsm_bypass_size = Signal(2)
-        gnt_bypass_fsm = Signal()
-        valid_bypass_fsm = Signal()
-        data_bypass_fsm = Signal(64)
-        logic [$clog2(NR_PORTS)-1:0] id_fsm_bypass;
-        logic [3:0]                  id_bypass_fsm;
-        logic [3:0]                  gnt_id_bypass_fsm;
-
-        i_bypass_arbiter = ib = AXIArbiter( NR_PORTS, 64)
-        comb += [
-            # Master Side
-            ib.data_req_i     .eq( miss_req_valid & miss_req_bypass         ),
-            ib.address_i      .eq( miss_req_addr                            ),
-            ib.data_wdata_i   .eq( miss_req_wdata                           ),
-            ib.data_we_i      .eq( miss_req_we                              ),
-            ib.data_be_i      .eq( miss_req_be                              ),
-            ib.data_size_i    .eq( miss_req_size                            ),
-            ib.data_gnt_o     .eq( bypass_gnt_o                             ),
-            ib.data_rvalid_o  .eq( bypass_valid_o                           ),
-            ib.data_rdata_o   .eq( bypass_data_o                            ),
-            # Slave Sid
-            ib.id_i           .eq( id_bypass_fsm[$clog2(NR_PORTS)-1:0]      ),
-            ib.id_o           .eq( id_fsm_bypass                            ),
-            ib.gnt_id_i       .eq( gnt_id_bypass_fsm[$clog2(NR_PORTS)-1:0]  ),
-            ib.address_o      .eq( req_fsm_bypass_addr                      ),
-            ib.data_wdata_o   .eq( req_fsm_bypass_wdata                     ),
-            ib.data_req_o     .eq( req_fsm_bypass_valid                     ),
-            ib.data_we_o      .eq( req_fsm_bypass_we                        ),
-            ib.data_be_o      .eq( req_fsm_bypass_be                        ),
-            ib.data_size_o    .eq( req_fsm_bypass_size                      ),
-            ib.data_gnt_i     .eq( gnt_bypass_fsm                           ),
-            ib.data_rvalid_i  .eq( valid_bypass_fsm                         ),
-            ib.data_rdata_i   .eq( data_bypass_fsm                          ),
-        ]
-
-        axi_adapter #(
-            .DATA_WIDTH            ( 64                 ),
-            .AXI_ID_WIDTH          ( 4                  ),
-            .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
-        ) i_bypass_axi_adapter (
-            .clk_i,
-            .rst_ni,
-            .req_i                 ( req_fsm_bypass_valid   ),
-            .type_i                ( ariane_axi::SINGLE_REQ ),
-            .gnt_o                 ( gnt_bypass_fsm         ),
-            .addr_i                ( req_fsm_bypass_addr    ),
-            .we_i                  ( req_fsm_bypass_we      ),
-            .wdata_i               ( req_fsm_bypass_wdata   ),
-            .be_i                  ( req_fsm_bypass_be      ),
-            .size_i                ( req_fsm_bypass_size    ),
-            .id_i                  ( Cat(id_fsm_bypass, 0, 0) ),
-            .valid_o               ( valid_bypass_fsm       ),
-            .rdata_o               ( data_bypass_fsm        ),
-            .gnt_id_o              ( gnt_id_bypass_fsm      ),
-            .id_o                  ( id_bypass_fsm          ),
-            .critical_word_o       (                        ), # not used for single requests
-            .critical_word_valid_o (                        ), # not used for single requests
-            .axi_req_o             ( axi_bypass_o           ),
-            .axi_resp_i            ( axi_bypass_i           )
-        );
-
-        # ----------------------
-        # Cache Line AXI Refill
-        # ----------------------
-        axi_adapter  #(
-            .DATA_WIDTH            ( DCACHE_LINE_WIDTH  ),
-            .AXI_ID_WIDTH          ( 4                  ),
-            .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
-        ) i_miss_axi_adapter (
-            .clk_i,
-            .rst_ni,
-            .req_i               ( req_fsm_miss_valid ),
-            .type_i              ( req_fsm_miss_req   ),
-            .gnt_o               ( gnt_miss_fsm       ),
-            .addr_i              ( req_fsm_miss_addr  ),
-            .we_i                ( req_fsm_miss_we    ),
-            .wdata_i             ( req_fsm_miss_wdata ),
-            .be_i                ( req_fsm_miss_be    ),
-            .size_i              ( req_fsm_miss_size  ),
-            .id_i                ( Const(0b1100, 4)   ),
-            .gnt_id_o            (                    ), # open
-            .valid_o             ( valid_miss_fsm     ),
-            .rdata_o             ( data_miss_fsm      ),
-            .id_o                (                    ),
-            .critical_word_o,
-            .critical_word_valid_o,
-            .axi_req_o           ( axi_data_o         ),
-            .axi_resp_i          ( axi_data_i         )
-        );
-
-        # -----------------
-        # Replacement LFSR
-        # -----------------
-        lfsr_8bit #(.WIDTH (DCACHE_SET_ASSOC)) i_lfsr (
-            .en_i           ( lfsr_enable ),
-            .refill_way_oh  ( lfsr_oh     ),
-            .refill_way_bin ( lfsr_bin    ),
-            .*
-        );
-
-        # -----------------
-        # AMO ALU
-        # -----------------
-        amo_alu i_amo_alu (
-            .amo_op_i        ( amo_op        ),
-            .amo_operand_a_i ( amo_operand_a ),
-            .amo_operand_b_i ( amo_operand_b ),
-            .amo_result_o    ( amo_result_o  )
-        );
-
-        # -----------------
-        # Struct Split
-        # -----------------
-
-        for i in range(NR_PORTS):
-            miss_req = MissReq()
-            comb += miss_req.eq(miss_req_i[i]);
-            comb += miss_req_valid  [i] .eq(miss_req.valid)
-            comb += miss_req_bypass [i] .eq(miss_req.bypass)
-            comb += miss_req_addr   [i] .eq(miss_req.addr)
-            comb += miss_req_wdata  [i] .eq(miss_req.wdata)
-            comb += miss_req_we     [i] .eq(miss_req.we)
-            comb += miss_req_be     [i] .eq(miss_req.be)
-            comb += miss_req_size   [i] .eq(miss_req.size)
-
-    # --------------
-    # AXI Arbiter
-    # --------------s
-    #
-    # Description: Arbitrates access to AXI refill/bypass
-    #
-class AXIArbiter:
-    def __init__(self, NR_PORTS   = 3, DATA_WIDTH = 64):
-        self.NR_PORTS = NR_PORTS
-        self.DATA_WIDTH = DATA_WIDTH
-        self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
-        rst_ni = ResetSignal() # Asynchronous reset active low
-        # master ports
-        self.data_req_i = Signal(NR_PORTS)
-        self.address_i = Array(Signal(name="address_i", 64) \
-                                    for i in range(NR_PORTS))
-        self.data_wdata_i = Array(Signal(name="data_wdata_i", 64) \
-                                    for i in range(NR_PORTS))
-        self.data_we_i = Signal(NR_PORTS)
-        self.data_be_i = Array(Signal(name="data_wdata_i", DATA_WIDTH/8) \
-                                    for i in range(NR_PORTS))
-        self.data_size_i = Array(Signal(name="data_size_i", 2) \
-                                    for i in range(NR_PORTS))
-        self.data_gnt_o = Signal(NR_PORTS)
-        self.data_rvalid_o = Signal(NR_PORTS)
-        self.data_rdata_o = Array(Signal(name="data_rdata_o", 64) \
-                                    for i in range(NR_PORTS))
-
-        # slave port
-        self.id_i = Signal(pwid)
-        self.id_o = Signal(pwid)
-        self.gnt_id_i = Signal(pwid)
-        self.data_req_o = Signal()
-        self.address_o = Signal(64)
-        self.data_wdata_o = Signal(DATA_WIDTH)
-        self.data_we_o = Signal()
-        self.data_be_o = Signal(DATA_WIDTH/8)
-        self.data_size_o = Signal(2)
-        self.data_gnt_i = Signal()
-        self.data_rvalid_i = Signal()
-        self.data_rdata_i = Signal(DATA_WIDTH)
-
-    def elaborate(self, platform):
-        #enum logic [1:0] { IDLE, REQ, SERVING } state_d, state_q;
-
-        class Packet:
-            def __init__(self, pwid, DATA_WIDTH):
-                self.id = Signal(pwid)
-                self.address = Signal(64)
-                self.data = Signal(64)
-                self.size = Signal(2)
-                self.be = Signal(DATA_WIDTH/8)
-                self.we = Signal()
-
-        request_index = Signal(self.pwid)
-        req_q = Packet(self.pwid, self.DATA_WIDTH)
-        req_d = Packet(self.pwid, self.DATA_WIDTH)
-
-        # request register
-        sync += req_q.eq(req_d)
-
-        # request port
-        comb += self.address_o             .eq(req_q.address)
-        comb += self.data_wdata_o          .eq(req_q.data)
-        comb += self.data_be_o             .eq(req_q.be)
-        comb += self.data_size_o           .eq(req_q.size)
-        comb += self.data_we_o             .eq(req_q.we)
-        comb += self.id_o                  .eq(req_q.id)
-        comb += self.data_gnt_o            .eq(0)
-        # read port
-        comb += self.data_rvalid_o         .eq(0)
-        comb += self.data_rdata_o          .eq(0)
-        comb += self.data_rdata_o[req_q.id].eq(data_rdata_i)
-
-        m.submodules.pp = pp = PriorityEncoder(self.NR_PORTS)
-        comb += pp.i.eq(self.data_req_i) # select one request (priority-based)
-        comb += request_index.eq(pp.o)
-
-        with m.Switch("state") as s:
-
-            with m.Case("IDLE"):
-                # wait for incoming requests (priority encoder data_req_i)
-                with m.If(~pp.n): # one output valid from encoder
-                    comb += self.data_req_o   .eq(self.data_req_i[i])
-                    comb += self.data_gnt_o[i].eq(self.data_req_i[i])
-                    # save the request
-                    comb += req_d.address.eq(self.address_i[i])
-                    comb += req_d.id.eq(request_index)
-                    comb += req_d.data.eq(self.data_wdata_i[i])
-                    comb += req_d.size.eq(self.data_size_i[i])
-                    comb += req_d.be.eq(self.data_be_i[i])
-                    comb += req_d.we.eq(self.data_we_i[i])
-                    m.next = "SERVING"
-
-                comb += self.address_o    .eq(self.address_i[request_index])
-                comb += self.data_wdata_o .eq(self.data_wdata_i[request_index])
-                comb += self.data_be_o    .eq(self.data_be_i[request_index])
-                comb += self.data_size_o  .eq(self.data_size_i[request_index])
-                comb += self.data_we_o    .eq(self.data_we_i[request_index])
-                comb += self.id_o         .eq(request_index)
-
-            with m.Case("SERVING"):
-                comb += self.data_req_o.eq(1)
-                with m.If (self.data_rvalid_i):
-                    comb += self.data_rvalid_o[req_q.id].eq(1)
-                    m.next = "IDLE"
-
-        # ------------
-        # Assertions
-        # ------------
-
-        """
-#pragma translate_off
-`ifndef VERILATOR
-# make sure that we eventually get an rvalid after we received a grant
-assert property (@(posedge clk_i) data_gnt_i |-> ##[1:$] data_rvalid_i )
-    else begin $error("There was a grant without a rvalid"); $stop(); end
-# assert that there is no grant without a request
-assert property (@(negedge clk_i) data_gnt_i |-> data_req_o)
-    else begin $error("There was a grant without a request."); $stop(); end
-# assert that the address does not contain X when request is sent
-assert property ( @(posedge clk_i) (data_req_o) |-> (!$isunknown(address_o)) )
-  else begin $error("address contains X when request is set"); $stop(); end
-
-`endif
-#pragma translate_on
-        """
-
diff --git a/src/unused/TLB/ariane/mmu.py b/src/unused/TLB/ariane/mmu.py
deleted file mode 100644 (file)
index a14862c..0000000
+++ /dev/null
@@ -1,474 +0,0 @@
-"""
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License.  You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: Florian Zaruba, ETH Zurich
-# Date: 19/04/2017
-# Description: Memory Management Unit for Ariane, contains TLB and
-#              address translation unit. SV48 as defined in
-#              Volume II: RISC-V Privileged Architectures V1.10 Page 63
-
-import ariane_pkg::*;
-"""
-
-from nmigen import Const, Signal, Cat, Module, Mux
-from nmigen.cli import verilog, rtlil
-
-from ptw import DCacheReqI, DCacheReqO, TLBUpdate, PTE, PTW
-from tlb import TLB
-from exceptcause import (INSTR_ACCESS_FAULT, INSTR_PAGE_FAULT,
-                         LOAD_PAGE_FAULT, STORE_PAGE_FAULT)
-
-PRIV_LVL_M = Const(0b11, 2)
-PRIV_LVL_S = Const(0b01, 2)
-PRIV_LVL_U = Const(0b00, 2)
-
-
-class RVException:
-    def __init__(self):
-         self.cause = Signal(64) # cause of exception
-         self.tval = Signal(64) # more info of causing exception
-                                # (e.g.: instruction causing it),
-                                #        address of LD/ST fault
-         self.valid = Signal()
-
-    def eq(self, inp):
-        res = []
-        for (o, i) in zip(self.ports(), inp.ports()):
-            res.append(o.eq(i))
-        return res
-
-    def __iter__(self):
-        yield self.cause
-        yield self.tval
-        yield self.valid
-
-    def ports(self):
-        return list(self)
-
-
-class ICacheReqI:
-    def __init__(self):
-        self.fetch_valid = Signal()   # address translation valid
-        self.fetch_paddr = Signal(64) # physical address in
-        self.fetch_exception = RVException() # exception occurred during fetch
-
-    def __iter__(self):
-        yield self.fetch_valid
-        yield self.fetch_paddr
-        yield from self.fetch_exception
-
-    def ports(self):
-        return list(self)
-
-
-class ICacheReqO:
-    def __init__(self):
-        self.fetch_req = Signal()     # address translation request
-        self.fetch_vaddr = Signal(64) # virtual address out
-
-    def __iter__(self):
-        yield self.fetch_req
-        yield self.fetch_vaddr
-
-    def ports(self):
-        return list(self)
-
-
-class MMU:
-    def __init__(self, instr_tlb_entries = 4,
-                       data_tlb_entries  = 4,
-                       asid_width        = 1):
-        self.instr_tlb_entries = instr_tlb_entries
-        self.data_tlb_entries = data_tlb_entries
-        self.asid_width = asid_width
-
-        self.flush_i = Signal()
-        self.enable_translation_i = Signal()
-        self.en_ld_st_translation_i = Signal() # enable VM translation for LD/ST
-        # IF interface
-        self.icache_areq_i = ICacheReqO()
-        self.icache_areq_o = ICacheReqI()
-        # LSU interface
-        # this is a more minimalistic interface because the actual addressing
-        # logic is handled in the LSU as we distinguish load and stores,
-        # what we do here is simple address translation
-        self.misaligned_ex_i = RVException()
-        self.lsu_req_i = Signal()   # request address translation
-        self.lsu_vaddr_i = Signal(64) # virtual address in
-        self.lsu_is_store_i = Signal() # the translation is requested by a store
-        # if we need to walk the page table we can't grant in the same cycle
-
-        # Cycle 0
-        self.lsu_dtlb_hit_o = Signal() # sent in the same cycle as the request
-                                       # if translation hits in the DTLB
-        # Cycle 1
-        self.lsu_valid_o = Signal()  # translation is valid
-        self.lsu_paddr_o = Signal(64) # translated address
-        self.lsu_exception_o = RVException() # addr translate threw exception
-
-        # General control signals
-        self.priv_lvl_i = Signal(2)
-        self.ld_st_priv_lvl_i = Signal(2)
-        self.sum_i = Signal()
-        self.mxr_i = Signal()
-        # input logic flag_mprv_i,
-        self.satp_ppn_i = Signal(44)
-        self.asid_i = Signal(self.asid_width)
-        self.flush_tlb_i = Signal()
-        # Performance counters
-        self.itlb_miss_o = Signal()
-        self.dtlb_miss_o = Signal()
-        # PTW memory interface
-        self.req_port_i = DCacheReqO()
-        self.req_port_o = DCacheReqI()
-
-    def elaborate(self, platform):
-        m = Module()
-
-        iaccess_err = Signal()   # insufficient priv to access instr page
-        daccess_err = Signal()   # insufficient priv to access data page
-        ptw_active = Signal()    # PTW is currently walking a page table
-        walking_instr = Signal() # PTW is walking because of an ITLB miss
-        ptw_error = Signal()     # PTW threw an exception
-
-        update_vaddr = Signal(48)                                # guessed
-        uaddr64 = Cat(update_vaddr, Const(0, 25)) # extend to 64bit with zeros
-        update_ptw_itlb = TLBUpdate(self.asid_width)
-        update_ptw_dtlb = TLBUpdate(self.asid_width)
-
-        itlb_lu_access = Signal()
-        itlb_content = PTE()
-        itlb_is_2M = Signal()
-        itlb_is_1G = Signal()
-        itlb_is_512G = Signal()
-        itlb_lu_hit = Signal()
-
-        dtlb_lu_access = Signal()
-        dtlb_content = PTE()
-        dtlb_is_2M = Signal()
-        dtlb_is_1G = Signal()
-        dtlb_is_512G = Signal()
-        dtlb_lu_hit = Signal()
-
-        # Assignments
-        m.d.comb += [itlb_lu_access.eq(self.icache_areq_i.fetch_req),
-                     dtlb_lu_access.eq(self.lsu_req_i)
-                    ]
-
-        # ITLB
-        m.submodules.i_tlb = i_tlb = TLB(self.instr_tlb_entries,
-                                         self.asid_width)
-        m.d.comb += [i_tlb.flush_i.eq(self.flush_tlb_i),
-                     i_tlb.update_i.eq(update_ptw_itlb),
-                     i_tlb.lu_access_i.eq(itlb_lu_access),
-                     i_tlb.lu_asid_i.eq(self.asid_i),
-                     i_tlb.lu_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
-                     itlb_content.eq(i_tlb.lu_content_o),
-                     itlb_is_2M.eq(i_tlb.lu_is_2M_o),
-                     itlb_is_1G.eq(i_tlb.lu_is_1G_o),
-                     itlb_is_512G.eq(i_tlb.lu_is_512G_o),
-                     itlb_lu_hit.eq(i_tlb.lu_hit_o),
-                    ]
-
-        # DTLB
-        m.submodules.d_tlb = d_tlb = TLB(self.data_tlb_entries,
-                                         self.asid_width)
-        m.d.comb += [d_tlb.flush_i.eq(self.flush_tlb_i),
-                     d_tlb.update_i.eq(update_ptw_dtlb),
-                     d_tlb.lu_access_i.eq(dtlb_lu_access),
-                     d_tlb.lu_asid_i.eq(self.asid_i),
-                     d_tlb.lu_vaddr_i.eq(self.lsu_vaddr_i),
-                     dtlb_content.eq(d_tlb.lu_content_o),
-                     dtlb_is_2M.eq(d_tlb.lu_is_2M_o),
-                     dtlb_is_1G.eq(d_tlb.lu_is_1G_o),
-                     dtlb_is_512G.eq(d_tlb.lu_is_512G_o),
-                     dtlb_lu_hit.eq(d_tlb.lu_hit_o),
-                    ]
-
-        # PTW
-        m.submodules.ptw = ptw = PTW(self.asid_width)
-        m.d.comb += [ptw_active.eq(ptw.ptw_active_o),
-                     walking_instr.eq(ptw.walking_instr_o),
-                     ptw_error.eq(ptw.ptw_error_o),
-                     ptw.enable_translation_i.eq(self.enable_translation_i),
-
-                     update_vaddr.eq(ptw.update_vaddr_o),
-                     update_ptw_itlb.eq(ptw.itlb_update_o),
-                     update_ptw_dtlb.eq(ptw.dtlb_update_o),
-
-                     ptw.itlb_access_i.eq(itlb_lu_access),
-                     ptw.itlb_hit_i.eq(itlb_lu_hit),
-                     ptw.itlb_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
-
-                     ptw.dtlb_access_i.eq(dtlb_lu_access),
-                     ptw.dtlb_hit_i.eq(dtlb_lu_hit),
-                     ptw.dtlb_vaddr_i.eq(self.lsu_vaddr_i),
-
-                     ptw.req_port_i.eq(self.req_port_i),
-                     self.req_port_o.eq(ptw.req_port_o),
-                    ]
-
-        # ila_1 i_ila_1 (
-        #     .clk(clk_i), # input wire clk
-        #     .probe0({req_port_o.address_tag, req_port_o.address_index}),
-        #     .probe1(req_port_o.data_req), # input wire [63:0]  probe1
-        #     .probe2(req_port_i.data_gnt), # input wire [0:0]  probe2
-        #     .probe3(req_port_i.data_rdata), # input wire [0:0]  probe3
-        #     .probe4(req_port_i.data_rvalid), # input wire [0:0]  probe4
-        #     .probe5(ptw_error), # input wire [1:0]  probe5
-        #     .probe6(update_vaddr), # input wire [0:0]  probe6
-        #     .probe7(update_ptw_itlb.valid), # input wire [0:0]  probe7
-        #     .probe8(update_ptw_dtlb.valid), # input wire [0:0]  probe8
-        #     .probe9(dtlb_lu_access), # input wire [0:0]  probe9
-        #     .probe10(lsu_vaddr_i), # input wire [0:0]  probe10
-        #     .probe11(dtlb_lu_hit), # input wire [0:0]  probe11
-        #     .probe12(itlb_lu_access), # input wire [0:0]  probe12
-        #     .probe13(icache_areq_i.fetch_vaddr), # input wire [0:0]  probe13
-        #     .probe14(itlb_lu_hit) # input wire [0:0]  probe13
-        # );
-
-        #-----------------------
-        # Instruction Interface
-        #-----------------------
-        # The instruction interface is a simple request response interface
-
-        # MMU disabled: just pass through
-        m.d.comb += [self.icache_areq_o.fetch_valid.eq(
-                                                self.icache_areq_i.fetch_req),
-                     # play through in case we disabled address translation
-                     self.icache_areq_o.fetch_paddr.eq(
-                                                self.icache_areq_i.fetch_vaddr)
-                    ]
-        # two potential exception sources:
-        # 1. HPTW threw an exception -> signal with a page fault exception
-        # 2. We got an access error because of insufficient permissions ->
-        #    throw an access exception
-        m.d.comb += self.icache_areq_o.fetch_exception.valid.eq(0)
-        # Check whether we are allowed to access this memory region
-        # from a fetch perspective
-
-        # PLATEN TODO: use PermissionValidator instead [we like modules]
-        m.d.comb += iaccess_err.eq(self.icache_areq_i.fetch_req & \
-                                   (((self.priv_lvl_i == PRIV_LVL_U) & \
-                                      ~itlb_content.u) | \
-                                   ((self.priv_lvl_i == PRIV_LVL_S) & \
-                                    itlb_content.u)))
-
-        # MMU enabled: address from TLB, request delayed until hit.
-        # Error when TLB hit and no access right or TLB hit and
-        # translated address not valid (e.g.  AXI decode error),
-        # or when PTW performs walk due to ITLB miss and raises
-        # an error.
-        with m.If (self.enable_translation_i):
-            # we work with SV48, so if VM is enabled, check that
-            # all bits [47:38] are equal
-            with m.If (self.icache_areq_i.fetch_req & \
-                ~(((~self.icache_areq_i.fetch_vaddr[47:64]) == 0) | \
-                 (self.icache_areq_i.fetch_vaddr[47:64]) == 0)):
-                fe = self.icache_areq_o.fetch_exception
-                m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
-                             fe.tval.eq(self.icache_areq_i.fetch_vaddr),
-                             fe.valid.eq(1)
-                            ]
-
-            m.d.comb += self.icache_areq_o.fetch_valid.eq(0)
-
-            # 4K page
-            paddr = Signal.like(self.icache_areq_o.fetch_paddr)
-            paddr4k = Cat(self.icache_areq_i.fetch_vaddr[0:12],
-                          itlb_content.ppn)
-            m.d.comb += paddr.eq(paddr4k)
-            # Mega page
-            with m.If(itlb_is_2M):
-                m.d.comb += paddr[12:21].eq(
-                          self.icache_areq_i.fetch_vaddr[12:21])
-            # Giga page
-            with m.If(itlb_is_1G):
-                m.d.comb += paddr[12:30].eq(
-                          self.icache_areq_i.fetch_vaddr[12:30])
-            m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
-            # Tera page
-            with m.If(itlb_is_512G):
-                m.d.comb += paddr[12:39].eq(
-                          self.icache_areq_i.fetch_vaddr[12:39])
-            m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
-
-            # ---------
-            # ITLB Hit
-            # --------
-            # if we hit the ITLB output the request signal immediately
-            with m.If(itlb_lu_hit):
-                m.d.comb += self.icache_areq_o.fetch_valid.eq(
-                                          self.icache_areq_i.fetch_req)
-                # we got an access error
-                with m.If (iaccess_err):
-                    # throw a page fault
-                    fe = self.icache_areq_o.fetch_exception
-                    m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
-                                 fe.tval.eq(self.icache_areq_i.fetch_vaddr),
-                                 fe.valid.eq(1)
-                                ]
-            # ---------
-            # ITLB Miss
-            # ---------
-            # watch out for exceptions happening during walking the page table
-            with m.Elif(ptw_active & walking_instr):
-                m.d.comb += self.icache_areq_o.fetch_valid.eq(ptw_error)
-                fe = self.icache_areq_o.fetch_exception
-                m.d.comb += [fe.cause.eq(INSTR_PAGE_FAULT),
-                             fe.tval.eq(uaddr64),
-                             fe.valid.eq(1)
-                            ]
-
-        #-----------------------
-        # Data Interface
-        #-----------------------
-
-        lsu_vaddr = Signal(64)
-        dtlb_pte = PTE()
-        misaligned_ex = RVException()
-        lsu_req = Signal()
-        lsu_is_store = Signal()
-        dtlb_hit = Signal()
-        #dtlb_is_2M = Signal()
-        #dtlb_is_1G = Signal()
-        #dtlb_is_512 = Signal()
-
-        # check if we need to do translation or if we are always
-        # ready (e.g.: we are not translating anything)
-        m.d.comb += self.lsu_dtlb_hit_o.eq(Mux(self.en_ld_st_translation_i,
-                                          dtlb_lu_hit, 1))
-
-        # The data interface is simpler and only consists of a
-        # request/response interface
-        m.d.comb += [
-            # save request and DTLB response
-            lsu_vaddr.eq(self.lsu_vaddr_i),
-            lsu_req.eq(self.lsu_req_i),
-            misaligned_ex.eq(self.misaligned_ex_i),
-            dtlb_pte.eq(dtlb_content),
-            dtlb_hit.eq(dtlb_lu_hit),
-            lsu_is_store.eq(self.lsu_is_store_i),
-            #dtlb_is_2M.eq(dtlb_is_2M),
-            #dtlb_is_1G.eq(dtlb_is_1G),
-            ##dtlb_is_512.eq(self.dtlb_is_512G) #????
-        ]
-        m.d.sync += [
-            self.lsu_paddr_o.eq(lsu_vaddr),
-            self.lsu_valid_o.eq(lsu_req),
-            self.lsu_exception_o.eq(misaligned_ex),
-        ]
-
-        sverr = Signal()
-        usrerr = Signal()
-
-        m.d.comb += [
-            # mute misaligned exceptions if there is no request
-            # otherwise they will throw accidental exceptions
-            misaligned_ex.valid.eq(self.misaligned_ex_i.valid & self.lsu_req_i),
-
-            # SUM is not set and we are trying to access a user
-            # page in supervisor mode
-            sverr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_S & ~self.sum_i & \
-                       dtlb_pte.u),
-            # this is not a user page but we are in user mode and
-            # trying to access it
-            usrerr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_U & ~dtlb_pte.u),
-
-            # Check if the User flag is set, then we may only
-            # access it in supervisor mode if SUM is enabled
-            daccess_err.eq(sverr | usrerr),
-            ]
-
-        # translation is enabled and no misaligned exception occurred
-        with m.If(self.en_ld_st_translation_i & ~misaligned_ex.valid):
-            m.d.comb += lsu_req.eq(0)
-            # 4K page
-            paddr = Signal.like(lsu_vaddr)
-            paddr4k = Cat(lsu_vaddr[0:12], itlb_content.ppn)
-            m.d.comb += paddr.eq(paddr4k)
-            # Mega page
-            with m.If(dtlb_is_2M):
-                m.d.comb += paddr[12:21].eq(lsu_vaddr[12:21])
-            # Giga page
-            with m.If(dtlb_is_1G):
-                m.d.comb += paddr[12:30].eq(lsu_vaddr[12:30])
-            m.d.sync += self.lsu_paddr_o.eq(paddr)
-            # TODO platen tera_page
-
-            # ---------
-            # DTLB Hit
-            # --------
-            with m.If(dtlb_hit & lsu_req):
-                m.d.comb += lsu_req.eq(1)
-                # this is a store
-                with m.If (lsu_is_store):
-                    # check if the page is write-able and
-                    # we are not violating privileges
-                    # also check if the dirty flag is set
-                    with m.If(~dtlb_pte.w | daccess_err | ~dtlb_pte.d):
-                        le = self.lsu_exception_o
-                        m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
-                                     le.tval.eq(lsu_vaddr),
-                                     le.valid.eq(1)
-                                    ]
-
-                # this is a load, check for sufficient access
-                # privileges - throw a page fault if necessary
-                with m.Elif(daccess_err):
-                    le = self.lsu_exception_o
-                    m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
-                                 le.tval.eq(lsu_vaddr),
-                                 le.valid.eq(1)
-                                ]
-            # ---------
-            # DTLB Miss
-            # ---------
-            # watch out for exceptions
-            with m.Elif (ptw_active & ~walking_instr):
-                # page table walker threw an exception
-                with m.If (ptw_error):
-                    # an error makes the translation valid
-                    m.d.comb += lsu_req.eq(1)
-                    # the page table walker can only throw page faults
-                    with m.If (lsu_is_store):
-                        le = self.lsu_exception_o
-                        m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
-                                     le.tval.eq(uaddr64),
-                                     le.valid.eq(1)
-                                    ]
-                    with m.Else():
-                        m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
-                                     le.tval.eq(uaddr64),
-                                     le.valid.eq(1)
-                                    ]
-
-        return m
-
-    def ports(self):
-        return [self.flush_i, self.enable_translation_i,
-                self.en_ld_st_translation_i,
-                self.lsu_req_i,
-                self.lsu_vaddr_i, self.lsu_is_store_i, self.lsu_dtlb_hit_o,
-                self.lsu_valid_o, self.lsu_paddr_o,
-                self.priv_lvl_i, self.ld_st_priv_lvl_i, self.sum_i, self.mxr_i,
-                self.satp_ppn_i, self.asid_i, self.flush_tlb_i,
-                self.itlb_miss_o, self.dtlb_miss_o] + \
-                self.icache_areq_i.ports() + self.icache_areq_o.ports() + \
-                self.req_port_i.ports() + self.req_port_o.ports() + \
-                self.misaligned_ex_i.ports() + self.lsu_exception_o.ports()
-
-if __name__ == '__main__':
-    mmu = MMU()
-    vl = rtlil.convert(mmu, ports=mmu.ports())
-    with open("test_mmu.il", "w") as f:
-        f.write(vl)
-
diff --git a/src/unused/TLB/ariane/p_lru.txt b/src/unused/TLB/ariane/p_lru.txt
deleted file mode 100644 (file)
index 4bac768..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-pseudo-LRU
-
-two-way set associative - one bit
-
-   indicates which line of the two has been reference more recently
-
-
-four-way set associative - three bits
-
-   each bit represents one branch point in a binary decision tree; let 1
-   represent that the left side has been referenced more recently than the
-   right side, and 0 vice-versa
-
-              are all 4 lines valid?
-                   /       \
-                 yes        no, use an invalid line
-                  |
-                  |
-                  |
-             bit_0 == 0?            state | replace      ref to | next state
-              /       \             ------+--------      -------+-----------
-             y         n             00x  |  line_0      line_0 |    11_
-            /           \            01x  |  line_1      line_1 |    10_
-     bit_1 == 0?    bit_2 == 0?      1x0  |  line_2      line_2 |    0_1
-       /    \          /    \        1x1  |  line_3      line_3 |    0_0
-      y      n        y      n
-     /        \      /        \        ('x' means       ('_' means unchanged)
-   line_0  line_1  line_2  line_3      don't care)
-
-   (see Figure 3-7, p. 3-18, in Intel Embedded Pentium Processor Family Dev.
-    Manual, 1998, http://www.intel.com/design/intarch/manuals/273204.htm)
-
-
-note that there is a 6-bit encoding for true LRU for four-way set associative
-
-  bit 0: bank[1] more recently used than bank[0]
-  bit 1: bank[2] more recently used than bank[0]
-  bit 2: bank[2] more recently used than bank[1]
-  bit 3: bank[3] more recently used than bank[0]
-  bit 4: bank[3] more recently used than bank[1]
-  bit 5: bank[3] more recently used than bank[2]
-
-  this results in 24 valid bit patterns within the 64 possible bit patterns
-  (4! possible valid traces for bank references)
-
-  e.g., a trace of 0 1 2 3, where 0 is LRU and 3 is MRU, is encoded as 111111
-
-  you can implement a state machine with a 256x6 ROM (6-bit state encoding
-  appended with a 2-bit bank reference input will yield a new 6-bit state),
-  and you can implement an LRU bank indicator with a 64x2 ROM
-
diff --git a/src/unused/TLB/ariane/plru.py b/src/unused/TLB/ariane/plru.py
deleted file mode 100644 (file)
index fb4f5c0..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-# moved to nmutil https://git.libre-soc.org/?p=nmutil.git;a=tree
-from nmutil.plru import PLRU
diff --git a/src/unused/TLB/ariane/ptw.py b/src/unused/TLB/ariane/ptw.py
deleted file mode 100644 (file)
index 4046c71..0000000
+++ /dev/null
@@ -1,556 +0,0 @@
-"""
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License.  You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: David Schaffenrath, TU Graz
-# Author: Florian Zaruba, ETH Zurich
-# Date: 24.4.2017
-# Description: Hardware-PTW
-
-/* verilator lint_off WIDTH */
-import ariane_pkg::*;
-
-see linux kernel source:
-
-* "arch/riscv/include/asm/page.h"
-* "arch/riscv/include/asm/mmu_context.h"
-* "arch/riscv/Kconfig" (CONFIG_PAGE_OFFSET)
-
-"""
-
-from nmigen import Const, Signal, Cat, Module, Elaboratable
-from nmigen.hdl.ast import ArrayProxy
-from nmigen.cli import verilog, rtlil
-from math import log2
-
-
-DCACHE_SET_ASSOC = 8
-CONFIG_L1D_SIZE =  32*1024
-DCACHE_INDEX_WIDTH = int(log2(CONFIG_L1D_SIZE / DCACHE_SET_ASSOC))
-DCACHE_TAG_WIDTH = 56 - DCACHE_INDEX_WIDTH
-
-ASID_WIDTH = 8
-
-
-class DCacheReqI:
-    def __init__(self):
-        self.address_index = Signal(DCACHE_INDEX_WIDTH)
-        self.address_tag = Signal(DCACHE_TAG_WIDTH)
-        self.data_wdata = Signal(64)
-        self.data_req = Signal()
-        self.data_we = Signal()
-        self.data_be = Signal(8)
-        self.data_size = Signal(2)
-        self.kill_req = Signal()
-        self.tag_valid = Signal()
-
-    def eq(self, inp):
-        res = []
-        for (o, i) in zip(self.ports(), inp.ports()):
-            res.append(o.eq(i))
-        return res
-
-    def ports(self):
-        return [self.address_index, self.address_tag,
-                self.data_wdata, self.data_req,
-                self.data_we, self.data_be, self.data_size,
-                self.kill_req, self.tag_valid,
-            ]
-
-class DCacheReqO:
-    def __init__(self):
-        self.data_gnt = Signal()
-        self.data_rvalid = Signal()
-        self.data_rdata = Signal(64) # actually in PTE object format
-
-    def eq(self, inp):
-        res = []
-        for (o, i) in zip(self.ports(), inp.ports()):
-            res.append(o.eq(i))
-        return res
-
-    def ports(self):
-        return [self.data_gnt, self.data_rvalid, self.data_rdata]
-
-
-class PTE: #(RecordObject):
-    def __init__(self):
-        self.v = Signal()
-        self.r = Signal()
-        self.w = Signal()
-        self.x = Signal()
-        self.u = Signal()
-        self.g = Signal()
-        self.a = Signal()
-        self.d = Signal()
-        self.rsw = Signal(2)
-        self.ppn = Signal(44)
-        self.reserved = Signal(10)
-
-    def flatten(self):
-        return Cat(*self.ports())
-
-    def eq(self, x):
-        if isinstance(x, ArrayProxy):
-            res = []
-            for o in self.ports():
-                i = getattr(x, o.name)
-                res.append(i)
-            x = Cat(*res)
-        else:
-            x = x.flatten()
-        return self.flatten().eq(x)
-
-    def __iter__(self):
-        """ order is critical so that flatten creates LSB to MSB
-        """
-        yield self.v
-        yield self.r
-        yield self.w
-        yield self.x
-        yield self.u
-        yield self.g
-        yield self.a
-        yield self.d
-        yield self.rsw
-        yield self.ppn
-        yield self.reserved
-
-    def ports(self):
-        return list(self)
-
-
-class TLBUpdate:
-    def __init__(self, asid_width):
-        self.valid = Signal()      # valid flag
-        self.is_2M = Signal()
-        self.is_1G = Signal()
-        self.is_512G = Signal()
-        self.vpn = Signal(36)
-        self.asid = Signal(asid_width)
-        self.content = PTE()
-
-    def flatten(self):
-        return Cat(*self.ports())
-
-    def eq(self, x):
-        return self.flatten().eq(x.flatten())
-
-    def ports(self):
-        return [self.valid, self.is_2M, self.is_1G, self.vpn, self.asid] + \
-                self.content.ports()
-
-
-# SV48 defines four levels of page tables
-LVL1 = Const(0, 2) # defined to 0 so that ptw_lvl default-resets to LVL1
-LVL2 = Const(1, 2)
-LVL3 = Const(2, 2)
-LVL4 = Const(3, 2)
-
-
-class PTW(Elaboratable):
-    def __init__(self, asid_width=8):
-        self.asid_width = asid_width
-
-        self.flush_i = Signal() # flush everything, we need to do this because
-        # actually everything we do is speculative at this stage
-        # e.g.: there could be a CSR instruction that changes everything
-        self.ptw_active_o = Signal(reset=1)    # active if not IDLE
-        self.walking_instr_o = Signal()        # set when walking for TLB
-        self.ptw_error_o = Signal()            # set when an error occurred
-        self.enable_translation_i = Signal()   # CSRs indicate to enable SV48
-        self.en_ld_st_translation_i = Signal() # enable VM translation for ld/st
-
-        self.lsu_is_store_i = Signal()       # translation triggered by store
-        # PTW memory interface
-        self.req_port_i = DCacheReqO()
-        self.req_port_o = DCacheReqI()
-
-        # to TLBs, update logic
-        self.itlb_update_o = TLBUpdate(asid_width)
-        self.dtlb_update_o = TLBUpdate(asid_width)
-
-        self.update_vaddr_o = Signal(48)
-
-        self.asid_i = Signal(self.asid_width)
-        # from TLBs
-        # did we miss?
-        self.itlb_access_i = Signal()
-        self.itlb_hit_i = Signal()
-        self.itlb_vaddr_i = Signal(64)
-
-        self.dtlb_access_i = Signal()
-        self.dtlb_hit_i = Signal()
-        self.dtlb_vaddr_i = Signal(64)
-        # from CSR file
-        self.satp_ppn_i = Signal(44) # ppn from satp
-        self.mxr_i = Signal()
-        # Performance counters
-        self.itlb_miss_o = Signal()
-        self.dtlb_miss_o = Signal()
-
-    def ports(self):
-        return [self.ptw_active_o, self.walking_instr_o, self.ptw_error_o,
-                ]
-        return [
-                self.enable_translation_i, self.en_ld_st_translation_i,
-                self.lsu_is_store_i, self.req_port_i, self.req_port_o,
-                self.update_vaddr_o,
-                self.asid_i,
-                self.itlb_access_i, self.itlb_hit_i, self.itlb_vaddr_i,
-                self.dtlb_access_i, self.dtlb_hit_i, self.dtlb_vaddr_i,
-                self.satp_ppn_i, self.mxr_i,
-                self.itlb_miss_o, self.dtlb_miss_o
-            ] + self.itlb_update_o.ports() + self.dtlb_update_o.ports()
-
-    def elaborate(self, platform):
-        m = Module()
-
-        # input registers
-        data_rvalid = Signal()
-        data_rdata = Signal(64)
-
-        # NOTE: pte decodes the incoming bit-field (data_rdata). data_rdata
-        # is spec'd in 64-bit binary-format: better to spec as Record?
-        pte = PTE()
-        m.d.comb += pte.flatten().eq(data_rdata)
-
-        # SV48 defines four levels of page tables
-        ptw_lvl = Signal(2) # default=0=LVL1 on reset (see above)
-        ptw_lvl1 = Signal()
-        ptw_lvl2 = Signal()
-        ptw_lvl3 = Signal()
-        ptw_lvl4 = Signal()
-        m.d.comb += [ptw_lvl1.eq(ptw_lvl == LVL1),
-                     ptw_lvl2.eq(ptw_lvl == LVL2),
-                     ptw_lvl3.eq(ptw_lvl == LVL3),
-                     ptw_lvl4.eq(ptw_lvl == LVL4)
-                     ]
-
-        # is this an instruction page table walk?
-        is_instr_ptw = Signal()
-        global_mapping = Signal()
-        # latched tag signal
-        tag_valid = Signal()
-        # register the ASID
-        tlb_update_asid = Signal(self.asid_width)
-        # register VPN we need to walk, SV48 defines a 48 bit virtual addr
-        vaddr = Signal(64)
-        # 4 byte aligned physical pointer
-        ptw_pptr = Signal(56)
-
-        end = DCACHE_INDEX_WIDTH + DCACHE_TAG_WIDTH
-        m.d.sync += [
-            # Assignments
-            self.update_vaddr_o.eq(vaddr),
-
-            self.walking_instr_o.eq(is_instr_ptw),
-            # directly output the correct physical address
-            self.req_port_o.address_index.eq(ptw_pptr[0:DCACHE_INDEX_WIDTH]),
-            self.req_port_o.address_tag.eq(ptw_pptr[DCACHE_INDEX_WIDTH:end]),
-            # we are never going to kill this request
-            self.req_port_o.kill_req.eq(0),              # XXX assign comb?
-            # we are never going to write with the HPTW
-            self.req_port_o.data_wdata.eq(Const(0, 64)), # XXX assign comb?
-            # -----------
-            # TLB Update
-            # -----------
-            self.itlb_update_o.vpn.eq(vaddr[12:48]),
-            self.dtlb_update_o.vpn.eq(vaddr[12:48]),
-            # update the correct page table level
-            self.itlb_update_o.is_2M.eq(ptw_lvl3),
-            self.itlb_update_o.is_1G.eq(ptw_lvl2),
-            self.itlb_update_o.is_512G.eq(ptw_lvl1),
-            self.dtlb_update_o.is_2M.eq(ptw_lvl3),
-            self.dtlb_update_o.is_1G.eq(ptw_lvl2),
-            self.dtlb_update_o.is_512G.eq(ptw_lvl1),
-            
-            # output the correct ASID
-            self.itlb_update_o.asid.eq(tlb_update_asid),
-            self.dtlb_update_o.asid.eq(tlb_update_asid),
-            # set the global mapping bit
-            self.itlb_update_o.content.eq(pte),
-            self.itlb_update_o.content.g.eq(global_mapping),
-            self.dtlb_update_o.content.eq(pte),
-            self.dtlb_update_o.content.g.eq(global_mapping),
-
-            self.req_port_o.tag_valid.eq(tag_valid),
-        ]
-
-        #-------------------
-        # Page table walker   #needs update
-        #-------------------
-        # A virtual address va is translated into a physical address pa as
-        # follows:
-        # 1. Let a be sptbr.ppn × PAGESIZE, and let i = LEVELS-1. (For Sv48,
-        #    PAGESIZE=2^12 and LEVELS=4.)
-        # 2. Let pte be the value of the PTE at address a+va.vpn[i]×PTESIZE.
-        #    (For Sv32, PTESIZE=4.)
-        # 3. If pte.v = 0, or if pte.r = 0 and pte.w = 1, stop and raise an
-        #    access exception.
-        # 4. Otherwise, the PTE is valid. If pte.r = 1 or pte.x = 1, go to
-        #    step 5.  Otherwise, this PTE is a pointer to the next level of
-        #    the page table.
-        #    Let i=i-1. If i < 0, stop and raise an access exception.
-        #    Otherwise, let a = pte.ppn × PAGESIZE and go to step 2.
-        # 5. A leaf PTE has been found. Determine if the requested memory
-        #    access is allowed by the pte.r, pte.w, and pte.x bits. If not,
-        #    stop and raise an access exception. Otherwise, the translation is
-        #    successful.  Set pte.a to 1, and, if the memory access is a
-        #    store, set pte.d to 1.
-        #    The translated physical address is given as follows:
-        #      - pa.pgoff = va.pgoff.
-        #      - If i > 0, then this is a superpage translation and
-        #        pa.ppn[i-1:0] = va.vpn[i-1:0].
-        #      - pa.ppn[LEVELS-1:i] = pte.ppn[LEVELS-1:i].
-        # 6. If i > 0 and pa.ppn[i − 1 : 0] != 0, this is a misaligned
-        #    superpage stop and raise a page-fault exception.
-
-        m.d.sync += tag_valid.eq(0)
-
-        # default assignments
-        m.d.comb += [
-            # PTW memory interface
-            self.req_port_o.data_req.eq(0),
-            self.req_port_o.data_be.eq(Const(0xFF, 8)),
-            self.req_port_o.data_size.eq(Const(0b11, 2)),
-            self.req_port_o.data_we.eq(0),
-            self.ptw_error_o.eq(0),
-            self.itlb_update_o.valid.eq(0),
-            self.dtlb_update_o.valid.eq(0),
-
-            self.itlb_miss_o.eq(0),
-            self.dtlb_miss_o.eq(0),
-        ]
-
-        # ------------
-        # State Machine
-        # ------------
-
-        with m.FSM() as fsm:
-
-            with m.State("IDLE"):
-                self.idle(m, is_instr_ptw, ptw_lvl, global_mapping,
-                          ptw_pptr, vaddr, tlb_update_asid)
-
-            with m.State("WAIT_GRANT"):
-                self.grant(m, tag_valid, data_rvalid)
-
-            with m.State("PTE_LOOKUP"):
-                # we wait for the valid signal
-                with m.If(data_rvalid):
-                    self.lookup(m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4,
-                                data_rvalid, global_mapping,
-                                is_instr_ptw, ptw_pptr)
-
-            # Propagate error to MMU/LSU
-            with m.State("PROPAGATE_ERROR"):
-                m.next = "IDLE"
-                m.d.comb += self.ptw_error_o.eq(1)
-
-            # wait for the rvalid before going back to IDLE
-            with m.State("WAIT_RVALID"):
-                with m.If(data_rvalid):
-                    m.next = "IDLE"
-
-        m.d.sync += [data_rdata.eq(self.req_port_i.data_rdata),
-                     data_rvalid.eq(self.req_port_i.data_rvalid)
-                    ]
-
-        return m
-
-    def set_grant_state(self, m):
-        # should we have flushed before we got an rvalid,
-        # wait for it until going back to IDLE
-        with m.If(self.flush_i):
-            with m.If (self.req_port_i.data_gnt):
-                m.next = "WAIT_RVALID"
-            with m.Else():
-                m.next = "IDLE"
-        with m.Else():
-            m.next = "WAIT_GRANT"
-
-    def idle(self, m, is_instr_ptw, ptw_lvl, global_mapping,
-                          ptw_pptr, vaddr, tlb_update_asid):
-        # by default we start with the top-most page table
-        m.d.sync += [is_instr_ptw.eq(0),
-                     ptw_lvl.eq(LVL1),
-                     global_mapping.eq(0),
-                     self.ptw_active_o.eq(0), # deactive (IDLE)
-                    ]
-        # work out itlb/dtlb miss
-        m.d.comb += self.itlb_miss_o.eq(self.enable_translation_i & \
-                                 self.itlb_access_i & \
-                                 ~self.itlb_hit_i & \
-                                 ~self.dtlb_access_i)
-        m.d.comb += self.dtlb_miss_o.eq(self.en_ld_st_translation_i & \
-                                        self.dtlb_access_i & \
-                                        ~self.dtlb_hit_i)
-        # we got an ITLB miss?
-        with m.If(self.itlb_miss_o):
-            pptr = Cat(Const(0, 3), self.itlb_vaddr_i[30:48],
-                       self.satp_ppn_i)
-            m.d.sync += [ptw_pptr.eq(pptr),
-                         is_instr_ptw.eq(1),
-                         vaddr.eq(self.itlb_vaddr_i),
-                         tlb_update_asid.eq(self.asid_i),
-                        ]
-            self.set_grant_state(m)
-
-        # we got a DTLB miss?
-        with m.Elif(self.dtlb_miss_o):
-            pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:48],
-                       self.satp_ppn_i)
-            m.d.sync += [ptw_pptr.eq(pptr),
-                         vaddr.eq(self.dtlb_vaddr_i),
-                         tlb_update_asid.eq(self.asid_i),
-                        ]
-            self.set_grant_state(m)
-
-    def grant(self, m, tag_valid, data_rvalid):
-        # we've got a data WAIT_GRANT so tell the
-        # cache that the tag is valid
-
-        # send a request out
-        m.d.comb += self.req_port_o.data_req.eq(1)
-        # wait for the WAIT_GRANT
-        with m.If(self.req_port_i.data_gnt):
-            # send the tag valid signal one cycle later
-            m.d.sync += tag_valid.eq(1)
-            # should we have flushed before we got an rvalid,
-            # wait for it until going back to IDLE
-            with m.If(self.flush_i):
-                with m.If (~data_rvalid):
-                    m.next = "WAIT_RVALID"
-                with m.Else():
-                    m.next = "IDLE"
-            with m.Else():
-                m.next = "PTE_LOOKUP"
-
-    def lookup(self, m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4, 
-                            data_rvalid, global_mapping,
-                            is_instr_ptw, ptw_pptr):
-        # temporaries
-        pte_rx = Signal(reset_less=True)
-        pte_exe = Signal(reset_less=True)
-        pte_inv = Signal(reset_less=True)
-        pte_a = Signal(reset_less=True)
-        st_wd = Signal(reset_less=True)
-        m.d.comb += [pte_rx.eq(pte.r | pte.x),
-                    pte_exe.eq(~pte.x | ~pte.a),
-                    pte_inv.eq(~pte.v | (~pte.r & pte.w)),
-                    pte_a.eq(pte.a & (pte.r | (pte.x & self.mxr_i))),
-                    st_wd.eq(self.lsu_is_store_i & (~pte.w | ~pte.d))]
-
-        l1err = Signal(reset_less=True)
-        l2err = Signal(reset_less=True)
-        l3err = Signal(reset_less=True)
-        m.d.comb += [l3err.eq((ptw_lvl3) & pte.ppn[0:9] != Const(0,0)),
-                     l2err.eq((ptw_lvl2) & pte.ppn[0:18] != Const(0, 18)),
-                     l1err.eq((ptw_lvl1) & pte.ppn[0:27] != Const(0, 27))]
-
-        # check if the global mapping bit is set
-        with m.If (pte.g):
-            m.d.sync += global_mapping.eq(1)
-
-        m.next = "IDLE"
-
-        # -------------
-        # Invalid PTE
-        # -------------
-        # If pte.v = 0, or if pte.r = 0 and pte.w = 1,
-        # stop and raise a page-fault exception.
-        with m.If (pte_inv):
-            m.next = "PROPAGATE_ERROR"
-
-        # -----------
-        # Valid PTE
-        # -----------
-
-        # it is a valid PTE
-        # if pte.r = 1 or pte.x = 1 it is a valid PTE
-        with m.Elif (pte_rx):
-            # Valid translation found (either 1G, 2M or 4K)
-            with m.If(is_instr_ptw):
-                # ------------
-                # Update ITLB
-                # ------------
-                # If page not executable, we can directly raise error.
-                # This doesn't put a useless entry into the TLB.
-                # The same idea applies to the access flag since we let
-                # the access flag be managed by SW.
-                with m.If (pte_exe):
-                    m.next = "IDLE"
-                with m.Else():
-                    m.d.comb += self.itlb_update_o.valid.eq(1)
-
-            with m.Else():
-                # ------------
-                # Update DTLB
-                # ------------
-                # Check if the access flag has been set, otherwise
-                # throw page-fault and let software handle those bits.
-                # If page not readable (there are no write-only pages)
-                # directly raise an error. This doesn't put a useless
-                # entry into the TLB.
-                with m.If(pte_a):
-                    m.d.comb += self.dtlb_update_o.valid.eq(1)
-                with m.Else():
-                    m.next = "PROPAGATE_ERROR"
-                # Request is a store: perform additional checks
-                # If the request was a store and the page not
-                # write-able, raise an error
-                # the same applies if the dirty flag is not set
-                with m.If (st_wd):
-                    m.d.comb += self.dtlb_update_o.valid.eq(0)
-                    m.next = "PROPAGATE_ERROR"
-
-            # check if the ppn is correctly aligned: Case (6)
-            with m.If(l1err | l2err | l3err):
-                m.next = "PROPAGATE_ERROR"
-                m.d.comb += [self.dtlb_update_o.valid.eq(0),
-                             self.itlb_update_o.valid.eq(0)]
-
-        # this is a pointer to the next TLB level
-        with m.Else():
-            # pointer to next level of page table
-            with m.If (ptw_lvl1):
-                # we are in the second level now
-                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:39], pte.ppn)
-                m.d.sync += [ptw_pptr.eq(pptr),
-                             ptw_lvl.eq(LVL2)
-                            ]
-            with m.If(ptw_lvl2):
-                # here we received a pointer to the third level
-                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[21:30], pte.ppn)
-                m.d.sync += [ptw_pptr.eq(pptr),
-                             ptw_lvl.eq(LVL3)
-                            ]
-            with m.If(ptw_lvl3): #guess: shift page levels by one
-                # here we received a pointer to the fourth level
-                # the last one is near the page offset
-                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[12:21], pte.ppn)
-                m.d.sync += [ptw_pptr.eq(pptr),
-                             ptw_lvl.eq(LVL4)
-                            ]
-            self.set_grant_state(m)
-
-            with m.If (ptw_lvl4):
-                # Should already be the last level
-                # page table => Error
-                m.d.sync += ptw_lvl.eq(LVL4)
-                m.next = "PROPAGATE_ERROR"
-
-
-if __name__ == '__main__':
-    ptw = PTW()
-    vl = rtlil.convert(ptw, ports=ptw.ports())
-    with open("test_ptw.il", "w") as f:
-        f.write(vl)
diff --git a/src/unused/TLB/ariane/test/__init__.py b/src/unused/TLB/ariane/test/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/src/unused/TLB/ariane/test/test_plru.py b/src/unused/TLB/ariane/test/test_plru.py
deleted file mode 100644 (file)
index 9222d79..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-import sys
-from soc.TLB.ariane.plru import PLRU
-from nmigen.compat.sim import run_simulation
-
-
-def tbench(dut):
-    yield
-
-
-if __name__ == "__main__":
-    dut = PLRU(4)
-    run_simulation(dut, tbench(dut), vcd_name="test_plru.vcd")
-    print("PLRU Unit Test Success")
diff --git a/src/unused/TLB/ariane/test/test_ptw.py b/src/unused/TLB/ariane/test/test_ptw.py
deleted file mode 100644 (file)
index 3969756..0000000
+++ /dev/null
@@ -1,127 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from soc.TLB.ariane.ptw import PTW, PTE
-
-# unit was changed, test needs to be changed
-
-
-def tbench(dut):
-
-    addr = 0x8000000
-
-    #pte = PTE()
-    # yield pte.v.eq(1)
-    # yield pte.r.eq(1)
-
-    yield dut.req_port_i.data_gnt.eq(1)
-    yield dut.req_port_i.data_rvalid.eq(1)
-    yield dut.req_port_i.data_rdata.eq(0x43)  # pte.flatten())
-
-    # data lookup
-    yield dut.en_ld_st_translation_i.eq(1)
-    yield dut.asid_i.eq(1)
-
-    yield dut.dtlb_access_i.eq(1)
-    yield dut.dtlb_hit_i.eq(0)
-    yield dut.dtlb_vaddr_i.eq(0x400000000)
-
-    yield
-    yield
-    yield
-
-    yield dut.dtlb_access_i.eq(1)
-    yield dut.dtlb_hit_i.eq(0)
-    yield dut.dtlb_vaddr_i.eq(0x200000)
-
-    yield
-    yield
-    yield
-
-    yield dut.req_port_i.data_gnt.eq(0)
-    yield dut.dtlb_access_i.eq(1)
-    yield dut.dtlb_hit_i.eq(0)
-    yield dut.dtlb_vaddr_i.eq(0x400000011)
-
-    yield
-    yield dut.req_port_i.data_gnt.eq(1)
-    yield
-    yield
-
-    # data lookup, PTW levels 1-2-3
-    addr = 0x4000000
-    yield dut.dtlb_vaddr_i.eq(addr)
-    yield dut.mxr_i.eq(0x1)
-    yield dut.req_port_i.data_gnt.eq(1)
-    yield dut.req_port_i.data_rvalid.eq(1)
-    # pte.flatten())
-    yield dut.req_port_i.data_rdata.eq(0x41 | (addr >> 12) << 10)
-
-    yield dut.en_ld_st_translation_i.eq(1)
-    yield dut.asid_i.eq(1)
-
-    yield dut.dtlb_access_i.eq(1)
-    yield dut.dtlb_hit_i.eq(0)
-    yield dut.dtlb_vaddr_i.eq(addr)
-
-    yield
-    yield
-    yield
-    yield
-    yield
-    yield
-    yield
-    yield
-
-    yield dut.req_port_i.data_gnt.eq(0)
-    yield dut.dtlb_access_i.eq(1)
-    yield dut.dtlb_hit_i.eq(0)
-    yield dut.dtlb_vaddr_i.eq(0x400000011)
-
-    yield
-    yield dut.req_port_i.data_gnt.eq(1)
-    yield
-    yield
-    yield
-    yield
-
-    # instruction lookup
-    yield dut.en_ld_st_translation_i.eq(0)
-    yield dut.enable_translation_i.eq(1)
-    yield dut.asid_i.eq(1)
-
-    yield dut.itlb_access_i.eq(1)
-    yield dut.itlb_hit_i.eq(0)
-    yield dut.itlb_vaddr_i.eq(0x800000)
-
-    yield
-    yield
-    yield
-
-    yield dut.itlb_access_i.eq(1)
-    yield dut.itlb_hit_i.eq(0)
-    yield dut.itlb_vaddr_i.eq(0x200000)
-
-    yield
-    yield
-    yield
-
-    yield dut.req_port_i.data_gnt.eq(0)
-    yield dut.itlb_access_i.eq(1)
-    yield dut.itlb_hit_i.eq(0)
-    yield dut.itlb_vaddr_i.eq(0x800011)
-
-    yield
-    yield dut.req_port_i.data_gnt.eq(1)
-    yield
-    yield
-
-    yield
-
-
-def test_ptw():
-    dut = PTW()
-    run_simulation(dut, tbench(dut), vcd_name="test_ptw.vcd")
-    print("PTW Unit Test Success")
-
-
-if __name__ == "__main__":
-    test_ptw()
diff --git a/src/unused/TLB/ariane/test/test_tlb.py b/src/unused/TLB/ariane/test/test_tlb.py
deleted file mode 100644 (file)
index e1b17b8..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.ariane.tlb import TLB
-
-
-def set_vaddr(addr):
-    yield dut.lu_vaddr_i.eq(addr)
-    yield dut.update_i.vpn.eq(addr >> 12)
-
-
-def tbench(dut):
-    yield dut.lu_access_i.eq(1)
-    yield dut.lu_asid_i.eq(1)
-    yield dut.update_i.valid.eq(1)
-    yield dut.update_i.is_1G.eq(0)
-    yield dut.update_i.is_2M.eq(0)
-    yield dut.update_i.asid.eq(1)
-    yield dut.update_i.content.ppn.eq(0)
-    yield dut.update_i.content.rsw.eq(0)
-    yield dut.update_i.content.r.eq(1)
-
-    yield
-
-    addr = 0x80000
-    yield from set_vaddr(addr)
-    yield
-
-    addr = 0x90001
-    yield from set_vaddr(addr)
-    yield
-
-    addr = 0x28000000
-    yield from set_vaddr(addr)
-    yield
-
-    addr = 0x28000001
-    yield from set_vaddr(addr)
-
-    addr = 0x28000001
-    yield from set_vaddr(addr)
-    yield
-
-    addr = 0x1000040000
-    yield from set_vaddr(addr)
-    yield
-
-    addr = 0x1000040001
-    yield from set_vaddr(addr)
-    yield
-
-    yield dut.update_i.is_1G.eq(1)
-    addr = 0x2040000
-    yield from set_vaddr(addr)
-    yield
-
-    yield dut.update_i.is_1G.eq(1)
-    addr = 0x2040001
-    yield from set_vaddr(addr)
-    yield
-
-    yield
-
-
-if __name__ == "__main__":
-    dut = TLB()
-    run_simulation(dut, tbench(dut), vcd_name="test_tlb.vcd")
-    print("TLB Unit Test Success")
diff --git a/src/unused/TLB/ariane/test/test_tlb_content.py b/src/unused/TLB/ariane/test/test_tlb_content.py
deleted file mode 100644 (file)
index 1bc60d8..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.ariane.tlb_content import TLBContent
-from soc.TestUtil.test_helper import assert_op, assert_eq
-
-
-def update(dut, a, t, g, m):
-    yield dut.replace_en_i.eq(1)
-    yield dut.update_i.valid.eq(1)
-    yield dut.update_i.is_512G.eq(t)
-    yield dut.update_i.is_1G.eq(g)
-    yield dut.update_i.is_2M.eq(m)
-    yield dut.update_i.vpn.eq(a)
-    yield
-    yield
-
-
-def check_hit(dut, hit, pagesize):
-    hit_d = yield dut.lu_hit_o
-    assert_eq("hit", hit_d, hit)
-
-    if(hit):
-        if(pagesize == "t"):
-            hitp = yield dut.lu_is_512G_o
-            assert_eq("lu_is_512G_o", hitp, 1)
-        elif(pagesize == "g"):
-            hitp = yield dut.lu_is_1G_o
-            assert_eq("lu_is_1G_o", hitp, 1)
-        elif(pagesize == "m"):
-            hitp = yield dut.lu_is_2M_o
-            assert_eq("lu_is_2M_o", hitp, 1)
-
-
-def addr(a, b, c, d):
-    return a | b << 9 | c << 18 | d << 27
-
-
-def tbench(dut):
-    yield dut.vpn0.eq(0x0A)
-    yield dut.vpn1.eq(0x0B)
-    yield dut.vpn2.eq(0x0C)
-    yield dut.vpn3.eq(0x0D)
-    yield from update(dut, addr(0xFF, 0xFF, 0xFF, 0x0D), 1, 0, 0)
-    yield from check_hit(dut, 1, "t")
-
-    yield from update(dut, addr(0xFF, 0xFF, 0x0C, 0x0D), 0, 1, 0)
-    yield from check_hit(dut, 1, "g")
-
-    yield from update(dut, addr(0xFF, 0x0B, 0x0C, 0x0D), 0, 0, 1)
-    yield from check_hit(dut, 1, "m")
-
-    yield from update(dut, addr(0x0A, 0x0B, 0x0C, 0x0D), 0, 0, 0)
-    yield from check_hit(dut, 1, "")
-
-    yield from update(dut, addr(0xAA, 0xBB, 0xCC, 0xDD), 0, 0, 0)
-    yield from check_hit(dut, 0, "miss")
-
-
-if __name__ == "__main__":
-    dut = TLBContent(4, 4)
-    #
-    run_simulation(dut, tbench(dut), vcd_name="test_tlb_content.vcd")
-    print("TLBContent Unit Test Success")
diff --git a/src/unused/TLB/ariane/tlb.py b/src/unused/TLB/ariane/tlb.py
deleted file mode 100644 (file)
index 72b67a2..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-"""
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License.  You may obtain a copy of the License at
-# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Author: David Schaffenrath, TU Graz
-# Author: Florian Zaruba, ETH Zurich
-# Date: 21.4.2017
-# Description: Translation Lookaside Buffer, SV48
-#              fully set-associative
-
-Implementation in c++:
-https://raw.githubusercontent.com/Tony-Hu/TreePLRU/master/TreePLRU.cpp
-
-Text description:
-https://people.cs.clemson.edu/~mark/464/p_lru.txt
-
-Online simulator:
-http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/vm.html
-"""
-from math import log2
-from nmigen import Signal, Module, Cat, Const, Array, Elaboratable
-from nmigen.cli import verilog, rtlil
-from nmigen.lib.coding import Encoder
-
-from soc.TLB.ariane.ptw import TLBUpdate, PTE, ASID_WIDTH
-from soc.TLB.ariane.plru import PLRU
-from soc.TLB.ariane.tlb_content import TLBContent
-
-TLB_ENTRIES = 8
-
-
-class TLB(Elaboratable):
-    def __init__(self, tlb_entries=8, asid_width=8):
-        self.tlb_entries = tlb_entries
-        self.asid_width = asid_width
-
-        self.flush_i = Signal()  # Flush signal
-        # Lookup signals
-        self.lu_access_i = Signal()
-        self.lu_asid_i = Signal(self.asid_width)
-        self.lu_vaddr_i = Signal(64)
-        self.lu_content_o = PTE()
-        self.lu_is_2M_o = Signal()
-        self.lu_is_1G_o = Signal()
-        self.lu_is_512G_o = Signal()
-        self.lu_hit_o = Signal()
-        # Update TLB
-        self.pte_width = len(self.lu_content_o.flatten())
-        self.update_i = TLBUpdate(asid_width)
-
-    def elaborate(self, platform):
-        m = Module()
-
-        vpn3 = Signal(9)  # FIXME unused signal
-        vpn2 = Signal(9)
-        vpn1 = Signal(9)
-        vpn0 = Signal(9)
-
-        # -------------
-        # Translation
-        # -------------
-
-        # SV48 defines four levels of page tables
-        m.d.comb += [vpn0.eq(self.lu_vaddr_i[12:21]),
-                     vpn1.eq(self.lu_vaddr_i[21:30]),
-                     vpn2.eq(self.lu_vaddr_i[30:39]),
-                     vpn3.eq(self.lu_vaddr_i[39:48]),  # FIXME
-                     ]
-
-        tc = []
-        for i in range(self.tlb_entries):
-            tlc = TLBContent(self.pte_width, self.asid_width)
-            setattr(m.submodules, "tc%d" % i, tlc)
-            tc.append(tlc)
-            # connect inputs
-            tlc.update_i = self.update_i     # saves a lot of graphviz links
-            m.d.comb += [tlc.vpn0.eq(vpn0),
-                         tlc.vpn1.eq(vpn1),
-                         tlc.vpn2.eq(vpn2),
-                         # TODO 4th
-                         tlc.flush_i.eq(self.flush_i),
-                         # tlc.update_i.eq(self.update_i),
-                         tlc.lu_asid_i.eq(self.lu_asid_i)]
-        tc = Array(tc)
-
-        # --------------
-        # Select hit
-        # --------------
-
-        # use Encoder to select hit index
-        # XXX TODO: assert that there's only one valid entry (one lu_hit)
-        hitsel = Encoder(self.tlb_entries)
-        m.submodules.hitsel = hitsel
-
-        hits = []
-        for i in range(self.tlb_entries):
-            hits.append(tc[i].lu_hit_o)
-        m.d.comb += hitsel.i.eq(Cat(*hits))  # (goes into plru as well)
-        idx = hitsel.o
-
-        active = Signal(reset_less=True)
-        m.d.comb += active.eq(~hitsel.n)
-        with m.If(active):
-            # active hit, send selected as output
-            m.d.comb += [self.lu_is_512G_o.eq(tc[idx].lu_is_512G_o),
-                         self.lu_is_1G_o.eq(tc[idx].lu_is_1G_o),
-                         self.lu_is_2M_o.eq(tc[idx].lu_is_2M_o),
-                         self.lu_hit_o.eq(1),
-                         self.lu_content_o.flatten().eq(tc[idx].lu_content_o),
-                         ]
-
-        # --------------
-        # PLRU.
-        # --------------
-
-        p = PLRU(self.tlb_entries)
-        plru_tree = Signal(p.TLBSZ)
-        m.submodules.plru = p
-
-        # connect PLRU inputs/outputs
-        # XXX TODO: assert that there's only one valid entry (one replace_en)
-        en = []
-        for i in range(self.tlb_entries):
-            en.append(tc[i].replace_en_i)
-        m.d.comb += [Cat(*en).eq(p.replace_en_o),  # output from PLRU into tags
-                     p.lu_hit.eq(hitsel.i),
-                     p.lu_access_i.eq(self.lu_access_i),
-                     p.plru_tree.eq(plru_tree)]
-        m.d.sync += plru_tree.eq(p.plru_tree_o)
-
-        # --------------
-        # Sanity checks
-        # --------------
-
-        assert (self.tlb_entries % 2 == 0) and (self.tlb_entries > 1), \
-            "TLB size must be a multiple of 2 and greater than 1"
-        assert (self.asid_width >= 1), \
-            "ASID width must be at least 1"
-
-        return m
-
-        """
-        # Just for checking
-        function int countSetBits(logic[self.tlb_entries-1:0] vector);
-          automatic int count = 0;
-          foreach (vector[idx]) begin
-            count += vector[idx];
-          end
-          return count;
-        endfunction
-
-        assert property (@(posedge clk_i)(countSetBits(lu_hit) <= 1))
-          else $error("More then one hit in TLB!"); $stop(); end
-        assert property (@(posedge clk_i)(countSetBits(replace_en) <= 1))
-          else $error("More then one TLB entry selected for next replace!");
-        """
-
-    def ports(self):
-        return [self.flush_i, self.lu_access_i,
-                self.lu_asid_i, self.lu_vaddr_i,
-                self.lu_is_2M_o, self.lu_1G_o, self.lu_is_512G_o, self.lu_hit_o
-                ] + self.lu_content_o.ports() + self.update_i.ports()
-
-
-if __name__ == '__main__':
-    tlb = TLB()
-    vl = rtlil.convert(tlb, ports=tlb.ports())
-    with open("test_tlb.il", "w") as f:
-        f.write(vl)
diff --git a/src/unused/TLB/ariane/tlb_content.py b/src/unused/TLB/ariane/tlb_content.py
deleted file mode 100644 (file)
index bfd17c1..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-from nmigen import Signal, Module, Cat, Const, Elaboratable
-
-from soc.TLB.ariane.ptw import TLBUpdate, PTE
-
-
-class TLBEntry:
-    def __init__(self, asid_width):
-        self.asid = Signal(asid_width, name="ent_asid")
-        # SV48 defines four levels of page tables
-        self.vpn0 = Signal(9, name="ent_vpn0")
-        self.vpn1 = Signal(9, name="ent_vpn1")
-        self.vpn2 = Signal(9, name="ent_vpn2")
-        self.vpn3 = Signal(9, name="ent_vpn3")
-        self.is_2M = Signal(name="ent_is_2M")
-        self.is_1G = Signal(name="ent_is_1G")
-        self.is_512G = Signal(name="ent_is_512G")
-        self.valid = Signal(name="ent_valid")
-
-    def flatten(self):
-        return Cat(*self.ports())
-
-    def eq(self, x):
-        return self.flatten().eq(x.flatten())
-
-    def ports(self):
-        return [self.asid, self.vpn0, self.vpn1, self.vpn2,
-                self.is_2M, self.is_1G, self.valid]
-
-
-class TLBContent(Elaboratable):
-    def __init__(self, pte_width, asid_width):
-        self.asid_width = asid_width
-        self.pte_width = pte_width
-        self.flush_i = Signal()  # Flush signal
-        # Update TLB
-        self.update_i = TLBUpdate(asid_width)
-        self.vpn3 = Signal(9)
-        self.vpn2 = Signal(9)
-        self.vpn1 = Signal(9)
-        self.vpn0 = Signal(9)
-        self.replace_en_i = Signal()  # replace the following entry,
-        # set by replacement strategy
-        # Lookup signals
-        self.lu_asid_i = Signal(asid_width)
-        self.lu_content_o = Signal(pte_width)
-        self.lu_is_512G_o = Signal()
-        self.lu_is_2M_o = Signal()
-        self.lu_is_1G_o = Signal()
-        self.lu_hit_o = Signal()
-
-    def elaborate(self, platform):
-        m = Module()
-
-        tags = TLBEntry(self.asid_width)
-
-        content = Signal(self.pte_width)
-
-        m.d.comb += [self.lu_hit_o.eq(0),
-                     self.lu_is_512G_o.eq(0),
-                     self.lu_is_2M_o.eq(0),
-                     self.lu_is_1G_o.eq(0)]
-
-        # temporaries for lookup
-        asid_ok = Signal(reset_less=True)
-        # tags_ok = Signal(reset_less=True)
-
-        vpn3_ok = Signal(reset_less=True)
-        vpn2_ok = Signal(reset_less=True)
-        vpn1_ok = Signal(reset_less=True)
-        vpn0_ok = Signal(reset_less=True)
-
-        #tags_2M = Signal(reset_less=True)
-        vpn0_or_2M = Signal(reset_less=True)
-
-        m.d.comb += [
-            # compare asid and vpn*
-            asid_ok.eq(tags.asid == self.lu_asid_i),
-            vpn3_ok.eq(tags.vpn3 == self.vpn3),
-            vpn2_ok.eq(tags.vpn2 == self.vpn2),
-            vpn1_ok.eq(tags.vpn1 == self.vpn1),
-            vpn0_ok.eq(tags.vpn0 == self.vpn0),
-            vpn0_or_2M.eq(tags.is_2M | vpn0_ok)
-        ]
-
-        with m.If(asid_ok & tags.valid):
-            # first level, only vpn3 needs to match
-            with m.If(tags.is_512G & vpn3_ok):
-                m.d.comb += [self.lu_content_o.eq(content),
-                             self.lu_is_512G_o.eq(1),
-                             self.lu_hit_o.eq(1),
-                             ]
-            # second level , second level vpn2 and vpn3 need to match
-            with m.Elif(tags.is_1G & vpn2_ok & vpn3_ok):
-                m.d.comb += [self.lu_content_o.eq(content),
-                             self.lu_is_1G_o.eq(1),
-                             self.lu_hit_o.eq(1),
-                             ]
-            # not a giga page hit nor a tera page hit so check further
-            with m.Elif(vpn1_ok):
-                # this could be a 2 mega page hit or a 4 kB hit
-                # output accordingly
-                with m.If(vpn0_or_2M):
-                    m.d.comb += [self.lu_content_o.eq(content),
-                                 self.lu_is_2M_o.eq(tags.is_2M),
-                                 self.lu_hit_o.eq(1),
-                                 ]
-        # ------------------
-        # Update or Flush
-        # ------------------
-
-        # temporaries
-        replace_valid = Signal(reset_less=True)
-        m.d.comb += replace_valid.eq(self.update_i.valid & self.replace_en_i)
-
-        # flush
-        with m.If(self.flush_i):
-            # invalidate (flush) conditions: all if zero or just this ASID
-            with m.If(self.lu_asid_i == Const(0, self.asid_width) |
-                      (self.lu_asid_i == tags.asid)):
-                m.d.sync += tags.valid.eq(0)
-
-        # normal replacement
-        with m.Elif(replace_valid):
-            m.d.sync += [  # update tag array
-                tags.asid.eq(self.update_i.asid),
-                tags.vpn3.eq(self.update_i.vpn[27:36]),
-                tags.vpn2.eq(self.update_i.vpn[18:27]),
-                tags.vpn1.eq(self.update_i.vpn[9:18]),
-                tags.vpn0.eq(self.update_i.vpn[0:9]),
-                tags.is_512G.eq(self.update_i.is_512G),
-                tags.is_1G.eq(self.update_i.is_1G),
-                tags.is_2M.eq(self.update_i.is_2M),
-                tags.valid.eq(1),
-                # and content as well
-                content.eq(self.update_i.content.flatten())
-            ]
-        return m
-
-    def ports(self):
-        return [self.flush_i,
-                self.lu_asid_i,
-                self.lu_is_2M_o, self.lu_is_1G_o, self.lu_is_512G_o, self.lu_hit_o,
-                ] + self.update_i.content.ports() + self.update_i.ports()
diff --git a/src/unused/TLB/test/__init__.py b/src/unused/TLB/test/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/src/unused/TLB/test/test_LFSR2.py b/src/unused/TLB/test/test_LFSR2.py
deleted file mode 100644 (file)
index 33208f8..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-# SPDX-License-Identifier: LGPL-2.1-or-later
-# See Notices.txt for copyright information
-from soc.TLB.LFSR import LFSR, LFSRPolynomial, LFSR_POLY_3
-
-from nmigen.back.pysim import Simulator, Delay, Tick
-import unittest
-
-
-class TestLFSR(unittest.TestCase):
-    def test_poly(self):
-        v = LFSRPolynomial()
-        self.assertEqual(repr(v), "LFSRPolynomial([0])")
-        self.assertEqual(str(v), "1")
-        v = LFSRPolynomial([1])
-        self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
-        self.assertEqual(str(v), "x + 1")
-        v = LFSRPolynomial([0, 1])
-        self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
-        self.assertEqual(str(v), "x + 1")
-        v = LFSRPolynomial([1, 2])
-        self.assertEqual(repr(v), "LFSRPolynomial([2, 1, 0])")
-        self.assertEqual(str(v), "x^2 + x + 1")
-        v = LFSRPolynomial([2])
-        self.assertEqual(repr(v), "LFSRPolynomial([2, 0])")
-        self.assertEqual(str(v), "x^2 + 1")
-        self.assertEqual(str(LFSR_POLY_3), "x^3 + x^2 + 1")
-
-    def test_lfsr_3(self):
-        module = LFSR(LFSR_POLY_3)
-        traces = [module.state, module.enable]
-        with Simulator(module,
-                       vcd_file=open("Waveforms/test_LFSR2.vcd", "w"),
-                       gtkw_file=open("Waveforms/test_LFSR2.gtkw", "w"),
-                       traces=traces) as sim:
-            sim.add_clock(1e-6, phase=0.25e-6)
-            delay = Delay(1e-7)
-
-            def async_process():
-                yield module.enable.eq(0)
-                yield Tick()
-                self.assertEqual((yield module.state), 0x1)
-                yield Tick()
-                self.assertEqual((yield module.state), 0x1)
-                yield module.enable.eq(1)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x2)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x5)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x3)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x7)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x6)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x4)
-                yield Tick()
-                yield delay
-                self.assertEqual((yield module.state), 0x1)
-                yield Tick()
-
-            sim.add_process(async_process)
-            sim.run()
diff --git a/src/unused/TLB/test/test_address_encoder.py b/src/unused/TLB/test/test_address_encoder.py
deleted file mode 100644 (file)
index 70d435d..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-from nmigen.compat.sim import run_simulation
-from soc.TLB.AddressEncoder import AddressEncoder
-from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
-
-
-# This function allows for the easy setting of values to the AddressEncoder
-# Arguments:
-#   dut: The AddressEncoder being tested
-#   i (Input): The array of single bits to be written
-def set_encoder(dut, i):
-    yield dut.i.eq(i)
-    yield
-
-# Checks the single match of the AddressEncoder
-# Arguments:
-#   dut: The AddressEncoder being tested
-#   sm (Single Match): The expected match result
-#   op (Operation): (0 => ==), (1 => !=)
-
-
-def check_single_match(dut, sm, op):
-    out_sm = yield dut.single_match
-    assert_op("Single Match", out_sm, sm, op)
-
-# Checks the multiple match of the AddressEncoder
-# Arguments:
-#   dut: The AddressEncoder being tested
-#   mm (Multiple Match): The expected match result
-#   op (Operation): (0 => ==), (1 => !=)
-
-
-def check_multiple_match(dut, mm, op):
-    out_mm = yield dut.multiple_match
-    assert_op("Multiple Match", out_mm, mm, op)
-
-# Checks the output of the AddressEncoder
-# Arguments:
-#   dut: The AddressEncoder being tested
-#   o (Output): The expected output
-#   op (Operation): (0 => ==), (1 => !=)
-
-
-def check_output(dut, o, op):
-    out_o = yield dut.o
-    assert_op("Output", out_o, o, op)
-
-# Checks the state of the AddressEncoder
-# Arguments:
-#   dut: The AddressEncoder being tested
-#   sm (Single Match): The expected match result
-#   mm (Multiple Match): The expected match result
-#   o (Output): The expected output
-#   ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-#   mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-#   o_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-
-
-def check_all(dut, sm, mm, o, sm_op, mm_op, o_op):
-    yield from check_single_match(dut, sm, sm_op)
-    yield from check_multiple_match(dut, mm, mm_op)
-    yield from check_output(dut, o, o_op)
-
-
-def tbench(dut):
-    # Check invalid input
-    in_val = 0b000
-    single_match = 0
-    multiple_match = 0
-    output = 0
-    yield from set_encoder(dut, in_val)
-    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
-    # Check single bit
-    in_val = 0b001
-    single_match = 1
-    multiple_match = 0
-    output = 0
-    yield from set_encoder(dut, in_val)
-    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
-    # Check another single bit
-    in_val = 0b100
-    single_match = 1
-    multiple_match = 0
-    output = 2
-    yield from set_encoder(dut, in_val)
-    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
-    # Check multiple match
-    # We expected the lowest bit to be returned which is address 0
-    in_val = 0b101
-    single_match = 0
-    multiple_match = 1
-    output = 0
-    yield from set_encoder(dut, in_val)
-    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
-    # Check another multiple match
-    # We expected the lowest bit to be returned which is address 1
-    in_val = 0b110
-    single_match = 0
-    multiple_match = 1
-    output = 1
-    yield from set_encoder(dut, in_val)
-    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
-
-
-def test_addr():
-    dut = AddressEncoder(4)
-    run_simulation(dut, tbench(dut),
-                   vcd_name="Waveforms/test_address_encoder.vcd")
-    print("AddressEncoder Unit Test Success")
-
-
-if __name__ == "__main__":
-    test_addr()
diff --git a/src/unused/TLB/test/test_cam.py b/src/unused/TLB/test/test_cam.py
deleted file mode 100644 (file)
index d11cd97..0000000
+++ /dev/null
@@ -1,218 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.Cam import Cam
-
-from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
-
-# This function allows for the easy setting of values to the Cam
-# Arguments:
-#   dut: The Cam being tested
-#   e (Enable): Whether the block is going to be enabled
-#   we (Write Enable): Whether the Cam will write on the next cycle
-#   a (Address): Where the data will be written if write enable is high
-#   d (Data): Either what we are looking for or will write to the address
-
-
-def set_cam(dut, e, we, a, d):
-    yield dut.enable.eq(e)
-    yield dut.write_enable.eq(we)
-    yield dut.address_in.eq(a)
-    yield dut.data_in.eq(d)
-    yield
-
-# Checks the multiple match of the Cam
-# Arguments:
-#   dut: The Cam being tested
-#   mm (Multiple Match): The expected match result
-#   op (Operation): (0 => ==), (1 => !=)
-
-
-def check_multiple_match(dut, mm, op):
-    out_mm = yield dut.multiple_match
-    assert_op("Multiple Match", out_mm, mm, op)
-
-# Checks the single match of the Cam
-# Arguments:
-#   dut: The Cam being tested
-#   sm (Single Match): The expected match result
-#   op (Operation): (0 => ==), (1 => !=)
-
-
-def check_single_match(dut, sm, op):
-    out_sm = yield dut.single_match
-    assert_op("Single Match", out_sm, sm, op)
-
-# Checks the address output of the Cam
-# Arguments:
-#   dut: The Cam being tested
-#   ma (Match Address): The expected match result
-#   op (Operation): (0 => ==), (1 => !=)
-
-
-def check_match_address(dut, ma, op):
-    out_ma = yield dut.match_address
-    assert_op("Match Address", out_ma, ma, op)
-
-# Checks the state of the Cam
-# Arguments:
-#   dut: The Cam being tested
-#   sm (Single Match): The expected match result
-#   mm (Multiple Match): The expected match result
-#   ma: (Match Address): The expected address output
-#   ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-#   mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-#   ma_op (Operation): Operation for the address assertion (0 => ==), (1 => !=)
-
-
-def check_all(dut, mm, sm, ma, mm_op, sm_op, ma_op):
-    yield from check_multiple_match(dut, mm, mm_op)
-    yield from check_single_match(dut, sm, sm_op)
-    yield from check_match_address(dut, ma, ma_op)
-
-
-def tbench(dut):
-    # NA
-    enable = 0
-    write_enable = 0
-    address = 0
-    data = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Read Miss Multiple
-    # Note that the default starting entry data bits are all 0
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 0
-    multiple_match = 1
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_multiple_match(dut, multiple_match, 0)
-
-    # Read Miss
-    # Note that the default starting entry data bits are all 0
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 1
-    multiple_match = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Write Entry 0
-    enable = 1
-    write_enable = 1
-    address = 0
-    data = 4
-    multiple_match = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Read Hit Entry 0
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 4
-    multiple_match = 0
-    single_match = 1
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
-
-    # Search Hit
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 4
-    multiple_match = 0
-    single_match = 1
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
-
-    # Search Miss
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 5
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Multiple Match test
-    # Write Entry 1
-    enable = 1
-    write_enable = 1
-    address = 1
-    data = 5
-    multiple_match = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Write Entry 2
-    # Same data as Entry 1
-    enable = 1
-    write_enable = 1
-    address = 2
-    data = 5
-    multiple_match = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    # Read Hit Data 5
-    enable = 1
-    write_enable = 0
-    address = 1
-    data = 5
-    multiple_match = 1
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
-
-    # Verify read_warning is not caused
-    # Write Entry 0
-    enable = 1
-    write_enable = 1
-    address = 0
-    data = 7
-    multiple_match = 0
-    single_match = 0
-    yield from set_cam(dut, enable, write_enable, address, data)
-    # Note there is no yield we immediately attempt to read in the next cycle
-
-    # Read Hit Data 7
-    enable = 1
-    write_enable = 0
-    address = 0
-    data = 7
-    multiple_match = 0
-    single_match = 1
-    yield from set_cam(dut, enable, write_enable, address, data)
-    yield
-    yield from check_single_match(dut, single_match, 0)
-
-    yield
-
-
-def test_cam():
-    dut = Cam(4, 4)
-    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam.vcd")
-    print("Cam Unit Test Success")
-
-
-if __name__ == "__main__":
-    test_cam()
diff --git a/src/unused/TLB/test/test_cam_entry.py b/src/unused/TLB/test/test_cam_entry.py
deleted file mode 100644 (file)
index 961445b..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
-from soc.TLB.CamEntry import CamEntry
-
-# This function allows for the easy setting of values to the Cam Entry
-# Arguments:
-#   dut: The CamEntry being tested
-#   c (command): NA (0), Read (1), Write (2), Reserve (3)
-#   d (data): The data to be set
-
-
-def set_cam_entry(dut, c, d):
-    # Write desired values
-    yield dut.command.eq(c)
-    yield dut.data_in.eq(d)
-    yield
-    # Reset all lines
-    yield dut.command.eq(0)
-    yield dut.data_in.eq(0)
-    yield
-
-# Checks the data state of the CAM entry
-# Arguments:
-#   dut: The CamEntry being tested
-#   d (Data): The expected data
-#   op (Operation): (0 => ==), (1 => !=)
-
-
-def check_data(dut, d, op):
-    out_d = yield dut.data
-    assert_op("Data", out_d, d, op)
-
-# Checks the match state of the CAM entry
-# Arguments:
-#   dut: The CamEntry being tested
-#   m (Match): The expected match
-#   op (Operation): (0 => ==), (1 => !=)
-
-
-def check_match(dut, m, op):
-    out_m = yield dut.match
-    assert_op("Match", out_m, m, op)
-
-# Checks the state of the CAM entry
-# Arguments:
-#   dut: The CamEntry being tested
-#   d (data): The expected data
-#   m (match): The expected match
-#   d_op (Operation): Operation for the data assertion (0 => ==), (1 => !=)
-#   m_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
-
-
-def check_all(dut, d, m, d_op, m_op):
-    yield from check_data(dut, d, d_op)
-    yield from check_match(dut, m, m_op)
-
-# This tbench goes through the paces of testing the CamEntry module
-# It is done by writing and then reading various combinations of key/data pairs
-# and reading the results with varying keys to verify the resulting stored
-# data is correct.
-
-
-def tbench(dut):
-    # Check write
-    command = 2
-    data = 1
-    match = 0
-    yield from set_cam_entry(dut, command, data)
-    yield from check_all(dut, data, match, 0, 0)
-
-    # Check read miss
-    command = 1
-    data = 2
-    match = 0
-    yield from set_cam_entry(dut, command, data)
-    yield from check_all(dut, data, match, 1, 0)
-
-    # Check read hit
-    command = 1
-    data = 1
-    match = 1
-    yield from set_cam_entry(dut, command, data)
-    yield from check_all(dut, data, match, 0, 0)
-
-    # Check overwrite
-    command = 2
-    data = 5
-    match = 0
-    yield from set_cam_entry(dut, command, data)
-    yield
-    yield from check_all(dut, data, match, 0, 0)
-
-    # Check read hit
-    command = 1
-    data = 5
-    match = 1
-    yield from set_cam_entry(dut, command, data)
-    yield from check_all(dut, data, match, 0, 0)
-
-    # Check reset
-    command = 3
-    data = 0
-    match = 0
-    yield from set_cam_entry(dut, command, data)
-    yield from check_all(dut, data, match, 0, 0)
-
-    # Extra clock cycle for waveform
-    yield
-
-
-def test_camentry():
-    dut = CamEntry(4)
-    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam_entry.vcd")
-    print("CamEntry Unit Test Success")
-
-
-if __name__ == "__main__":
-    test_camentry()
diff --git a/src/unused/TLB/test/test_permission_validator.py b/src/unused/TLB/test/test_permission_validator.py
deleted file mode 100644 (file)
index b52b545..0000000
+++ /dev/null
@@ -1,150 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.PermissionValidator import PermissionValidator
-
-from soc.TestUtil.test_helper import assert_op
-
-
-def set_validator(dut, d, xwr, sm, sa, asid):
-    yield dut.data.eq(d)
-    yield dut.xwr.eq(xwr)
-    yield dut.super_mode.eq(sm)
-    yield dut.super_access.eq(sa)
-    yield dut.asid.eq(asid)
-    yield
-
-
-def check_valid(dut, v, op):
-    out_v = yield dut.valid
-    assert_op("Valid", out_v, v, op)
-
-
-def tbench(dut):
-    # 80 bits represented. Ignore the MSB as it will be truncated
-    # ASID is bits first 4 hex values (bits 64 - 78)
-
-    # Test user mode entry valid
-    # Global Bit matching ASID
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000031
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 0
-    super_access = 0
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test user mode entry valid
-    # Global Bit nonmatching ASID
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000031
-    # Ignore MSB it will be truncated
-    asid = 0x7FF6
-    super_mode = 0
-    super_access = 0
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test user mode entry invalid
-    # Global Bit nonmatching ASID
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000021
-    # Ignore MSB it will be truncated
-    asid = 0x7FF6
-    super_mode = 0
-    super_access = 0
-    xwr = 0
-    valid = 0
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test user mode entry valid
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000011
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 0
-    super_access = 0
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test user mode entry invalid
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000011
-    # Ignore MSB it will be truncated
-    asid = 0x7FF6
-    super_mode = 0
-    super_access = 0
-    xwr = 0
-    valid = 0
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test supervisor mode entry valid
-    # The entry is NOT in user mode
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000001
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 1
-    super_access = 0
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test supervisor mode entry invalid
-    # The entry is in user mode
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000011
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 1
-    super_access = 0
-    xwr = 0
-    valid = 0
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test supervisor mode entry valid
-    # The entry is NOT in user mode with access
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000001
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 1
-    super_access = 1
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-    # Test supervisor mode entry valid
-    # The entry is in user mode with access
-    # Ensure that user mode and valid is enabled!
-    data = 0x7FFF0000000000000011
-    # Ignore MSB it will be truncated
-    asid = 0x7FFF
-    super_mode = 1
-    super_access = 1
-    xwr = 0
-    valid = 1
-    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
-    yield from check_valid(dut, valid, 0)
-
-
-def test_permv():
-    dut = PermissionValidator(15, 64)
-    run_simulation(dut, tbench(
-        dut), vcd_name="Waveforms/test_permission_validator.vcd")
-    print("PermissionValidator Unit Test Success")
-
-
-if __name__ == "__main__":
-    test_permv()
diff --git a/src/unused/TLB/test/test_pte_entry.py b/src/unused/TLB/test/test_pte_entry.py
deleted file mode 100644 (file)
index 51b3dcf..0000000
+++ /dev/null
@@ -1,114 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.PteEntry import PteEntry
-
-from soc.TestUtil.test_helper import assert_op
-
-
-def set_entry(dut, i):
-    yield dut.i.eq(i)
-    yield
-
-
-def check_dirty(dut, d, op):
-    out_d = yield dut.d
-    assert_op("Dirty", out_d, d, op)
-
-
-def check_accessed(dut, a, op):
-    out_a = yield dut.a
-    assert_op("Accessed", out_a, a, op)
-
-
-def check_global(dut, o, op):
-    out = yield dut.g
-    assert_op("Global", out, o, op)
-
-
-def check_user(dut, o, op):
-    out = yield dut.u
-    assert_op("User Mode", out, o, op)
-
-
-def check_xwr(dut, o, op):
-    out = yield dut.xwr
-    assert_op("XWR", out, o, op)
-
-
-def check_asid(dut, o, op):
-    out = yield dut.asid
-    assert_op("ASID", out, o, op)
-
-
-def check_pte(dut, o, op):
-    out = yield dut.pte
-    assert_op("ASID", out, o, op)
-
-
-def check_valid(dut, v, op):
-    out_v = yield dut.v
-    assert_op("Valid", out_v, v, op)
-
-
-def check_all(dut, d, a, g, u, xwr, v, asid, pte):
-    yield from check_dirty(dut, d, 0)
-    yield from check_accessed(dut, a, 0)
-    yield from check_global(dut, g, 0)
-    yield from check_user(dut, u, 0)
-    yield from check_xwr(dut, xwr, 0)
-    yield from check_asid(dut, asid, 0)
-    yield from check_pte(dut, pte, 0)
-    yield from check_valid(dut, v, 0)
-
-
-def tbench(dut):
-    # 80 bits represented. Ignore the MSB as it will be truncated
-    # ASID is bits first 4 hex values (bits 64 - 78)
-
-    i = 0x7FFF0000000000000031
-    dirty = 0
-    access = 0
-    glob = 1
-    user = 1
-    xwr = 0
-    valid = 1
-    asid = 0x7FFF
-    pte = 0x0000000000000031
-    yield from set_entry(dut, i)
-    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
-
-    i = 0x0FFF00000000000000FF
-    dirty = 1
-    access = 1
-    glob = 1
-    user = 1
-    xwr = 7
-    valid = 1
-    asid = 0x0FFF
-    pte = 0x00000000000000FF
-    yield from set_entry(dut, i)
-    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
-
-    i = 0x0721000000001100001F
-    dirty = 0
-    access = 0
-    glob = 0
-    user = 1
-    xwr = 7
-    valid = 1
-    asid = 0x0721
-    pte = 0x000000001100001F
-    yield from set_entry(dut, i)
-    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
-
-    yield
-
-
-def test_pteentry():
-    dut = PteEntry(15, 64)
-    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_pte_entry.vcd")
-    print("PteEntry Unit Test Success")
-
-
-if __name__ == "__main__":
-    test_pteentry()
diff --git a/src/unused/TLB/test/test_set_associative_cache.py b/src/unused/TLB/test/test_set_associative_cache.py
deleted file mode 100644 (file)
index edec055..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.SetAssociativeCache import SetAssociativeCache
-
-from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
-
-
-def set_sac(dut, e, c, s, t, d):
-    yield dut.enable.eq(e)
-    yield dut.command.eq(c)
-    yield dut.cset.eq(s)
-    yield dut.tag.eq(t)
-    yield dut.data_i.eq(d)
-    yield
-
-
-def tbench(dut):
-    enable = 1
-    command = 2
-    cset = 1
-    tag = 2
-    data = 3
-    yield from set_sac(dut, enable, command, cset, tag, data)
-    yield
-
-    enable = 1
-    command = 2
-    cset = 1
-    tag = 5
-    data = 8
-    yield from set_sac(dut, enable, command, cset, tag, data)
-    yield
-
-
-def test_assoc_cache():
-    dut = SetAssociativeCache(4, 4, 4, 4)
-    run_simulation(dut, tbench(
-        dut), vcd_name="Waveforms/test_set_associative_cache.vcd")
-    print("Set Associative Cache Unit Test Success")
-
-
-if __name__ == "__main__":
-    test_assoc_cache()
diff --git a/src/unused/TLB/test/test_tlb.py b/src/unused/TLB/test/test_tlb.py
deleted file mode 100644 (file)
index 3865662..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-#import tracemalloc
-# tracemalloc.start()
-
-from nmigen.compat.sim import run_simulation
-
-from soc.TLB.TLB import TLB
-
-from soc.TestUtil.test_helper import assert_op, assert_eq
-
-# self.supermode = Signal(1) # Supervisor Mode
-# self.super_access = Signal(1) # Supervisor Access
-# self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2
-# self.xwr = Signal(3) # Execute, Write, Read
-# self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64
-#self.address_L1 = Signal(range(L1_size))
-# self.asid = Signal(asid_size) # Address Space IDentifier (ASID)
-# self.vma = Signal(vma_size) # Virtual Memory Address (VMA)
-# self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE)
-#
-# self.hit = Signal(1) # Denotes if the VMA had a mapped PTE
-# self.perm_valid = Signal(1) # Denotes if the permissions are correct
-# self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA
-
-COMMAND_READ = 1
-COMMAND_WRITE_L1 = 2
-
-# Checks the data state of the CAM entry
-# Arguments:
-#   dut: The CamEntry being tested
-#   d (Data): The expected data
-#   op (Operation): (0 => ==), (1 => !=)
-
-
-def check_hit(dut, d):
-    hit_d = yield dut.hit
-    #assert_eq("hit", hit_d, d)
-
-
-def tst_command(dut, cmd, xwr, cycles):
-    yield dut.command.eq(cmd)
-    yield dut.xwr.eq(xwr)
-    for i in range(0, cycles):
-        yield
-
-
-def tst_write_L1(dut, vma, address_L1, asid, pte_in):
-    yield dut.address_L1.eq(address_L1)
-    yield dut.asid.eq(asid)
-    yield dut.vma.eq(vma)
-    yield dut.pte_in.eq(pte_in)
-    yield from tst_command(dut, COMMAND_WRITE_L1, 7, 2)
-
-
-def tst_search(dut, vma, found):
-    yield dut.vma.eq(vma)
-    yield from tst_command(dut, COMMAND_READ, 7, 1)
-    yield from check_hit(dut, found)
-
-
-def zero(dut):
-    yield dut.supermode.eq(0)
-    yield dut.super_access.eq(0)
-    yield dut.mode.eq(0)
-    yield dut.address_L1.eq(0)
-    yield dut.asid.eq(0)
-    yield dut.vma.eq(0)
-    yield dut.pte_in.eq(0)
-
-
-def tbench(dut):
-    yield from zero(dut)
-    yield dut.mode.eq(0xF)  # enable TLB
-    # test hit
-    yield from tst_write_L1(dut, 0xFEEDFACE, 0, 0xFFFF, 0xF0F0)
-    yield from tst_search(dut, 0xFEEDFACE, 1)
-    yield from tst_search(dut, 0xFACEFEED, 0)
-
-
-def test_tlb():
-    dut = TLB(15, 36, 64, 8)
-    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_tlb.vcd")
-    print("TLB Unit Test Success")
-
-
-if __name__ == "__main__":
-    test_tlb()
diff --git a/src/unused/__init__.py b/src/unused/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/src/unused/experiment/l0_cache.py b/src/unused/experiment/l0_cache.py
deleted file mode 100644 (file)
index 3b8a7d9..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-class DualPortSplitter(Elaboratable):
-    """DualPortSplitter
-
-    * one incoming PortInterface
-    * two *OUTGOING* PortInterfaces
-    * uses LDSTSplitter to do it
-
-    (actually, thinking about it LDSTSplitter could simply be
-     modified to conform to PortInterface: one in, two out)
-
-    once that is done each pair of ports may be wired directly
-    to the dual ports of L0CacheBuffer
-
-    The split is carried out so that, regardless of alignment or
-    mis-alignment, outgoing PortInterface[0] takes bit 4 == 0
-    of the address, whilst outgoing PortInterface[1] takes
-    bit 4 == 1.
-
-    PortInterface *may* need to be changed so that the length is
-    a binary number (accepting values 1-16).
-    """
-
-    def __init__(self,inp):
-        self.outp = [PortInterface(name="outp_0"),
-                     PortInterface(name="outp_1")]
-        print(self.outp)
-
-    def elaborate(self, platform):
-        m = Module()
-        comb = m.d.comb
-        m.submodules.splitter = splitter = LDSTSplitter(64, 48, 4)
-        self.inp = splitter.pi
-        comb += splitter.addr_i.eq(self.inp.addr)  # XXX
-        #comb += splitter.len_i.eq()
-        #comb += splitter.valid_i.eq()
-        comb += splitter.is_ld_i.eq(self.inp.is_ld_i)
-        comb += splitter.is_st_i.eq(self.inp.is_st_i)
-        #comb += splitter.st_data_i.eq()
-        #comb += splitter.sld_valid_i.eq()
-        #comb += splitter.sld_data_i.eq()
-        #comb += splitter.sst_valid_i.eq()
-        return m
diff --git a/src/unused/iommu/__init__.py b/src/unused/iommu/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/src/unused/iommu/axi_rab/__init__.py b/src/unused/iommu/axi_rab/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/src/unused/iommu/axi_rab/axi4_ar_buffer.py b/src/unused/iommu/axi_rab/axi4_ar_buffer.py
deleted file mode 100644 (file)
index 1f3a5ff..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright 2018 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the "License"); you may not use this file except in
-# compliance with the License.  You may obtain a copy of the License at
-# http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-# module axi4_ar_buffer
-#  #(
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_arid,
-#    input  logic               [31:0] s_axi4_araddr,
-#    input  logic                      s_axi4_arvalid,
-#    output logic                      s_axi4_arready,
-#    input  logic                [7:0] s_axi4_arlen,
-#    input  logic                [2:0] s_axi4_arsize,
-#    input  logic                [1:0] s_axi4_arburst,
-#    input  logic                      s_axi4_arlock,
-#    input  logic                [2:0] s_axi4_arprot,
-#    input  logic                [3:0] s_axi4_arcache,
-#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_arid,
-#    output logic               [31:0] m_axi4_araddr,
-#    output logic                      m_axi4_arvalid,
-#    input  logic                      m_axi4_arready,
-#    output logic                [7:0] m_axi4_arlen,
-#    output logic                [2:0] m_axi4_arsize,
-#    output logic                [1:0] m_axi4_arburst,
-#    output logic                      m_axi4_arlock,
-#    output logic                [2:0] m_axi4_arprot,
-#    output logic                [3:0] m_axi4_arcache,
-#    output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
-#  );
-
-
-class axi4_ar_buffer(Elaboratable):
-
-    def __init__(self):
-        # self.axi4_aclk = Signal() # input
-        # self.axi4_arstn = Signal() # input
-        self.s_axi4_arid = Signal(AXI_ID_WIDTH)  # input
-        self.s_axi4_araddr = Signal(32)  # input
-        self.s_axi4_arvalid = Signal()  # input
-        self.s_axi4_arready = Signal()  # output
-        self.s_axi4_arlen = Signal(8)  # input
-        self.s_axi4_arsize = Signal(3)  # input
-        self.s_axi4_arburst = Signal(2)  # input
-        self.s_axi4_arlock = Signal()  # input
-        self.s_axi4_arprot = Signal(3)  # input
-        self.s_axi4_arcache = Signal(4)  # input
-        self.s_axi4_aruser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_arid = Signal(AXI_ID_WIDTH)  # output
-        self.m_axi4_araddr = Signal(32)  # output
-        self.m_axi4_arvalid = Signal()  # output
-        self.m_axi4_arready = Signal()  # input
-        self.m_axi4_arlen = Signal(8)  # output
-        self.m_axi4_arsize = Signal(3)  # output
-        self.m_axi4_arburst = Signal(2)  # output
-        self.m_axi4_arlock = Signal()  # output
-        self.m_axi4_arprot = Signal(3)  # output
-        self.m_axi4_arcache = Signal(4)  # output
-        self.m_axi4_aruser = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        #  #TODO use record types here
-        #  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_in;
-        #  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_out;
-
-        # assign data_in                                           [3:0] = s_axi4_arcache;
-        # assign data_in                                           [6:4] = s_axi4_arprot;
-        # assign data_in                                             [7] = s_axi4_arlock;
-        # assign data_in                                           [9:8] = s_axi4_arburst;
-        # assign data_in                                         [12:10] = s_axi4_arsize;
-        # assign data_in                                         [20:13] = s_axi4_arlen;
-        # assign data_in                                         [52:21] = s_axi4_araddr;
-        # assign data_in                            [52+AXI_ID_WIDTH:53] = s_axi4_arid;
-        # assign data_in[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH] = s_axi4_aruser;
-        #
-        # assign m_axi4_arcache = data_out[3:0];
-        # assign m_axi4_arprot  = data_out[6:4];
-        # assign m_axi4_arlock  = data_out[7];
-        # assign m_axi4_arburst = data_out[9:8];
-        # assign m_axi4_arsize  = data_out[12:10];
-        # assign m_axi4_arlen   = data_out[20:13];
-        # assign m_axi4_araddr  = data_out[52:21];
-        # assign m_axi4_arid    = data_out[52+AXI_ID_WIDTH:53];
-        # assign m_axi4_aruser  = data_out[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH];
-
-        # m.d.comb += self.m_axi4_arcache.eq(..)
-        # m.d.comb += self.m_axi4_arprot.eq(..)
-        # m.d.comb += self.m_axi4_arlock.eq(..)
-        # m.d.comb += self.m_axi4_arburst.eq(..)
-        # m.d.comb += self.m_axi4_arsize.eq(..)
-        # m.d.comb += self.m_axi4_arlen.eq(..)
-        # m.d.comb += self.m_axi4_araddr.eq(..)
-        # m.d.comb += self.m_axi4_arid.eq(..)
-        # m.d.comb += self.m_axi4_aruser.eq(..)
-        return m
-
-# TODO convert axi_buffer_rab.sv
-#
-#  axi_buffer_rab
-#    #(
-#      .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+53  ),
-#      .BUFFER_DEPTH ( 4                               )
-#      )
-#    u_buffer
-#    (
-#      .clk       ( axi4_aclk      ),
-#      .rstn      ( axi4_arstn     ),
-#      .valid_out ( m_axi4_arvalid ),
-#      .data_out  ( data_out       ),
-#      .ready_in  ( m_axi4_arready ),
-#      .valid_in  ( s_axi4_arvalid ),
-#      .data_in   ( data_in        ),
-#      .ready_out ( s_axi4_arready )
-#    );
-#
-
-# endmodule
diff --git a/src/unused/iommu/axi_rab/axi4_ar_sender.py b/src/unused/iommu/axi_rab/axi4_ar_sender.py
deleted file mode 100644 (file)
index 4cbd97d..0000000
+++ /dev/null
@@ -1,232 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_ar_sender(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.l1_done_o = Signal()  # output
-        self.l1_accept_i = Signal()  # input
-        self.l1_drop_i = Signal()  # input
-        self.l1_save_i = Signal()  # input
-        self.l2_done_o = Signal()  # output
-        self.l2_accept_i = Signal()  # input
-        self.l2_drop_i = Signal()  # input
-        self.l2_sending_o = Signal()  # output
-        self.l1_araddr_i = Signal(AXI_ADDR_WIDTH)  # input
-        self.l2_araddr_i = Signal(AXI_ADDR_WIDTH)  # input
-        self.s_axi4_arid = Signal(AXI_ID_WIDTH)  # input
-        self.s_axi4_arvalid = Signal()  # input
-        self.s_axi4_arready = Signal()  # output
-        self.s_axi4_arlen = Signal(8)  # input
-        self.s_axi4_arsize = Signal(3)  # input
-        self.s_axi4_arburst = Signal(2)  # input
-        self.s_axi4_arlock = Signal()  # input
-        self.s_axi4_arprot = Signal(3)  # input
-        self.s_axi4_arcache = Signal(4)  # input
-        self.s_axi4_aruser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_arid = Signal(AXI_ID_WIDTH)  # output
-        self.m_axi4_araddr = Signal(AXI_ADDR_WIDTH)  # output
-        self.m_axi4_arvalid = Signal()  # output
-        self.m_axi4_arready = Signal()  # input
-        self.m_axi4_arlen = Signal(8)  # output
-        self.m_axi4_arsize = Signal(3)  # output
-        self.m_axi4_arburst = Signal(2)  # output
-        self.m_axi4_arlock = Signal()  # output
-        self.m_axi4_arprot = Signal(3)  # output
-        self.m_axi4_arcache = Signal(4)  # output
-        self.m_axi4_aruser = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.l1_save.eq(self.None)
-        m.d.comb += self.l1_done_o.eq(self.None)
-        m.d.comb += self.m_axi4_arvalid.eq(self.None)
-        m.d.comb += self.s_axi4_arready.eq(self.None)
-        m.d.comb += self.m_axi4_aruser.eq(self.None)
-        m.d.comb += self.m_axi4_arcache.eq(self.None)
-        m.d.comb += self.m_axi4_arprot.eq(self.None)
-        m.d.comb += self.m_axi4_arlock.eq(self.None)
-        m.d.comb += self.m_axi4_arburst.eq(self.None)
-        m.d.comb += self.m_axi4_arsize.eq(self.None)
-        m.d.comb += self.m_axi4_arlen.eq(self.None)
-        m.d.comb += self.m_axi4_araddr.eq(self.None)
-        m.d.comb += self.m_axi4_arid.eq(self.None)
-        m.d.comb += self.l2_sending_o.eq(self.None)
-        m.d.comb += self.l2_sent.eq(self.None)
-        m.d.comb += self.l2_done_o.eq(self.None)
-        m.d.comb += self.m_axi4_aruser.eq(self.s_axi4_aruser)
-        m.d.comb += self.m_axi4_arcache.eq(self.s_axi4_arcache)
-        m.d.comb += self.m_axi4_arprot.eq(self.s_axi4_arprot)
-        m.d.comb += self.m_axi4_arlock.eq(self.s_axi4_arlock)
-        m.d.comb += self.m_axi4_arburst.eq(self.s_axi4_arburst)
-        m.d.comb += self.m_axi4_arsize.eq(self.s_axi4_arsize)
-        m.d.comb += self.m_axi4_arlen.eq(self.s_axi4_arlen)
-        m.d.comb += self.m_axi4_araddr.eq(self.l1_araddr_i)
-        m.d.comb += self.m_axi4_arid.eq(self.s_axi4_arid)
-        m.d.comb += self.l2_sending_o.eq(self.1: 'b0)
-        m.d.comb += self.l2_available_q.eq(self.1: 'b0)
-        m.d.comb += self.l2_done_o.eq(self.1: 'b0)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_ar_sender
-#  #(
-#    parameter AXI_ADDR_WIDTH = 40,
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4,
-#    parameter ENABLE_L2TLB   = 0
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    output logic                      l1_done_o,
-#    input  logic                      l1_accept_i,
-#    input  logic                      l1_drop_i,
-#    input  logic                      l1_save_i,
-#
-#    output logic                      l2_done_o,
-#    input  logic                      l2_accept_i,
-#    input  logic                      l2_drop_i,
-#    output logic                      l2_sending_o,
-#
-#    input  logic [AXI_ADDR_WIDTH-1:0] l1_araddr_i,
-#    input  logic [AXI_ADDR_WIDTH-1:0] l2_araddr_i,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_arid,
-#    input  logic                      s_axi4_arvalid,
-#    output logic                      s_axi4_arready,
-#    input  logic                [7:0] s_axi4_arlen,
-#    input  logic                [2:0] s_axi4_arsize,
-#    input  logic                [1:0] s_axi4_arburst,
-#    input  logic                      s_axi4_arlock,
-#    input  logic                [2:0] s_axi4_arprot,
-#    input  logic                [3:0] s_axi4_arcache,
-#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_arid,
-#    output logic [AXI_ADDR_WIDTH-1:0] m_axi4_araddr,
-#    output logic                      m_axi4_arvalid,
-#    input  logic                      m_axi4_arready,
-#    output logic                [7:0] m_axi4_arlen,
-#    output logic                [2:0] m_axi4_arsize,
-#    output logic                [1:0] m_axi4_arburst,
-#    output logic                      m_axi4_arlock,
-#    output logic                [2:0] m_axi4_arprot,
-#    output logic                [3:0] m_axi4_arcache,
-#    output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
-#  );
-#
-#  logic l1_save;
-#
-#  logic l2_sent;
-#  logic l2_available_q;
-#
-#  assign l1_save      = l1_save_i & l2_available_q;
-#
-#  assign l1_done_o    = s_axi4_arvalid & s_axi4_arready ;
-#
-#  // if 1: accept and forward a transaction translated by L1
-#  //    2: drop or save request (if L2 slot not occupied already)
-#  assign m_axi4_arvalid = (s_axi4_arvalid & l1_accept_i) |
-#                          l2_sending_o;
-#  assign s_axi4_arready = (m_axi4_arvalid & m_axi4_arready & ~l2_sending_o) |
-#                          (s_axi4_arvalid & (l1_drop_i | l1_save));
-#
-# generate
-#  if (ENABLE_L2TLB == 1) begin
-#    logic [AXI_USER_WIDTH-1:0] l2_axi4_aruser  ;
-#    logic                [3:0] l2_axi4_arcache ;
-#    logic                [3:0] l2_axi4_arregion;
-#    logic                [3:0] l2_axi4_arqos   ;
-#    logic                [2:0] l2_axi4_arprot  ;
-#    logic                      l2_axi4_arlock  ;
-#    logic                [1:0] l2_axi4_arburst ;
-#    logic                [2:0] l2_axi4_arsize  ;
-#    logic                [7:0] l2_axi4_arlen   ;
-#    logic   [AXI_ID_WIDTH-1:0] l2_axi4_arid    ;
-#
-#    assign m_axi4_aruser  = l2_sending_o ? l2_axi4_aruser   : s_axi4_aruser;
-#    assign m_axi4_arcache = l2_sending_o ? l2_axi4_arcache  : s_axi4_arcache;
-#    assign m_axi4_arprot  = l2_sending_o ? l2_axi4_arprot   : s_axi4_arprot;
-#    assign m_axi4_arlock  = l2_sending_o ? l2_axi4_arlock   : s_axi4_arlock;
-#    assign m_axi4_arburst = l2_sending_o ? l2_axi4_arburst  : s_axi4_arburst;
-#    assign m_axi4_arsize  = l2_sending_o ? l2_axi4_arsize   : s_axi4_arsize;
-#    assign m_axi4_arlen   = l2_sending_o ? l2_axi4_arlen    : s_axi4_arlen;
-#    assign m_axi4_araddr  = l2_sending_o ? l2_araddr_i      : l1_araddr_i;
-#    assign m_axi4_arid    = l2_sending_o ? l2_axi4_arid     : s_axi4_arid;
-#
-#    // Buffer AXI signals in case of L1 miss
-#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
-#      if (axi4_arstn == 1'b0) begin
-#        l2_axi4_aruser  <=  'b0;
-#        l2_axi4_arcache <=  'b0;
-#        l2_axi4_arprot  <=  'b0;
-#        l2_axi4_arlock  <= 1'b0;
-#        l2_axi4_arburst <=  'b0;
-#        l2_axi4_arsize  <=  'b0;
-#        l2_axi4_arlen   <=  'b0;
-#        l2_axi4_arid    <=  'b0;
-#      end else if (l1_save) begin
-#        l2_axi4_aruser  <= s_axi4_aruser;
-#        l2_axi4_arcache <= s_axi4_arcache;
-#        l2_axi4_arprot  <= s_axi4_arprot;
-#        l2_axi4_arlock  <= s_axi4_arlock;
-#        l2_axi4_arburst <= s_axi4_arburst;
-#        l2_axi4_arsize  <= s_axi4_arsize;
-#        l2_axi4_arlen   <= s_axi4_arlen;
-#        l2_axi4_arid    <= s_axi4_arid;
-#      end
-#    end
-#
-#    // signal that an l1_save_i can be accepted
-#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
-#      if (axi4_arstn == 1'b0) begin
-#        l2_available_q <= 1'b1;
-#      end else if (l2_sent | l2_drop_i) begin
-#        l2_available_q <= 1'b1;
-#      end else if (l1_save) begin
-#        l2_available_q <= 1'b0;
-#      end
-#    end
-#
-#    assign l2_sending_o = l2_accept_i & ~l2_available_q;
-#    assign l2_sent      = l2_sending_o & m_axi4_arvalid & m_axi4_arready;
-#
-#    // if 1: having sent out a transaction translated by L2
-#    //    2: drop request (L2 slot is available again)
-#    assign l2_done_o    = l2_sent | l2_drop_i;
-#
-#  end else begin // !`ifdef ENABLE_L2TLB
-#    assign m_axi4_aruser  =  s_axi4_aruser;
-#    assign m_axi4_arcache =  s_axi4_arcache;
-#    assign m_axi4_arprot  =  s_axi4_arprot;
-#    assign m_axi4_arlock  =  s_axi4_arlock;
-#    assign m_axi4_arburst =  s_axi4_arburst;
-#    assign m_axi4_arsize  =  s_axi4_arsize;
-#    assign m_axi4_arlen   =  s_axi4_arlen;
-#    assign m_axi4_araddr  =  l1_araddr_i;
-#    assign m_axi4_arid    =  s_axi4_arid;
-#
-#    assign l2_sending_o   = 1'b0;
-#    assign l2_available_q = 1'b0;
-#    assign l2_done_o      = 1'b0;
-#  end // else: !if(ENABLE_L2TLB == 1)
-# endgenerate
-#
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/axi4_aw_buffer.py b/src/unused/iommu/axi_rab/axi4_aw_buffer.py
deleted file mode 100644 (file)
index f5ca37d..0000000
+++ /dev/null
@@ -1,157 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_aw_buffer(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.s_axi4_awid = Signal(AXI_ID_WIDTH)  # input
-        self.s_axi4_awaddr = Signal(32)  # input
-        self.s_axi4_awvalid = Signal()  # input
-        self.s_axi4_awready = Signal()  # output
-        self.s_axi4_awlen = Signal(8)  # input
-        self.s_axi4_awsize = Signal(3)  # input
-        self.s_axi4_awburst = Signal(2)  # input
-        self.s_axi4_awlock = Signal()  # input
-        self.s_axi4_awprot = Signal(3)  # input
-        self.s_axi4_awcache = Signal(4)  # input
-        self.s_axi4_awregion = Signal(4)  # input
-        self.s_axi4_awqos = Signal(4)  # input
-        self.s_axi4_awuser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_awid = Signal(AXI_ID_WIDTH)  # output
-        self.m_axi4_awaddr = Signal(32)  # output
-        self.m_axi4_awvalid = Signal()  # output
-        self.m_axi4_awready = Signal()  # input
-        self.m_axi4_awlen = Signal(8)  # output
-        self.m_axi4_awsize = Signal(3)  # output
-        self.m_axi4_awburst = Signal(2)  # output
-        self.m_axi4_awlock = Signal()  # output
-        self.m_axi4_awprot = Signal(3)  # output
-        self.m_axi4_awcache = Signal(4)  # output
-        self.m_axi4_awregion = Signal(4)  # output
-        self.m_axi4_awqos = Signal(4)  # output
-        self.m_axi4_awuser = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.None.eq(self.s_axi4_awcache)
-        m.d.comb += self.None.eq(self.s_axi4_awprot)
-        m.d.comb += self.None.eq(self.s_axi4_awlock)
-        m.d.comb += self.None.eq(self.s_axi4_awburst)
-        m.d.comb += self.None.eq(self.s_axi4_awsize)
-        m.d.comb += self.None.eq(self.s_axi4_awlen)
-        m.d.comb += self.None.eq(self.s_axi4_awaddr)
-        m.d.comb += self.None.eq(self.s_axi4_awregion)
-        m.d.comb += self.None.eq(self.s_axi4_awqos)
-        m.d.comb += self.None.eq(self.s_axi4_awid)
-        m.d.comb += self.None.eq(self.s_axi4_awuser)
-        m.d.comb += self.m_axi4_awcache.eq(self.None)
-        m.d.comb += self.m_axi4_awprot.eq(self.None)
-        m.d.comb += self.m_axi4_awlock.eq(self.None)
-        m.d.comb += self.m_axi4_awburst.eq(self.None)
-        m.d.comb += self.m_axi4_awsize.eq(self.None)
-        m.d.comb += self.m_axi4_awlen.eq(self.None)
-        m.d.comb += self.m_axi4_awaddr.eq(self.None)
-        m.d.comb += self.m_axi4_awregion.eq(self.None)
-        m.d.comb += self.m_axi4_awqos.eq(self.None)
-        m.d.comb += self.m_axi4_awid.eq(self.None)
-        m.d.comb += self.m_axi4_awuser.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_aw_buffer
-#  #(
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_awid,
-#    input  logic               [31:0] s_axi4_awaddr,
-#    input  logic                      s_axi4_awvalid,
-#    output logic                      s_axi4_awready,
-#    input  logic                [7:0] s_axi4_awlen,
-#    input  logic                [2:0] s_axi4_awsize,
-#    input  logic                [1:0] s_axi4_awburst,
-#    input  logic                      s_axi4_awlock,
-#    input  logic                [2:0] s_axi4_awprot,
-#    input  logic                [3:0] s_axi4_awcache,
-#    input  logic                [3:0] s_axi4_awregion,
-#    input  logic                [3:0] s_axi4_awqos,
-#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_awid,
-#    output logic               [31:0] m_axi4_awaddr,
-#    output logic                      m_axi4_awvalid,
-#    input  logic                      m_axi4_awready,
-#    output logic                [7:0] m_axi4_awlen,
-#    output logic                [2:0] m_axi4_awsize,
-#    output logic                [1:0] m_axi4_awburst,
-#    output logic                      m_axi4_awlock,
-#    output logic                [2:0] m_axi4_awprot,
-#    output logic                [3:0] m_axi4_awcache,
-#    output logic                [3:0] m_axi4_awregion,
-#    output logic                [3:0] m_axi4_awqos,
-#    output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
-#  );
-#
-#  wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_in;
-#  wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_out;
-#
-#  assign data_in                                            [3:0] = s_axi4_awcache;
-#  assign data_in                                            [6:4] = s_axi4_awprot;
-#  assign data_in                                              [7] = s_axi4_awlock;
-#  assign data_in                                            [9:8] = s_axi4_awburst;
-#  assign data_in                                          [12:10] = s_axi4_awsize;
-#  assign data_in                                          [20:13] = s_axi4_awlen;
-#  assign data_in                                          [52:21] = s_axi4_awaddr;
-#  assign data_in                                          [56:53] = s_axi4_awregion;
-#  assign data_in                                          [60:57] = s_axi4_awqos;
-#  assign data_in                             [60+AXI_ID_WIDTH:61] = s_axi4_awid;
-#  assign data_in [60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH] = s_axi4_awuser;
-#
-#  assign m_axi4_awcache  = data_out[3:0];
-#  assign m_axi4_awprot   = data_out[6:4];
-#  assign m_axi4_awlock   = data_out[7];
-#  assign m_axi4_awburst  = data_out[9:8];
-#  assign m_axi4_awsize   = data_out[12:10];
-#  assign m_axi4_awlen    = data_out[20:13];
-#  assign m_axi4_awaddr   = data_out[52:21];
-#  assign m_axi4_awregion = data_out[56:53];
-#  assign m_axi4_awqos    = data_out[60:57];
-#  assign m_axi4_awid     = data_out[60+AXI_ID_WIDTH:61];
-#  assign m_axi4_awuser   = data_out[60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH];
-#
-#  axi_buffer_rab
-#    #(
-#      .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+61  ),
-#      .BUFFER_DEPTH ( 4                               )
-#    )
-#    u_buffer
-#    (
-#      .clk       ( axi4_aclk      ),
-#      .rstn      ( axi4_arstn     ),
-#      .valid_out ( m_axi4_awvalid ),
-#      .data_out  ( data_out       ),
-#      .ready_in  ( m_axi4_awready ),
-#      .valid_in  ( s_axi4_awvalid ),
-#      .data_in   ( data_in        ),
-#      .ready_out ( s_axi4_awready )
-#    );
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/axi4_aw_sender.py b/src/unused/iommu/axi_rab/axi4_aw_sender.py
deleted file mode 100644 (file)
index fbc917d..0000000
+++ /dev/null
@@ -1,252 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_aw_sender(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.l1_done_o = Signal()  # output
-        self.l1_accept_i = Signal()  # input
-        self.l1_drop_i = Signal()  # input
-        self.l1_save_i = Signal()  # input
-        self.l2_done_o = Signal()  # output
-        self.l2_accept_i = Signal()  # input
-        self.l2_drop_i = Signal()  # input
-        self.l2_sending_o = Signal()  # output
-        self.l1_awaddr_i = Signal(AXI_ADDR_WIDTH)  # input
-        self.l2_awaddr_i = Signal(AXI_ADDR_WIDTH)  # input
-        self.s_axi4_awid = Signal(AXI_ID_WIDTH)  # input
-        self.s_axi4_awvalid = Signal()  # input
-        self.s_axi4_awready = Signal()  # output
-        self.s_axi4_awlen = Signal(8)  # input
-        self.s_axi4_awsize = Signal(3)  # input
-        self.s_axi4_awburst = Signal(2)  # input
-        self.s_axi4_awlock = Signal()  # input
-        self.s_axi4_awprot = Signal(3)  # input
-        self.s_axi4_awcache = Signal(4)  # input
-        self.s_axi4_awregion = Signal(4)  # input
-        self.s_axi4_awqos = Signal(4)  # input
-        self.s_axi4_awuser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_awid = Signal(AXI_ID_WIDTH)  # output
-        self.m_axi4_awaddr = Signal(AXI_ADDR_WIDTH)  # output
-        self.m_axi4_awvalid = Signal()  # output
-        self.m_axi4_awready = Signal()  # input
-        self.m_axi4_awlen = Signal(8)  # output
-        self.m_axi4_awsize = Signal(3)  # output
-        self.m_axi4_awburst = Signal(2)  # output
-        self.m_axi4_awlock = Signal()  # output
-        self.m_axi4_awprot = Signal(3)  # output
-        self.m_axi4_awcache = Signal(4)  # output
-        self.m_axi4_awregion = Signal(4)  # output
-        self.m_axi4_awqos = Signal(4)  # output
-        self.m_axi4_awuser = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.l1_save.eq(self.None)
-        m.d.comb += self.l1_done_o.eq(self.None)
-        m.d.comb += self.m_axi4_awvalid.eq(self.None)
-        m.d.comb += self.s_axi4_awready.eq(self.None)
-        m.d.comb += self.m_axi4_awuser.eq(self.None)
-        m.d.comb += self.m_axi4_awcache.eq(self.None)
-        m.d.comb += self.m_axi4_awregion.eq(self.None)
-        m.d.comb += self.m_axi4_awqos.eq(self.None)
-        m.d.comb += self.m_axi4_awprot.eq(self.None)
-        m.d.comb += self.m_axi4_awlock.eq(self.None)
-        m.d.comb += self.m_axi4_awburst.eq(self.None)
-        m.d.comb += self.m_axi4_awsize.eq(self.None)
-        m.d.comb += self.m_axi4_awlen.eq(self.None)
-        m.d.comb += self.m_axi4_awaddr.eq(self.None)
-        m.d.comb += self.m_axi4_awid.eq(self.None)
-        m.d.comb += self.l2_sending_o.eq(self.None)
-        m.d.comb += self.l2_sent.eq(self.None)
-        m.d.comb += self.l2_done_o.eq(self.None)
-        m.d.comb += self.m_axi4_awuser.eq(self.s_axi4_awuser)
-        m.d.comb += self.m_axi4_awcache.eq(self.s_axi4_awcache)
-        m.d.comb += self.m_axi4_awregion.eq(self.s_axi4_awregion)
-        m.d.comb += self.m_axi4_awqos.eq(self.s_axi4_awqos)
-        m.d.comb += self.m_axi4_awprot.eq(self.s_axi4_awprot)
-        m.d.comb += self.m_axi4_awlock.eq(self.s_axi4_awlock)
-        m.d.comb += self.m_axi4_awburst.eq(self.s_axi4_awburst)
-        m.d.comb += self.m_axi4_awsize.eq(self.s_axi4_awsize)
-        m.d.comb += self.m_axi4_awlen.eq(self.s_axi4_awlen)
-        m.d.comb += self.m_axi4_awaddr.eq(self.l1_awaddr_i)
-        m.d.comb += self.m_axi4_awid.eq(self.s_axi4_awid)
-        m.d.comb += self.l2_sending_o.eq(self.1: 'b0)
-        m.d.comb += self.l2_available_q.eq(self.1: 'b0)
-        m.d.comb += self.l2_done_o.eq(self.1: 'b0)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_aw_sender
-#  #(
-#    parameter AXI_ADDR_WIDTH   = 40,
-#    parameter AXI_ID_WIDTH     = 4,
-#    parameter AXI_USER_WIDTH   = 4,
-#    parameter ENABLE_L2TLB     = 0
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    output logic                      l1_done_o,
-#    input  logic                      l1_accept_i,
-#    input  logic                      l1_drop_i,
-#    input  logic                      l1_save_i,
-#
-#    output logic                      l2_done_o,
-#    input  logic                      l2_accept_i,
-#    input  logic                      l2_drop_i,
-#    output logic                      l2_sending_o,
-#
-#    input  logic [AXI_ADDR_WIDTH-1:0] l1_awaddr_i,
-#    input  logic [AXI_ADDR_WIDTH-1:0] l2_awaddr_i,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_awid,
-#    input  logic                      s_axi4_awvalid,
-#    output logic                      s_axi4_awready,
-#    input  logic                [7:0] s_axi4_awlen,
-#    input  logic                [2:0] s_axi4_awsize,
-#    input  logic                [1:0] s_axi4_awburst,
-#    input  logic                      s_axi4_awlock,
-#    input  logic                [2:0] s_axi4_awprot,
-#    input  logic                [3:0] s_axi4_awcache,
-#    input  logic                [3:0] s_axi4_awregion,
-#    input  logic                [3:0] s_axi4_awqos,
-#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_awid,
-#    output logic [AXI_ADDR_WIDTH-1:0] m_axi4_awaddr,
-#    output logic                      m_axi4_awvalid,
-#    input  logic                      m_axi4_awready,
-#    output logic                [7:0] m_axi4_awlen,
-#    output logic                [2:0] m_axi4_awsize,
-#    output logic                [1:0] m_axi4_awburst,
-#    output logic                      m_axi4_awlock,
-#    output logic                [2:0] m_axi4_awprot,
-#    output logic                [3:0] m_axi4_awcache,
-#    output logic                [3:0] m_axi4_awregion,
-#    output logic                [3:0] m_axi4_awqos,
-#    output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
-#  );
-#
-#  logic l1_save;
-#
-#  logic l2_sent;
-#  logic l2_available_q;
-#
-#  assign l1_save      = l1_save_i & l2_available_q;
-#
-#  assign l1_done_o    = s_axi4_awvalid & s_axi4_awready ;
-#
-#  // if 1: accept and forward a transaction translated by L1
-#  //    2: drop or save request (if L2 slot not occupied already)
-#  assign m_axi4_awvalid = (s_axi4_awvalid & l1_accept_i) |
-#                          l2_sending_o;
-#  assign s_axi4_awready = (m_axi4_awvalid & m_axi4_awready & ~l2_sending_o) |
-#                          (s_axi4_awvalid & (l1_drop_i | l1_save));
-#
-# generate
-#  if (ENABLE_L2TLB    == 1) begin
-#    logic [AXI_USER_WIDTH-1:0] l2_axi4_awuser  ;
-#    logic                [3:0] l2_axi4_awcache ;
-#    logic                [3:0] l2_axi4_awregion;
-#    logic                [3:0] l2_axi4_awqos   ;
-#    logic                [2:0] l2_axi4_awprot  ;
-#    logic                      l2_axi4_awlock  ;
-#    logic                [1:0] l2_axi4_awburst ;
-#    logic                [2:0] l2_axi4_awsize  ;
-#    logic                [7:0] l2_axi4_awlen   ;
-#    logic   [AXI_ID_WIDTH-1:0] l2_axi4_awid    ;
-#
-#    assign m_axi4_awuser   = l2_sending_o ? l2_axi4_awuser   : s_axi4_awuser;
-#    assign m_axi4_awcache  = l2_sending_o ? l2_axi4_awcache  : s_axi4_awcache;
-#    assign m_axi4_awregion = l2_sending_o ? l2_axi4_awregion : s_axi4_awregion;
-#    assign m_axi4_awqos    = l2_sending_o ? l2_axi4_awqos    : s_axi4_awqos;
-#    assign m_axi4_awprot   = l2_sending_o ? l2_axi4_awprot   : s_axi4_awprot;
-#    assign m_axi4_awlock   = l2_sending_o ? l2_axi4_awlock   : s_axi4_awlock;
-#    assign m_axi4_awburst  = l2_sending_o ? l2_axi4_awburst  : s_axi4_awburst;
-#    assign m_axi4_awsize   = l2_sending_o ? l2_axi4_awsize   : s_axi4_awsize;
-#    assign m_axi4_awlen    = l2_sending_o ? l2_axi4_awlen    : s_axi4_awlen;
-#    assign m_axi4_awaddr   = l2_sending_o ? l2_awaddr_i      : l1_awaddr_i;
-#    assign m_axi4_awid     = l2_sending_o ? l2_axi4_awid     : s_axi4_awid;
-#
-#    // buffer AXI signals in case of L1 miss
-#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
-#      if (axi4_arstn == 1'b0) begin
-#        l2_axi4_awuser   <=  'b0;
-#        l2_axi4_awcache  <=  'b0;
-#        l2_axi4_awregion <=  'b0;
-#        l2_axi4_awqos    <=  'b0;
-#        l2_axi4_awprot   <=  'b0;
-#        l2_axi4_awlock   <= 1'b0;
-#        l2_axi4_awburst  <=  'b0;
-#        l2_axi4_awsize   <=  'b0;
-#        l2_axi4_awlen    <=  'b0;
-#        l2_axi4_awid     <=  'b0;
-#      end else if (l1_save) begin
-#        l2_axi4_awuser   <= s_axi4_awuser;
-#        l2_axi4_awcache  <= s_axi4_awcache;
-#        l2_axi4_awregion <= s_axi4_awregion;
-#        l2_axi4_awqos    <= s_axi4_awqos;
-#        l2_axi4_awprot   <= s_axi4_awprot;
-#        l2_axi4_awlock   <= s_axi4_awlock;
-#        l2_axi4_awburst  <= s_axi4_awburst;
-#        l2_axi4_awsize   <= s_axi4_awsize;
-#        l2_axi4_awlen    <= s_axi4_awlen;
-#        l2_axi4_awid     <= s_axi4_awid;
-#      end
-#    end
-#
-#    // signal that an l1_save_i can be accepted
-#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
-#      if (axi4_arstn == 1'b0) begin
-#        l2_available_q <= 1'b1;
-#      end else if (l2_sent | l2_drop_i) begin
-#        l2_available_q <= 1'b1;
-#      end else if (l1_save) begin
-#        l2_available_q <= 1'b0;
-#      end
-#    end
-#
-#    assign l2_sending_o = l2_accept_i & ~l2_available_q;
-#    assign l2_sent      = l2_sending_o & m_axi4_awvalid & m_axi4_awready;
-#
-#    // if 1: having sent out a transaction translated by L2
-#    //    2: drop request (L2 slot is available again)
-#    assign l2_done_o    = l2_sent | l2_drop_i;
-#
-#  end else begin // !`ifdef ENABLE_L2TLB
-#    assign m_axi4_awuser   =  s_axi4_awuser;
-#    assign m_axi4_awcache  =  s_axi4_awcache;
-#    assign m_axi4_awregion =  s_axi4_awregion;
-#    assign m_axi4_awqos    =  s_axi4_awqos;
-#    assign m_axi4_awprot   =  s_axi4_awprot;
-#    assign m_axi4_awlock   =  s_axi4_awlock;
-#    assign m_axi4_awburst  =  s_axi4_awburst;
-#    assign m_axi4_awsize   =  s_axi4_awsize;
-#    assign m_axi4_awlen    =  s_axi4_awlen;
-#    assign m_axi4_awaddr   =  l1_awaddr_i;
-#    assign m_axi4_awid     =  s_axi4_awid;
-#
-#    assign l2_sending_o    = 1'b0;
-#    assign l2_available_q  = 1'b0;
-#    assign l2_done_o       = 1'b0;
-#  end // !`ifdef ENABLE_L2TLB
-# endgenerate
-#
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/axi4_b_buffer.py b/src/unused/iommu/axi_rab/axi4_b_buffer.py
deleted file mode 100644 (file)
index 42fce1a..0000000
+++ /dev/null
@@ -1,94 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_b_buffer(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.s_axi4_bid = Signal(AXI_ID_WIDTH)  # output
-        self.s_axi4_bresp = Signal(2)  # output
-        self.s_axi4_bvalid = Signal()  # output
-        self.s_axi4_buser = Signal(AXI_USER_WIDTH)  # output
-        self.s_axi4_bready = Signal()  # input
-        self.m_axi4_bid = Signal(AXI_ID_WIDTH)  # input
-        self.m_axi4_bresp = Signal(2)  # input
-        self.m_axi4_bvalid = Signal()  # input
-        self.m_axi4_buser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_bready = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.None.eq(self.m_axi4_bresp)
-        m.d.comb += self.None.eq(self.m_axi4_bid)
-        m.d.comb += self.None.eq(self.m_axi4_buser)
-        m.d.comb += self.s_axi4_buser.eq(self.None)
-        m.d.comb += self.s_axi4_bid.eq(self.None)
-        m.d.comb += self.s_axi4_bresp.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_b_buffer
-#  #(
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_bid,
-#    output logic                [1:0] s_axi4_bresp,
-#    output logic                      s_axi4_bvalid,
-#    output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
-#    input  logic                      s_axi4_bready,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_bid,
-#    input  logic                [1:0] m_axi4_bresp,
-#    input  logic                      m_axi4_bvalid,
-#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
-#    output logic                      m_axi4_bready
-#  );
-#
-#  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_in;
-#  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_out;
-#
-#  assign data_in                                         [1:0] = m_axi4_bresp;
-#  assign data_in                            [AXI_ID_WIDTH+1:2] = m_axi4_bid;
-#  assign data_in[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2] = m_axi4_buser;
-#
-#  assign s_axi4_buser = data_out[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2];
-#  assign s_axi4_bid   = data_out[AXI_ID_WIDTH+1:2];
-#  assign s_axi4_bresp = data_out[1:0];
-#
-#  axi_buffer_rab
-#  #(
-#    .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+2 ),
-#    .BUFFER_DEPTH ( 4                             )
-#    )
-#  u_buffer
-#  (
-#    .clk      ( axi4_aclk     ),
-#    .rstn     ( axi4_arstn    ),
-#    .valid_out( s_axi4_bvalid ),
-#    .data_out ( data_out      ),
-#    .ready_in ( s_axi4_bready ),
-#    .valid_in ( m_axi4_bvalid ),
-#    .data_in  ( data_in       ),
-#    .ready_out( m_axi4_bready )
-#  );
-#
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/axi4_b_sender.py b/src/unused/iommu/axi_rab/axi4_b_sender.py
deleted file mode 100644 (file)
index 1c61a2a..0000000
+++ /dev/null
@@ -1,136 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_b_sender(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.drop_i = Signal()  # input
-        self.done_o = Signal()  # output
-        self.id_i = Signal(AXI_ID_WIDTH)  # input
-        self.prefetch_i = Signal()  # input
-        self.hit_i = Signal()  # input
-        self.s_axi4_bid = Signal(AXI_ID_WIDTH)  # output
-        self.s_axi4_bresp = Signal(2)  # output
-        self.s_axi4_bvalid = Signal()  # output
-        self.s_axi4_buser = Signal(AXI_USER_WIDTH)  # output
-        self.s_axi4_bready = Signal()  # input
-        self.m_axi4_bid = Signal(AXI_ID_WIDTH)  # input
-        self.m_axi4_bresp = Signal(2)  # input
-        self.m_axi4_bvalid = Signal()  # input
-        self.m_axi4_buser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_bready = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.fifo_push.eq(self.None)
-        m.d.comb += self.done_o.eq(self.fifo_push)
-        m.d.comb += self.fifo_pop.eq(self.None)
-        m.d.comb += self.s_axi4_buser.eq(self.None)
-        m.d.comb += self.s_axi4_bid.eq(self.None)
-        m.d.comb += self.s_axi4_bresp.eq(self.None)
-        m.d.comb += self.s_axi4_bvalid.eq(self.None)
-        m.d.comb += self.m_axi4_bready.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_b_sender
-#  #(
-#    parameter AXI_ID_WIDTH   = 10,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    input  logic                      drop_i,
-#    output logic                      done_o,
-#    input  logic   [AXI_ID_WIDTH-1:0] id_i,
-#    input  logic                      prefetch_i,
-#    input  logic                      hit_i,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_bid,
-#    output logic                [1:0] s_axi4_bresp,
-#    output logic                      s_axi4_bvalid,
-#    output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
-#    input  logic                      s_axi4_bready,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_bid,
-#    input  logic                [1:0] m_axi4_bresp,
-#    input  logic                      m_axi4_bvalid,
-#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
-#    output logic                      m_axi4_bready
-#  );
-#
-#  logic                    fifo_valid;
-#  logic                    fifo_pop;
-#  logic                    fifo_push;
-#  logic                    fifo_ready;
-#  logic [AXI_ID_WIDTH-1:0] id;
-#  logic                    prefetch;
-#  logic                    hit;
-#
-#  logic                    dropping;
-#
-#  axi_buffer_rab
-#    #(
-#      .DATA_WIDTH   ( 2+AXI_ID_WIDTH  ),
-#      .BUFFER_DEPTH ( 4               )
-#      )
-#    u_fifo
-#      (
-#        .clk       ( axi4_aclk                 ),
-#        .rstn      ( axi4_arstn                ),
-#        // Pop
-#        .data_out  ( {prefetch,   hit,   id}   ),
-#        .valid_out ( fifo_valid                ),
-#        .ready_in  ( fifo_pop                  ),
-#        // Push
-#        .valid_in  ( fifo_push                 ),
-#        .data_in   ( {prefetch_i, hit_i, id_i} ),
-#        .ready_out ( fifo_ready                )
-#      );
-#
-#  assign fifo_push = drop_i & fifo_ready;
-#  assign done_o    = fifo_push;
-#
-#  assign fifo_pop  = dropping & s_axi4_bready;
-#
-#  always @ (posedge axi4_aclk or negedge axi4_arstn) begin
-#    if (axi4_arstn == 1'b0) begin
-#      dropping <= 1'b0;
-#    end else begin
-#      if (fifo_valid && ~dropping)
-#        dropping <= 1'b1;
-#      else if (fifo_pop)
-#        dropping <= 1'b0;
-#    end
-#  end
-#
-#  assign s_axi4_buser  = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_buser;
-#  assign s_axi4_bid    = dropping ? id : m_axi4_bid;
-#
-#  assign s_axi4_bresp  = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, mutli, prot
-#                         (dropping & prefetch      ) ? 2'b10 : // prefetch miss
-#                         (dropping            & hit) ? 2'b10 : // non-prefetch multi, prot
-#                         (dropping                 ) ? 2'b10 : // non-prefetch miss
-#                         m_axi4_bresp;
-#
-#  assign s_axi4_bvalid =  dropping | m_axi4_bvalid;
-#  assign m_axi4_bready = ~dropping & s_axi4_bready;
-#
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/axi4_r_buffer.py b/src/unused/iommu/axi_rab/axi4_r_buffer.py
deleted file mode 100644 (file)
index 91bdf0a..0000000
+++ /dev/null
@@ -1,120 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_r_buffer(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.s_axi4_rid = Signal(AXI_ID_WIDTH)  # output
-        self.s_axi4_rresp = Signal(2)  # output
-        self.s_axi4_rdata = Signal(AXI_DATA_WIDTH)  # output
-        self.s_axi4_rlast = Signal()  # output
-        self.s_axi4_rvalid = Signal()  # output
-        self.s_axi4_ruser = Signal(AXI_USER_WIDTH)  # output
-        self.s_axi4_rready = Signal()  # input
-        self.m_axi4_rid = Signal(AXI_ID_WIDTH)  # input
-        self.m_axi4_rresp = Signal(2)  # input
-        self.m_axi4_rdata = Signal(AXI_DATA_WIDTH)  # input
-        self.m_axi4_rlast = Signal()  # input
-        self.m_axi4_rvalid = Signal()  # input
-        self.m_axi4_ruser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_rready = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.None.eq(self.m_axi4_rresp)
-        m.d.comb += self.None.eq(self.m_axi4_rlast)
-        m.d.comb += self.None.eq(self.m_axi4_rid)
-        m.d.comb += self.None.eq(self.m_axi4_rdata)
-        m.d.comb += self.None.eq(self.m_axi4_ruser)
-        m.d.comb += self.s_axi4_rresp.eq(self.None)
-        m.d.comb += self.s_axi4_rlast.eq(self.None)
-        m.d.comb += self.s_axi4_rid.eq(self.None)
-        m.d.comb += self.s_axi4_rdata.eq(self.None)
-        m.d.comb += self.s_axi4_ruser.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_r_buffer
-#  #(
-#    parameter AXI_DATA_WIDTH = 32,
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_rid,
-#    output logic                [1:0] s_axi4_rresp,
-#    output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
-#    output logic                      s_axi4_rlast,
-#    output logic                      s_axi4_rvalid,
-#    output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
-#    input  logic                      s_axi4_rready,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_rid,
-#    input  logic                [1:0] m_axi4_rresp,
-#    input  logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
-#    input  logic                      m_axi4_rlast,
-#    input  logic                      m_axi4_rvalid,
-#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
-#    output logic                      m_axi4_rready
-#  );
-#
-#  wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_in;
-#  wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_out;
-#
-#  localparam ID_START   = 3;
-#  localparam ID_END     = AXI_ID_WIDTH-1 + ID_START;
-#  localparam DATA_START = ID_END + 1;
-#  localparam DATA_END   = AXI_DATA_WIDTH-1 + DATA_START;
-#  localparam USER_START = DATA_END + 1;
-#  localparam USER_END   = AXI_USER_WIDTH-1 + USER_START;
-#
-#  assign data_in                [1:0] = m_axi4_rresp;
-#  assign data_in                  [2] = m_axi4_rlast;
-#  assign data_in    [ID_END:ID_START] = m_axi4_rid;
-#  assign data_in[DATA_END:DATA_START] = m_axi4_rdata;
-#  assign data_in[USER_END:USER_START] = m_axi4_ruser;
-#
-#  assign s_axi4_rresp  = data_out                [1:0];
-#  assign s_axi4_rlast  = data_out                  [2];
-#  assign s_axi4_rid    = data_out    [ID_END:ID_START];
-#  assign s_axi4_rdata  = data_out[DATA_END:DATA_START];
-#  assign s_axi4_ruser  = data_out[USER_END:USER_START];
-#
-#  axi_buffer_rab
-#  #(
-#    .DATA_WIDTH   ( AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3  ),
-#    .BUFFER_DEPTH ( 4                                             )
-#    )
-#  u_buffer
-#  (
-#    .clk       ( axi4_aclk     ),
-#    .rstn      ( axi4_arstn    ),
-#    // Pop
-#    .valid_out ( s_axi4_rvalid ),
-#    .data_out  ( data_out      ),
-#    .ready_in  ( s_axi4_rready ),
-#    // Push
-#    .valid_in  ( m_axi4_rvalid ),
-#    .data_in   ( data_in       ),
-#    .ready_out ( m_axi4_rready )
-#  );
-#
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/axi4_r_sender.py b/src/unused/iommu/axi_rab/axi4_r_sender.py
deleted file mode 100644 (file)
index d4e22bb..0000000
+++ /dev/null
@@ -1,206 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_r_sender(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.drop_i = Signal()  # input
-        self.drop_len_i = Signal(8)  # input
-        self.done_o = Signal()  # output
-        self.id_i = Signal(AXI_ID_WIDTH)  # input
-        self.prefetch_i = Signal()  # input
-        self.hit_i = Signal()  # input
-        self.s_axi4_rid = Signal(AXI_ID_WIDTH)  # output
-        self.s_axi4_rresp = Signal(2)  # output
-        self.s_axi4_rdata = Signal(AXI_DATA_WIDTH)  # output
-        self.s_axi4_rlast = Signal()  # output
-        self.s_axi4_rvalid = Signal()  # output
-        self.s_axi4_ruser = Signal(AXI_USER_WIDTH)  # output
-        self.s_axi4_rready = Signal()  # input
-        self.m_axi4_rid = Signal(AXI_ID_WIDTH)  # input
-        self.m_axi4_rresp = Signal(2)  # input
-        self.m_axi4_rdata = Signal(AXI_DATA_WIDTH)  # input
-        self.m_axi4_rlast = Signal()  # input
-        self.m_axi4_rvalid = Signal()  # input
-        self.m_axi4_ruser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_rready = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.fifo_push.eq(self.None)
-        m.d.comb += self.done_o.eq(self.fifo_push)
-        m.d.comb += self.s_axi4_rdata.eq(self.m_axi4_rdata)
-        m.d.comb += self.s_axi4_ruser.eq(self.None)
-        m.d.comb += self.s_axi4_rid.eq(self.None)
-        m.d.comb += self.s_axi4_rresp.eq(self.None)
-        m.d.comb += self.s_axi4_rvalid.eq(self.None)
-        m.d.comb += self.m_axi4_rready.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //import CfMath::log2;
-#
-# module axi4_r_sender
-#  #(
-#    parameter AXI_DATA_WIDTH = 32,
-#    parameter AXI_ID_WIDTH   = 4,
-#    parameter AXI_USER_WIDTH = 4
-#  )
-#  (
-#    input  logic                      axi4_aclk,
-#    input  logic                      axi4_arstn,
-#
-#    input  logic                      drop_i,
-#    input  logic                [7:0] drop_len_i,
-#    output logic                      done_o,
-#    input  logic   [AXI_ID_WIDTH-1:0] id_i,
-#    input  logic                      prefetch_i,
-#    input  logic                      hit_i,
-#
-#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_rid,
-#    output logic                [1:0] s_axi4_rresp,
-#    output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
-#    output logic                      s_axi4_rlast,
-#    output logic                      s_axi4_rvalid,
-#    output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
-#    input  logic                      s_axi4_rready,
-#
-#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_rid,
-#    input  logic                [1:0] m_axi4_rresp,
-#    input  logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
-#    input  logic                      m_axi4_rlast,
-#    input  logic                      m_axi4_rvalid,
-#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
-#    output logic                      m_axi4_rready
-#  );
-#
-#  localparam BUFFER_DEPTH = 16;
-#
-#  logic                    fifo_valid;
-#  logic                    fifo_pop;
-#  logic                    fifo_push;
-#  logic                    fifo_ready;
-#  logic [AXI_ID_WIDTH-1:0] id;
-#  logic              [7:0] len;
-#  logic                    prefetch;
-#  logic                    hit;
-#
-#  logic                    dropping;
-#
-#  enum logic [1:0]  { FORWARDING, DROPPING }
-#                            state_d,                state_q;
-#  logic                     burst_ongoing_d,        burst_ongoing_q;
-#  logic [7:0]               drop_cnt_d,             drop_cnt_q;
-#
-#  axi_buffer_rab
-#    #(
-#      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8  ),
-#      .BUFFER_DEPTH     ( BUFFER_DEPTH      )
-#      )
-#    u_fifo
-#      (
-#        .clk       ( axi4_aclk                              ),
-#        .rstn      ( axi4_arstn                             ),
-#        // Pop
-#        .data_out  ( {prefetch,   hit,   id,   len}         ),
-#        .valid_out ( fifo_valid                             ),
-#        .ready_in  ( fifo_pop                               ),
-#        // Push
-#        .valid_in  ( fifo_push                              ),
-#        .data_in   ( {prefetch_i, hit_i, id_i, drop_len_i}  ),
-#        .ready_out ( fifo_ready                             )
-#      );
-#
-#  assign fifo_push = drop_i & fifo_ready;
-#  assign done_o    = fifo_push;
-#
-#  always_comb begin
-#    burst_ongoing_d = burst_ongoing_q;
-#    drop_cnt_d      = drop_cnt_q;
-#    dropping        = 1'b0;
-#    s_axi4_rlast    = 1'b0;
-#    fifo_pop        = 1'b0;
-#    state_d         = state_q;
-#
-#    case (state_q)
-#      FORWARDING: begin
-#        s_axi4_rlast = m_axi4_rlast;
-#        // Remember whether there is currently a burst ongoing.
-#        if (m_axi4_rvalid && m_axi4_rready) begin
-#          if (m_axi4_rlast) begin
-#            burst_ongoing_d = 1'b0;
-#          end else begin
-#            burst_ongoing_d = 1'b1;
-#          end
-#        end
-#        // If there is no burst ongoing and the FIFO has a drop request ready, process it.
-#        if (!burst_ongoing_d && fifo_valid) begin
-#          drop_cnt_d  = len;
-#          state_d     = DROPPING;
-#        end
-#      end
-#
-#      DROPPING: begin
-#        dropping      = 1'b1;
-#        s_axi4_rlast  = (drop_cnt_q == '0);
-#        // Handshake on slave interface
-#        if (s_axi4_rready) begin
-#          drop_cnt_d -= 1;
-#          if (drop_cnt_q == '0) begin
-#            drop_cnt_d  = '0;
-#            fifo_pop    = 1'b1;
-#            state_d     = FORWARDING;
-#          end
-#        end
-#      end
-#
-#      default: begin
-#        state_d = FORWARDING;
-#      end
-#    endcase
-#  end
-#
-#  assign s_axi4_rdata  = m_axi4_rdata;
-#
-#  assign s_axi4_ruser  = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_ruser;
-#  assign s_axi4_rid    = dropping ? id : m_axi4_rid;
-#
-#  assign s_axi4_rresp  = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, mutli, prot
-#                         (dropping & prefetch      ) ? 2'b10 : // prefetch miss
-#                         (dropping            & hit) ? 2'b10 : // non-prefetch multi, prot
-#                         (dropping                 ) ? 2'b10 : // non-prefetch miss
-#                         m_axi4_rresp;
-#
-#  assign s_axi4_rvalid =  dropping | m_axi4_rvalid;
-#  assign m_axi4_rready = ~dropping & s_axi4_rready;
-#
-#  always_ff @(posedge axi4_aclk, negedge axi4_arstn) begin
-#    if (axi4_arstn == 1'b0) begin
-#      burst_ongoing_q <= 1'b0;
-#      drop_cnt_q      <=  'b0;
-#      state_q         <= FORWARDING;
-#    end else begin
-#      burst_ongoing_q <= burst_ongoing_d;
-#      drop_cnt_q      <= drop_cnt_d;
-#      state_q         <= state_d;
-#    end
-#  end
-#
-# endmodule
-#
-#
-#
-#
diff --git a/src/unused/iommu/axi_rab/axi4_w_buffer.py b/src/unused/iommu/axi_rab/axi4_w_buffer.py
deleted file mode 100644 (file)
index aa06dc2..0000000
+++ /dev/null
@@ -1,777 +0,0 @@
-# this file has been generated by sv2nmigen
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_w_buffer(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.l1_done_o = Signal()  # output
-        self.l1_accept_i = Signal()  # input
-        self.l1_save_i = Signal()  # input
-        self.l1_drop_i = Signal()  # input
-        self.l1_master_i = Signal()  # input
-        self.l1_id_i = Signal(AXI_ID_WIDTH)  # input
-        self.l1_len_i = Signal(8)  # input
-        self.l1_prefetch_i = Signal()  # input
-        self.l1_hit_i = Signal()  # input
-        self.l2_done_o = Signal()  # output
-        self.l2_accept_i = Signal()  # input
-        self.l2_drop_i = Signal()  # input
-        self.l2_master_i = Signal()  # input
-        self.l2_id_i = Signal(AXI_ID_WIDTH)  # input
-        self.l2_len_i = Signal(8)  # input
-        self.l2_prefetch_i = Signal()  # input
-        self.l2_hit_i = Signal()  # input
-        self.master_select_o = Signal()  # output
-        self.input_stall_o = Signal()  # output
-        self.output_stall_o = Signal()  # output
-        self.b_drop_o = Signal()  # output
-        self.b_done_i = Signal()  # input
-        self.id_o = Signal(AXI_ID_WIDTH)  # output
-        self.prefetch_o = Signal()  # output
-        self.hit_o = Signal()  # output
-        self.s_axi4_wdata = Signal(AXI_DATA_WIDTH)  # input
-        self.s_axi4_wvalid = Signal()  # input
-        self.s_axi4_wready = Signal()  # output
-        self.s_axi4_wstrb = Signal(1+ERROR p_expression_25)  # input
-        self.s_axi4_wlast = Signal()  # input
-        self.s_axi4_wuser = Signal(AXI_USER_WIDTH)  # input
-        self.m_axi4_wdata = Signal(AXI_DATA_WIDTH)  # output
-        self.m_axi4_wvalid = Signal()  # output
-        self.m_axi4_wready = Signal()  # input
-        self.m_axi4_wstrb = Signal(1+ERROR p_expression_25)  # output
-        self.m_axi4_wlast = Signal()  # output
-        self.m_axi4_wuser = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-#
-# //import CfMath::log2;
-#
-# module axi4_w_buffer
-#  #(
-#    parameter AXI_DATA_WIDTH   = 32,
-#    parameter AXI_ID_WIDTH     = 4,
-#    parameter AXI_USER_WIDTH   = 4,
-#    parameter ENABLE_L2TLB     = 0,
-#    parameter HUM_BUFFER_DEPTH = 16
-#  )
-#  (
-#    input  logic                        axi4_aclk,
-#    input  logic                        axi4_arstn,
-#
-#    // L1 & L2 interfaces
-#    output logic                        l1_done_o,
-#    input  logic                        l1_accept_i,
-#    input  logic                        l1_save_i,
-#    input  logic                        l1_drop_i,
-#    input  logic                        l1_master_i,
-#    input  logic     [AXI_ID_WIDTH-1:0] l1_id_i,
-#    input  logic                  [7:0] l1_len_i,
-#    input  logic                        l1_prefetch_i,
-#    input  logic                        l1_hit_i,
-#
-#    output logic                        l2_done_o,
-#    input  logic                        l2_accept_i,
-#    input  logic                        l2_drop_i,
-#    input  logic                        l2_master_i,
-#    input  logic     [AXI_ID_WIDTH-1:0] l2_id_i,
-#    input  logic                  [7:0] l2_len_i,
-#    input  logic                        l2_prefetch_i,
-#    input  logic                        l2_hit_i,
-#
-#    output logic                        master_select_o,
-#    output logic                        input_stall_o,
-#    output logic                        output_stall_o,
-#
-#    // B sender interface
-#    output logic                        b_drop_o,
-#    input  logic                        b_done_i,
-#    output logic     [AXI_ID_WIDTH-1:0] id_o,
-#    output logic                        prefetch_o,
-#    output logic                        hit_o,
-#
-#    // AXI W channel interfaces
-#    input  logic   [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
-#    input  logic                        s_axi4_wvalid,
-#    output logic                        s_axi4_wready,
-#    input  logic [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
-#    input  logic                        s_axi4_wlast,
-#    input  logic   [AXI_USER_WIDTH-1:0] s_axi4_wuser,
-#
-#    output logic   [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
-#    output logic                        m_axi4_wvalid,
-#    input  logic                        m_axi4_wready,
-#    output logic [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
-#    output logic                        m_axi4_wlast,
-#    output logic   [AXI_USER_WIDTH-1:0] m_axi4_wuser
-#  );
-#
-"""
-
-  localparam BUFFER_WIDTH  = AXI_DATA_WIDTH+AXI_USER_WIDTH+AXI_DATA_WIDTH/8+1;
-
-  localparam INPUT_BUFFER_DEPTH = 4;
-  localparam L1_FIFO_DEPTH      = 8;
-  localparam L2_FIFO_DEPTH      = 4;
-
-  logic      [AXI_DATA_WIDTH-1:0] axi4_wdata;
-  logic                           axi4_wvalid;
-  logic                           axi4_wready;
-  logic    [AXI_DATA_WIDTH/8-1:0] axi4_wstrb;
-  logic                           axi4_wlast;
-  logic      [AXI_USER_WIDTH-1:0] axi4_wuser;
-
-  logic                           l1_fifo_valid_out;
-  logic                           l1_fifo_ready_in;
-  logic                           l1_fifo_valid_in;
-  logic                           l1_fifo_ready_out;
-
-  logic                           l1_req;
-  logic                           l1_accept_cur, l1_save_cur, l1_drop_cur;
-  logic                           l1_master_cur;
-  logic        [AXI_ID_WIDTH-1:0] l1_id_cur;
-  logic                     [7:0] l1_len_cur;
-  logic                           l1_hit_cur, l1_prefetch_cur;
-  logic                           l1_save_in, l1_save_out;
-  logic [log2(L1_FIFO_DEPTH)-1:0] n_l1_save_SP;
-
-  logic                           l2_fifo_valid_out;
-  logic                           l2_fifo_ready_in;
-  logic                           l2_fifo_valid_in;
-  logic                           l2_fifo_ready_out;
-
-  logic                           l2_req;
-  logic                           l2_accept_cur, l2_drop_cur;
-  logic                           l2_master_cur;
-  logic        [AXI_ID_WIDTH-1:0] l2_id_cur;
-  logic                     [7:0] l2_len_cur;
-  logic                           l2_hit_cur, l2_prefetch_cur;
-
-  logic                           fifo_select, fifo_select_SN, fifo_select_SP;
-  logic                           w_done;
-  logic                           b_drop_set;
-
-  // HUM buffer signals
-  logic                           hum_buf_ready_out;
-  logic                           hum_buf_valid_in;
-  logic                           hum_buf_ready_in;
-  logic                           hum_buf_valid_out;
-  logic                           hum_buf_underfull;
-
-  logic      [AXI_DATA_WIDTH-1:0] hum_buf_wdata;
-  logic    [AXI_DATA_WIDTH/8-1:0] hum_buf_wstrb;
-  logic                           hum_buf_wlast;
-  logic      [AXI_USER_WIDTH-1:0] hum_buf_wuser;
-
-  logic                           hum_buf_drop_req_SN, hum_buf_drop_req_SP;
-  logic                     [7:0] hum_buf_drop_len_SN, hum_buf_drop_len_SP;
-  logic                           hum_buf_almost_full;
-
-  logic                           stop_store;
-  logic                           wlast_in, wlast_out;
-  logic signed              [3:0] n_wlast_SN,          n_wlast_SP;
-  logic                           block_forwarding;
-
-  // Search FSM
-  typedef enum logic        [3:0] {STORE,                       BYPASS,
-                                   WAIT_L1_BYPASS_YES,          WAIT_L2_BYPASS_YES,
-                                   WAIT_L1_BYPASS_NO,           WAIT_L2_BYPASS_NO,
-                                   FLUSH,                       DISCARD,
-                                   DISCARD_FINISH}
-                                  hum_buf_state_t;
-  hum_buf_state_t                 hum_buf_SP; // Present state
-  hum_buf_state_tbg                 hum_buf_SN; // Next State
-
-  axi_buffer_rab
-    #(
-      .DATA_WIDTH       ( BUFFER_WIDTH        ),
-      .BUFFER_DEPTH     ( INPUT_BUFFER_DEPTH  )
-      )
-    u_input_buf
-    (
-      .clk       ( axi4_aclk                                                ),
-      .rstn      ( axi4_arstn                                               ),
-      // Push
-      .data_in   ( {s_axi4_wuser, s_axi4_wstrb, s_axi4_wdata, s_axi4_wlast} ),
-      .valid_in  ( s_axi4_wvalid                                            ),
-      .ready_out ( s_axi4_wready                                            ),
-      // Pop
-      .data_out  ( {axi4_wuser,   axi4_wstrb,   axi4_wdata,   axi4_wlast}   ),
-      .valid_out ( axi4_wvalid                                              ),
-      .ready_in  ( axi4_wready                                              )
-    );
-
-  axi_buffer_rab
-    #(
-      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8+4  ),
-      .BUFFER_DEPTH     ( L1_FIFO_DEPTH       )
-      )
-    u_l1_fifo
-    (
-      .clk       ( axi4_aclk                                                                                                    ),
-      .rstn      ( axi4_arstn                                                                                                   ),
-      // Push
-      .data_in   ( {l1_prefetch_i,   l1_hit_i,   l1_id_i,   l1_len_i,   l1_master_i,   l1_accept_i,   l1_save_i,   l1_drop_i}   ),
-      .valid_in  ( l1_fifo_valid_in                                                                                             ),
-      .ready_out ( l1_fifo_ready_out                                                                                            ),
-      // Pop
-      .data_out  ( {l1_prefetch_cur, l1_hit_cur, l1_id_cur, l1_len_cur, l1_master_cur, l1_accept_cur, l1_save_cur, l1_drop_cur} ),
-      .valid_out ( l1_fifo_valid_out                                                                                            ),
-      .ready_in  ( l1_fifo_ready_in                                                                                             )
-    );
-
-    // Push upon receiving new requests from the TLB.
-    assign l1_req           = l1_accept_i | l1_save_i | l1_drop_i;
-    assign l1_fifo_valid_in = l1_req & l1_fifo_ready_out;
-
-    // Signal handshake
-    assign l1_done_o  = l1_fifo_valid_in;
-    assign l2_done_o  = l2_fifo_valid_in;
-
-    // Stall AW input of L1 TLB
-    assign input_stall_o = ~(l1_fifo_ready_out & l2_fifo_ready_out);
-
-    // Interface b_drop signals + handshake
-    always_comb begin
-      if (fifo_select == 1'b0) begin
-        prefetch_o       = l1_prefetch_cur;
-        hit_o            = l1_hit_cur;
-        id_o             = l1_id_cur;
-
-        l1_fifo_ready_in = w_done | b_done_i;
-        l2_fifo_ready_in = 1'b0;
-      end else begin
-        prefetch_o       = l2_prefetch_cur;
-        hit_o            = l2_hit_cur;
-        id_o             = l2_id_cur;
-
-        l1_fifo_ready_in = 1'b0;
-        l2_fifo_ready_in = w_done | b_done_i;
-      end
-    end
-
-    // Detect when an L1 transaction save request enters or exits the L1 FIFO.
-    assign l1_save_in  = l1_fifo_valid_in & l1_save_i;
-    assign l1_save_out = l1_fifo_ready_in & l1_save_cur;
-
-    // Count the number of L1 transaction to save in the L1 FIFO.
-    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
-      if (axi4_arstn == 0) begin
-        n_l1_save_SP <= '0;
-      end else if (l1_save_in ^ l1_save_out) begin
-        if (l1_save_in) begin
-          n_l1_save_SP <= n_l1_save_SP + 1'b1;
-        end else if (l1_save_out) begin
-          n_l1_save_SP <= n_l1_save_SP - 1'b1;
-        end
-      end
-    end
-
-    // Stall forwarding of AW L1 hits if:
-    // 1. The HUM buffer does not allow to be bypassed.
-    // 2. There are multiple L1 save requests in the FIFO, i.e., multiple L2 outputs pending.
-    assign output_stall_o = (n_l1_save_SP > 1) || (block_forwarding == 1'b1);
-
-  generate
-  if (ENABLE_L2TLB == 1) begin : HUM_BUFFER
-
-    axi_buffer_rab_bram
-    #(
-      .DATA_WIDTH       ( BUFFER_WIDTH      ),
-      .BUFFER_DEPTH     ( HUM_BUFFER_DEPTH  )
-      )
-    u_hum_buf
-    (
-      .clk           ( axi4_aclk                                                    ),
-      .rstn          ( axi4_arstn                                                   ),
-      // Push
-      .data_in       ( {axi4_wuser,    axi4_wstrb,    axi4_wdata,    axi4_wlast}    ),
-      .valid_in      ( hum_buf_valid_in                                             ),
-      .ready_out     ( hum_buf_ready_out                                            ),
-      // Pop
-      .data_out      ( {hum_buf_wuser, hum_buf_wstrb, hum_buf_wdata, hum_buf_wlast} ),
-      .valid_out     ( hum_buf_valid_out                                            ),
-      .ready_in      ( hum_buf_ready_in                                             ),
-      // Clear
-      .almost_full   ( hum_buf_almost_full                                          ),
-      .underfull     ( hum_buf_underfull                                            ),
-      .drop_req      ( hum_buf_drop_req_SP                                          ),
-      .drop_len      ( hum_buf_drop_len_SP                                          )
-    );
-
-    axi_buffer_rab
-    #(
-      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8+3  ),
-      .BUFFER_DEPTH     ( L2_FIFO_DEPTH       )
-      )
-    u_l2_fifo
-    (
-      .clk       ( axi4_aclk                                                                                        ),
-      .rstn      ( axi4_arstn                                                                                       ),
-      // Push
-      .data_in   ( {l2_prefetch_i,   l2_hit_i,   l2_id_i,   l2_len_i,   l2_master_i,   l2_accept_i,   l2_drop_i}    ),
-      .valid_in  ( l2_fifo_valid_in                                                                                 ),
-      .ready_out ( l2_fifo_ready_out                                                                                ),
-      // Pop
-      .data_out  ( {l2_prefetch_cur, l2_hit_cur, l2_id_cur, l2_len_cur, l2_master_cur, l2_accept_cur, l2_drop_cur}  ),
-      .valid_out ( l2_fifo_valid_out                                                                                ),
-      .ready_in  ( l2_fifo_ready_in                                                                                 )
-    );
-
-    // Push upon receiving new result from TLB.
-    assign l2_req           = l2_accept_i | l2_drop_i;
-    assign l2_fifo_valid_in = l2_req & l2_fifo_ready_out;
-
-    assign wlast_in  =    axi4_wlast & hum_buf_valid_in  & hum_buf_ready_out;
-    assign wlast_out = hum_buf_wlast & hum_buf_valid_out & hum_buf_ready_in;
-
-    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
-      if (axi4_arstn == 0) begin
-        fifo_select_SP      <= 1'b0;
-        hum_buf_drop_len_SP <=  'b0;
-        hum_buf_drop_req_SP <= 1'b0;
-        hum_buf_SP          <= STORE;
-        n_wlast_SP          <=  'b0;
-      end else begin
-        fifo_select_SP      <= fifo_select_SN;
-        hum_buf_drop_len_SP <= hum_buf_drop_len_SN;
-        hum_buf_drop_req_SP <= hum_buf_drop_req_SN;
-        hum_buf_SP          <= hum_buf_SN;
-        n_wlast_SP          <= n_wlast_SN;
-      end
-    end
-
-    always_comb begin
-      n_wlast_SN = n_wlast_SP;
-      if (hum_buf_drop_req_SP) begin  // Happens exactly once per burst to be dropped.
-        n_wlast_SN -= 1;
-      end
-      if (wlast_in) begin
-        n_wlast_SN += 1;
-      end
-      if (wlast_out) begin
-        n_wlast_SN -= 1;
-      end
-    end
-
-    always_comb begin : HUM_BUFFER_FSM
-      hum_buf_SN       = hum_buf_SP;
-
-      m_axi4_wlast     = 1'b0;
-      m_axi4_wdata     =  'b0;
-      m_axi4_wstrb     =  'b0;
-      m_axi4_wuser     =  'b0;
-
-      m_axi4_wvalid    = 1'b0;
-      axi4_wready      = 1'b0;
-
-      hum_buf_valid_in = 1'b0;
-      hum_buf_ready_in = 1'b0;
-
-      hum_buf_drop_req_SN = hum_buf_drop_req_SP;
-      hum_buf_drop_len_SN = hum_buf_drop_len_SP;
-      master_select_o  = 1'b0;
-
-      w_done           = 1'b0; // read from FIFO without handshake with B sender
-      b_drop_o         = 1'b0; // send data from FIFO to B sender (with handshake)
-      fifo_select      = 1'b0;
-
-      fifo_select_SN   = fifo_select_SP;
-      stop_store       = 1'b0;
-
-      block_forwarding = 1'b0;
-
-      unique case (hum_buf_SP)
-
-        STORE : begin
-          // Simply store the data in the buffer.
-          hum_buf_valid_in = axi4_wvalid & hum_buf_ready_out;
-          axi4_wready      = hum_buf_ready_out;
-
-          // We have got a full burst in the HUM buffer, thus stop storing.
-          if (wlast_in & !hum_buf_underfull | (n_wlast_SP > $signed(0))) begin
-            hum_buf_SN = WAIT_L1_BYPASS_YES;
-
-          // The buffer is full, thus wait for decision.
-          end else if (~hum_buf_ready_out) begin
-            hum_buf_SN = WAIT_L1_BYPASS_NO;
-          end
-
-          // Avoid the forwarding of L1 hits until we know whether we can bypass.
-          if (l1_fifo_valid_out & l1_save_cur) begin
-            block_forwarding = 1'b1;
-          end
-        end
-
-        WAIT_L1_BYPASS_YES : begin
-          // Wait for orders from L1 TLB.
-          if (l1_fifo_valid_out) begin
-
-            // L1 hit - forward data from buffer
-            if (l1_accept_cur) begin
-              m_axi4_wlast       = hum_buf_wlast;
-              m_axi4_wdata       = hum_buf_wdata;
-              m_axi4_wstrb       = hum_buf_wstrb;
-              m_axi4_wuser       = hum_buf_wuser;
-
-              m_axi4_wvalid      = hum_buf_valid_out;
-              hum_buf_ready_in   = m_axi4_wready;
-
-              master_select_o    = l1_master_cur;
-
-              // Detect last data beat.
-              if (wlast_out) begin
-                fifo_select      = 1'b0;
-                w_done           = 1'b1;
-                hum_buf_SN       = STORE;
-              end
-
-            // L1 miss - wait for L2
-            end else if (l1_save_cur) begin
-              fifo_select        = 1'b0;
-              w_done             = 1'b1;
-              hum_buf_SN         = WAIT_L2_BYPASS_YES;
-
-            // L1 prefetch, prot, multi - drop data
-            end else if (l1_drop_cur) begin
-              fifo_select_SN      = 1'b0; // L1
-              hum_buf_drop_req_SN = 1'b1;
-              hum_buf_drop_len_SN = l1_len_cur;
-              hum_buf_SN          = FLUSH;
-            end
-          end
-        end
-
-        WAIT_L2_BYPASS_YES : begin
-          // Wait for orders from L2 TLB.
-          if (l2_fifo_valid_out) begin
-
-            // L2 hit - forward data from buffer
-            if (l2_accept_cur) begin
-              m_axi4_wlast       = hum_buf_wlast;
-              m_axi4_wdata       = hum_buf_wdata;
-              m_axi4_wstrb       = hum_buf_wstrb;
-              m_axi4_wuser       = hum_buf_wuser;
-
-              m_axi4_wvalid      = hum_buf_valid_out;
-              hum_buf_ready_in   = m_axi4_wready;
-
-              master_select_o    = l2_master_cur;
-
-              // Detect last data beat.
-              if (wlast_out) begin
-                fifo_select      = 1'b1;
-                w_done           = 1'b1;
-                hum_buf_SN       = STORE;
-              end
-
-            // L2 miss/prefetch hit
-            end else if (l2_drop_cur) begin
-              fifo_select_SN      = 1'b1; // L2
-              hum_buf_drop_req_SN = 1'b1;
-              hum_buf_drop_len_SN = l2_len_cur;
-              hum_buf_SN          = FLUSH;
-            end
-
-          // While we wait for orders from L2 TLB, we can still drop and accept L1 transactions.
-          end else if (l1_fifo_valid_out) begin
-
-            // L1 hit
-            if (l1_accept_cur) begin
-              hum_buf_SN         = BYPASS;
-
-            // L1 prefetch/prot/multi
-            end else if (l1_drop_cur) begin
-              hum_buf_SN         = DISCARD;
-            end
-          end
-        end
-
-        FLUSH : begin
-          // Clear HUM buffer flush request.
-          hum_buf_drop_req_SN = 1'b0;
-
-          // perform handshake with B sender
-          fifo_select      = fifo_select_SP;
-          b_drop_o         = 1'b1;
-          if (b_done_i) begin
-            hum_buf_SN     = STORE;
-          end
-        end
-
-        BYPASS : begin
-          // Forward one full transaction from input buffer.
-          m_axi4_wlast       = axi4_wlast;
-          m_axi4_wdata       = axi4_wdata;
-          m_axi4_wstrb       = axi4_wstrb;
-          m_axi4_wuser       = axi4_wuser;
-
-          m_axi4_wvalid      = axi4_wvalid;
-          axi4_wready        = m_axi4_wready;
-
-          master_select_o    = l1_master_cur;
-
-          // We have got a full transaction.
-          if (axi4_wlast & axi4_wready & axi4_wvalid) begin
-            fifo_select      = 1'b0;
-            w_done           = 1'b1;
-            hum_buf_SN       = WAIT_L2_BYPASS_YES;
-          end
-        end
-
-        DISCARD : begin
-          // Discard one full transaction from input buffer.
-          axi4_wready        = 1'b1;
-
-          // We have got a full transaction.
-          if (axi4_wlast & axi4_wready & axi4_wvalid) begin
-            // Try to perform handshake with B sender.
-            fifo_select      = 1'b0;
-            b_drop_o         = 1'b1;
-            // We cannot wait here due to axi4_wready.
-            if (b_done_i) begin
-              hum_buf_SN     = WAIT_L2_BYPASS_YES;
-            end else begin
-              hum_buf_SN     = DISCARD_FINISH;
-            end
-          end
-        end
-
-        DISCARD_FINISH : begin
-          // Perform handshake with B sender.
-          fifo_select      = 1'b0;
-          b_drop_o         = 1'b1;
-          if (b_done_i) begin
-            hum_buf_SN     = WAIT_L2_BYPASS_YES;
-          end
-        end
-
-        WAIT_L1_BYPASS_NO : begin
-          // Do not allow the forwarding of L1 hits.
-          block_forwarding       = 1'b1;
-
-          // Wait for orders from L1 TLB.
-          if (l1_fifo_valid_out) begin
-
-            // L1 hit - forward data from/through HUM buffer and refill the buffer
-            if (l1_accept_cur) begin
-              // Forward data from HUM buffer.
-              m_axi4_wlast       = hum_buf_wlast;
-              m_axi4_wdata       = hum_buf_wdata;
-              m_axi4_wstrb       = hum_buf_wstrb;
-              m_axi4_wuser       = hum_buf_wuser;
-
-              m_axi4_wvalid      = hum_buf_valid_out;
-              hum_buf_ready_in   = m_axi4_wready;
-
-              master_select_o    = l1_master_cur;
-
-              // Refill the HUM buffer. Stop when buffer full.
-              stop_store         = ~hum_buf_ready_out;
-              hum_buf_valid_in   = stop_store ? 1'b0 : axi4_wvalid      ;
-              axi4_wready        = stop_store ? 1'b0 : hum_buf_ready_out;
-
-              // Detect last data beat.
-              if (wlast_out) begin
-                fifo_select      = 1'b0;
-                w_done           = 1'b1;
-                if (~hum_buf_ready_out | hum_buf_almost_full) begin
-                  hum_buf_SN     = WAIT_L1_BYPASS_NO;
-                end else begin
-                  hum_buf_SN     = STORE;
-                end
-              end
-
-              // Allow the forwarding of L1 hits.
-              block_forwarding   = 1'b0;
-
-            // L1 miss - wait for L2
-            end else if (l1_save_cur) begin
-              fifo_select        = 1'b0;
-              w_done             = 1'b1;
-              hum_buf_SN         = WAIT_L2_BYPASS_NO;
-
-            // L1 prefetch, prot, multi - drop data
-            end else if (l1_drop_cur) begin
-              fifo_select_SN      = 1'b0; // L1
-              hum_buf_drop_req_SN = 1'b1;
-              hum_buf_drop_len_SN = l1_len_cur;
-              hum_buf_SN          = FLUSH;
-
-              // Allow the forwarding of L1 hits.
-              block_forwarding   = 1'b0;
-            end
-          end
-        end
-
-        WAIT_L2_BYPASS_NO : begin
-          // Do not allow the forwarding of L1 hits.
-          block_forwarding       = 1'b1;
-
-          // Wait for orders from L2 TLB.
-          if (l2_fifo_valid_out) begin
-
-            // L2 hit - forward first part from HUM buffer, rest from input buffer
-            if (l2_accept_cur) begin
-              // Forward data from HUM buffer.
-              m_axi4_wlast       = hum_buf_wlast;
-              m_axi4_wdata       = hum_buf_wdata;
-              m_axi4_wstrb       = hum_buf_wstrb;
-              m_axi4_wuser       = hum_buf_wuser;
-
-              m_axi4_wvalid      = hum_buf_valid_out;
-              hum_buf_ready_in   = m_axi4_wready;
-
-              master_select_o    = l2_master_cur;
-
-              // Refill the HUM buffer. Stop when buffer full.
-              stop_store         = ~hum_buf_ready_out;
-              hum_buf_valid_in   = stop_store ? 1'b0 : axi4_wvalid      ;
-              axi4_wready        = stop_store ? 1'b0 : hum_buf_ready_out;
-
-              // Detect last data beat.
-              if (wlast_out) begin
-                fifo_select      = 1'b1;
-                w_done           = 1'b1;
-                if (~hum_buf_ready_out | hum_buf_almost_full) begin
-                  hum_buf_SN     = WAIT_L1_BYPASS_NO;
-                end else begin
-                  hum_buf_SN     = STORE;
-                end
-              end
-
-              // Allow the forwarding of L1 hits.
-              block_forwarding   = 1'b0;
-
-            // L2 miss/prefetch hit - drop data
-            end else if (l2_drop_cur) begin
-              fifo_select_SN      = 1'b1; // L2
-              hum_buf_drop_req_SN = 1'b1;
-              hum_buf_drop_len_SN = l2_len_cur;
-              hum_buf_SN          = FLUSH;
-
-              // Allow the forwarding of L1 hits.
-              block_forwarding   = 1'b0;
-            end
-          end
-        end
-
-
-        default: begin
-          hum_buf_SN = STORE;
-        end
-
-      endcase // hum_buf_SP
-    end // HUM_BUFFER_FSM
-
-    assign b_drop_set = 1'b0;
-
-  end else begin // HUM_BUFFER
-
-    // register to perform the handshake with B sender
-    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
-      if (axi4_arstn == 0) begin
-        b_drop_o <= 1'b0;
-      end else if (b_done_i) begin
-        b_drop_o <= 1'b0;
-      end else if (b_drop_set) begin
-        b_drop_o <= 1'b1;;
-      end
-    end
-
-    always_comb begin : OUTPUT_CTRL
-
-      fifo_select   = 1'b0;
-      w_done        = 1'b0;
-      b_drop_set    = 1'b0;
-
-      m_axi4_wlast  = 1'b0;
-      m_axi4_wdata  =  'b0;
-      m_axi4_wstrb  =  'b0;
-      m_axi4_wuser  =  'b0;
-
-      m_axi4_wvalid = 1'b0;
-      axi4_wready   = 1'b0;
-
-      if (l1_fifo_valid_out) begin
-        // forward data
-        if (l1_accept_cur) begin
-          m_axi4_wlast  = axi4_wlast;
-          m_axi4_wdata  = axi4_wdata;
-          m_axi4_wstrb  = axi4_wstrb;
-          m_axi4_wuser  = axi4_wuser;
-
-          m_axi4_wvalid = axi4_wvalid;
-          axi4_wready   = m_axi4_wready;
-
-          // Simply pop from FIFO upon last data beat.
-          w_done        = axi4_wlast & axi4_wvalid & axi4_wready;
-
-        // discard entire burst
-        end else if (b_drop_o == 1'b0) begin
-          axi4_wready   = 1'b1;
-
-          // Simply pop from FIFO upon last data beat. Perform handshake with B sender.
-          if (axi4_wlast & axi4_wvalid & axi4_wready)
-            b_drop_set  = 1'b1;
-        end
-      end
-
-    end // OUTPUT_CTRL
-
-    assign master_select_o     = l1_master_cur;
-    assign l2_fifo_ready_out   = 1'b1;
-    assign block_forwarding    = 1'b0;
-
-    // unused signals
-    assign hum_buf_ready_out   = 1'b0;
-    assign hum_buf_valid_in    = 1'b0;
-    assign hum_buf_ready_in    = 1'b0;
-    assign hum_buf_valid_out   = 1'b0;
-    assign hum_buf_wdata       =  'b0;
-    assign hum_buf_wstrb       =  'b0;
-    assign hum_buf_wlast       = 1'b0;
-    assign hum_buf_wuser       =  'b0;
-    assign hum_buf_drop_len_SN =  'b0;
-    assign hum_buf_drop_req_SN = 1'b0;
-    assign hum_buf_almost_full = 1'b0;
-
-    assign l2_fifo_valid_in    = 1'b0;
-    assign l2_fifo_valid_out   = 1'b0;
-    assign l2_prefetch_cur     = 1'b0;
-    assign l2_hit_cur          = 1'b0;
-    assign l2_id_cur           =  'b0;
-    assign l2_len_cur          =  'b0;
-    assign l2_master_cur       = 1'b0;
-    assign l2_accept_cur       = 1'b0;
-    assign l2_drop_cur         = 1'b0;
-
-    assign l2_req              = 1'b0;
-
-    assign fifo_select_SN      = 1'b0;
-    assign fifo_select_SP      = 1'b0;
-
-    assign stop_store          = 1'b0;
-    assign n_wlast_SP          =  'b0;
-    assign wlast_in            = 1'b0;
-    assign wlast_out           = 1'b0;
-
-  end // HUM_BUFFER
-
-  endgenerate
-"""
diff --git a/src/unused/iommu/axi_rab/axi4_w_sender.py b/src/unused/iommu/axi_rab/axi4_w_sender.py
deleted file mode 100644 (file)
index 9916334..0000000
+++ /dev/null
@@ -1,78 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi4_w_sender(Elaboratable):
-
-    def __init__(self):
-        self.axi4_aclk = Signal()  # input
-        self.axi4_arstn = Signal()  # input
-        self.s_axi4_wdata = Signal()  # input
-        self.s_axi4_wvalid = Signal()  # input
-        self.s_axi4_wready = Signal()  # output
-        self.s_axi4_wstrb = Signal()  # input
-        self.s_axi4_wlast = Signal()  # input
-        self.s_axi4_wuser = Signal()  # input
-        self.m_axi4_wdata = Signal()  # output
-        self.m_axi4_wvalid = Signal()  # output
-        self.m_axi4_wready = Signal()  # input
-        self.m_axi4_wstrb = Signal()  # output
-        self.m_axi4_wlast = Signal()  # output
-        self.m_axi4_wuser = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.m_axi4_wdata.eq(self.s_axi4_wdata)
-        m.d.comb += self.m_axi4_wstrb.eq(self.s_axi4_wstrb)
-        m.d.comb += self.m_axi4_wlast.eq(self.s_axi4_wlast)
-        m.d.comb += self.m_axi4_wuser.eq(self.s_axi4_wuser)
-        m.d.comb += self.m_axi4_wvalid.eq(self.s_axi4_wvalid)
-        m.d.comb += self.s_axi4_wready.eq(self.m_axi4_wready)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module axi4_w_sender
-#  #(
-#    parameter AXI_DATA_WIDTH = 32,
-#    parameter AXI_USER_WIDTH = 2
-#  )
-#  (
-#    input                         axi4_aclk,
-#    input                         axi4_arstn,
-#
-#    input    [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
-#    input                         s_axi4_wvalid,
-#    output                        s_axi4_wready,
-#    input  [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
-#    input                         s_axi4_wlast,
-#    input    [AXI_USER_WIDTH-1:0] s_axi4_wuser,
-#
-#    output   [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
-#    output                        m_axi4_wvalid,
-#    input                         m_axi4_wready,
-#    output [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
-#    output                        m_axi4_wlast,
-#    output   [AXI_USER_WIDTH-1:0] m_axi4_wuser
-#  );
-#
-#  assign m_axi4_wdata  = s_axi4_wdata;
-#  assign m_axi4_wstrb  = s_axi4_wstrb;
-#  assign m_axi4_wlast  = s_axi4_wlast;
-#  assign m_axi4_wuser  = s_axi4_wuser;
-#
-#  assign m_axi4_wvalid = s_axi4_wvalid;
-#  assign s_axi4_wready = m_axi4_wready;
-#
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/axi_buffer_rab.py b/src/unused/iommu/axi_rab/axi_buffer_rab.py
deleted file mode 100644 (file)
index b4d9929..0000000
+++ /dev/null
@@ -1,151 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_buffer_rab(Elaboratable):
-
-    def __init__(self):
-        self.clk = Signal()  # input
-        self.rstn = Signal()  # input
-        self.data_out = Signal(DATA_WIDTH)  # output
-        self.valid_out = Signal()  # output
-        self.ready_in = Signal()  # input
-        self.valid_in = Signal()  # input
-        self.data_in = Signal(DATA_WIDTH)  # input
-        self.ready_out = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.d.comb += self.full.eq(self.None)
-        m.d.comb += self.data_out.eq(self.None)
-        m.d.comb += self.valid_out.eq(self.None)
-        m.d.comb += self.ready_out.eq(self.None)
-        return m
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //import CfMath::log2;
-#
-# module axi_buffer_rab
-#  //#(
-#  //  parameter DATA_WIDTH,
-#  //  parameter BUFFER_DEPTH
-#  //)
-#  (
-#    input logic                   clk,
-#    input logic                   rstn,
-#
-#    // Downstream port
-#    output logic [DATA_WIDTH-1:0] data_out,
-#    output logic                  valid_out,
-#    input  logic                  ready_in,
-#
-#    // Upstream port
-#    input  logic                  valid_in,
-#    input  logic [DATA_WIDTH-1:0] data_in,
-#    output logic                  ready_out
-#  );
-#
-#  localparam integer LOG_BUFFER_DEPTH = log2(BUFFER_DEPTH);
-#
-#    // Internal data structures
-#    reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_in;   // location to which we last wrote
-#    reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_out;  // location from which we last sent
-#    reg     [LOG_BUFFER_DEPTH : 0] elements;     // number of elements in the buffer
-#    reg       [DATA_WIDTH - 1 : 0] buffer [BUFFER_DEPTH - 1 : 0];
-#
-#    wire full;
-#
-#    integer loop1;
-#
-#    assign full = (elements == BUFFER_DEPTH);
-#
-#    always @(posedge clk or negedge rstn)
-#      begin: elements_sequential
-#        if (rstn == 1'b0)
-#          elements <= 0;
-#        else
-#        begin
-#          // ------------------
-#          // Are we filling up?
-#          // ------------------
-#          // One out, none in
-#          if (ready_in && valid_out && (!valid_in || full))
-#            elements <= elements - 1;
-#          // None out, one in
-#          else if ((!valid_out || !ready_in) && valid_in && !full)
-#            elements <= elements + 1;
-#          // Else, either one out and one in, or none out and none in - stays unchanged
-#        end
-#      end
-#
-#    always @(posedge clk or negedge rstn)
-#      begin: buffers_sequential
-#        if (rstn == 1'b0)
-#        begin
-#          for (loop1 = 0 ; loop1 < BUFFER_DEPTH ; loop1 = loop1 + 1)
-#            buffer[loop1] <= 0;
-#        end
-#        else
-#        begin
-#          // Update the memory
-#          if (valid_in && !full)
-#            buffer[pointer_in] <= data_in;
-#        end
-#      end
-#
-#    always @(posedge clk or negedge rstn)
-#      begin: sequential
-#        if (rstn == 1'b0)
-#        begin
-#          pointer_out <= 0;
-#          pointer_in <= 0;
-#        end
-#        else
-#        begin
-#          // ------------------------------------
-#          // Check what to do with the input side
-#          // ------------------------------------
-#          // We have some input, increase by 1 the input pointer
-#          if (valid_in && !full)
-#          begin
-#            if (pointer_in == $unsigned(BUFFER_DEPTH - 1))
-#              pointer_in <= 0;
-#            else
-#              pointer_in <= pointer_in + 1;
-#          end
-#          // Else we don't have any input, the input pointer stays the same
-#
-#          // -------------------------------------
-#          // Check what to do with the output side
-#          // -------------------------------------
-#          // We had pushed one flit out, we can try to go for the next one
-#          if (ready_in && valid_out)
-#          begin
-#            if (pointer_out == $unsigned(BUFFER_DEPTH - 1))
-#              pointer_out <= 0;
-#            else
-#              pointer_out <= pointer_out + 1;
-#          end
-#          // Else stay on the same output location
-#        end
-#      end
-#
-#    // Update output ports
-#    assign data_out = buffer[pointer_out];
-#    assign valid_out = (elements != 0);
-#
-#    assign ready_out = ~full;
-#
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/axi_buffer_rab_bram.py b/src/unused/iommu/axi_rab/axi_buffer_rab_bram.py
deleted file mode 100644 (file)
index 349b314..0000000
+++ /dev/null
@@ -1,209 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_buffer_rab_bram(Elaboratable):
-
-    def __init__(self):
-        self.clk = Signal()  # input
-        self.rstn = Signal()  # input
-        self.data_out = Signal(DATA_WIDTH)  # output
-        self.valid_out = Signal()  # output
-        self.ready_in = Signal()  # input
-        self.valid_in = Signal()  # input
-        self.data_in = Signal(DATA_WIDTH)  # input
-        self.ready_out = Signal()  # output
-        self.almost_full = Signal()  # output
-        self.underfull = Signal()  # output
-        self.drop_req = Signal()  # input
-        self.drop_len = Signal(8)  # input
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# ////import CfMath::log2;
-#
-# module axi_buffer_rab_bram
-#  //#(
-#  //  parameter DATA_WIDTH,
-#  //  parameter BUFFER_DEPTH
-#  //  )
-#   (
-#    input logic                   clk,
-#    input logic                   rstn,
-#
-#    // Downstream port
-#    output logic [DATA_WIDTH-1:0] data_out,
-#    output logic                  valid_out,
-#    input  logic                  ready_in,
-#
-#    // Upstream port
-#    input  logic                  valid_in,
-#    input  logic [DATA_WIDTH-1:0] data_in,
-#    output logic                  ready_out,
-#
-#    // Status and drop control
-#    output logic                  almost_full,
-#    output logic                  underfull,
-#    input  logic                  drop_req,
-#    // Number of items to drop.  As for AXI lengths, counting starts at zero, i.e., `drop_len == 0`
-#    // and `drop_req` means drop one item.
-#    input  logic [7:0]            drop_len
-#    );
-#
-"""  #docstring_begin
-  // The BRAM needs to be in "write-first" mode for first-word fall-through FIFO behavior.
-  // To still push and pop simultaneously if the buffer is full, we internally increase the
-  // buffer depth by 1.
-  localparam ACT_BUFFER_DEPTH     = BUFFER_DEPTH+1;
-  localparam ACT_LOG_BUFFER_DEPTH = log2(ACT_BUFFER_DEPTH+1);
-
-  /**
-    * Internal data structures
-    */
-  // Location to which we last wrote
-  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_in_d,                   ptr_in_q;
-  // Location from which we last sent
-  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_d,                  ptr_out_q;
-  // Required for fall-through behavior on the first word
-  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_bram;
-  // Number of elements in the buffer.  Can be negative if elements that have been dropped have not
-  // yet been written.
-  logic signed   [ACT_LOG_BUFFER_DEPTH:0] n_elems_d,                  n_elems_q;
-
-  logic           [DATA_WIDTH-1:0]        data_out_bram, data_out_q;
-  logic                                   valid_out_q;
-
-  logic full;
-
-  assign almost_full = (n_elems_q == BUFFER_DEPTH-1);
-  assign full        = (n_elems_q == BUFFER_DEPTH);
-
-  always_ff @(posedge clk, negedge rstn) begin
-    if (~rstn) begin
-      n_elems_q <= '0;
-      ptr_in_q  <= '0;
-      ptr_out_q <= '0;
-    end else begin
-      n_elems_q <= n_elems_d;
-      ptr_in_q  <= ptr_in_d;
-      ptr_out_q <= ptr_out_d;
-    end
-  end
-
-  // Update the number of elements.
-  always_comb begin
-    n_elems_d = n_elems_q;
-    if (drop_req) begin
-      n_elems_d -= (drop_len + 1);
-    end
-    if (valid_in && ready_out) begin
-      n_elems_d += 1;
-    end
-    if (valid_out && ready_in) begin
-      n_elems_d -= 1;
-    end
-  end
-
-  // Update the output pointer.
-  always_comb begin
-    ptr_out_d = ptr_out_q;
-    if (drop_req) begin
-      if ((ptr_out_q + drop_len + 1) > (ACT_BUFFER_DEPTH - 1)) begin
-        ptr_out_d = drop_len + 1 - (ACT_BUFFER_DEPTH - ptr_out_q);
-      end else begin
-        ptr_out_d += (drop_len + 1);
-      end
-    end
-    if (valid_out && ready_in) begin
-      if (ptr_out_d == (ACT_BUFFER_DEPTH - 1)) begin
-        ptr_out_d = '0;
-      end else begin
-        ptr_out_d += 1;
-      end
-    end
-  end
-
-  // The BRAM has a read latency of one cycle, so apply the new address one cycle earlier for
-  // first-word fall-through FIFO behavior.
-  //assign ptr_out_bram = (ptr_out_q == (ACT_BUFFER_DEPTH-1)) ? '0 : (ptr_out_q + 1);
-  assign ptr_out_bram = ptr_out_d;
-
-  // Update the input pointer.
-  always_comb begin
-    ptr_in_d = ptr_in_q;
-    if (valid_in && ready_out) begin
-      if (ptr_in_d == (ACT_BUFFER_DEPTH - 1)) begin
-        ptr_in_d = '0;
-      end else begin
-        ptr_in_d += 1;
-      end
-    end
-  end
-
-  // Update output ports.
-  assign valid_out = (n_elems_q > $signed(0));
-  assign underfull = (n_elems_q < $signed(0));
-  assign ready_out = ~full;
-
-  ram_tp_write_first #(
-    .ADDR_WIDTH ( ACT_LOG_BUFFER_DEPTH ),
-    .DATA_WIDTH ( DATA_WIDTH           )
-  )
-  ram_tp_write_first_0
-  (
-    .clk   ( clk              ),
-    .we    ( valid_in & ~full ),
-    .addr0 ( ptr_in_q         ),
-    .addr1 ( ptr_out_bram     ),
-    .d_i   ( data_in          ),
-    .d0_o  (                  ),
-    .d1_o  ( data_out_bram    )
-  );
-
-  // When reading from/writing two the same address on both ports ("Write-Read Collision"),
-  // the data on the read port is invalid (during the write cycle). In this implementation,
-  // this can happen only when the buffer is empty. Thus, we forward the data from an
-  // register in this case.
-  always @(posedge clk) begin
-    if (rstn == 1'b0) begin
-      data_out_q <= 'b0;
-    end else if ( (ptr_out_bram == ptr_in_q) && (valid_in && !full) ) begin
-      data_out_q <= data_in;
-    end
-  end
-
-  always @(posedge clk) begin
-    if (rstn == 1'b0) begin
-      valid_out_q <= 'b0;
-    end else begin
-      valid_out_q <= valid_out;
-    end
-  end
-
-  // Drive output data
-  always_comb begin
-    if (valid_out && !valid_out_q) begin // We have just written to an empty FIFO
-      data_out = data_out_q;
-    end else begin
-      data_out = data_out_bram;
-    end
-  end
-
-"""
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/axi_rab_cfg.py b/src/unused/iommu/axi_rab/axi_rab_cfg.py
deleted file mode 100644 (file)
index 43843b9..0000000
+++ /dev/null
@@ -1,707 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_rab_cfg(Elaboratable):
-
-    def __init__(self):
-        self.Clk_CI = Signal()  # input
-        self.Rst_RBI = Signal()  # input
-        self.s_axi_awaddr = Signal(AXI_ADDR_WIDTH)  # input
-        self.s_axi_awvalid = Signal()  # input
-        self.s_axi_awready = Signal()  # output
-        self.s_axi_wdata = Signal()  # input
-        self.s_axi_wstrb = Signal(1+ERROR p_expression_25)  # input
-        self.s_axi_wvalid = Signal()  # input
-        self.s_axi_wready = Signal()  # output
-        self.s_axi_bresp = Signal(2)  # output
-        self.s_axi_bvalid = Signal()  # output
-        self.s_axi_bready = Signal()  # input
-        self.s_axi_araddr = Signal(AXI_ADDR_WIDTH)  # input
-        self.s_axi_arvalid = Signal()  # input
-        self.s_axi_arready = Signal()  # output
-        self.s_axi_rdata = Signal(AXI_DATA_WIDTH)  # output
-        self.s_axi_rresp = Signal(2)  # output
-        self.s_axi_rvalid = Signal()  # output
-        self.s_axi_rready = Signal()  # input
-        self.L1Cfg_DO = Signal()  # output
-        self.L1AllowMultiHit_SO = Signal()  # output
-        self.MissAddr_DI = Signal(ADDR_WIDTH_VIRT)  # input
-        self.MissMeta_DI = Signal(MISS_META_WIDTH)  # input
-        self.Miss_SI = Signal()  # input
-        self.MhFifoFull_SO = Signal()  # output
-        self.wdata_l2 = Signal()  # output
-        self.waddr_l2 = Signal()  # output
-        self.wren_l2 = Signal(N_PORTS)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# // --=========================================================================--
-# //
-# //  █████╗ ██╗  ██╗██╗    ██████╗  █████╗ ██████╗      ██████╗███████╗ ██████╗
-# // ██╔══██╗╚██╗██╔╝██║    ██╔══██╗██╔══██╗██╔══██╗    ██╔════╝██╔════╝██╔════╝
-# // ███████║ ╚███╔╝ ██║    ██████╔╝███████║██████╔╝    ██║     █████╗  ██║  ███╗
-# // ██╔══██║ ██╔██╗ ██║    ██╔══██╗██╔══██║██╔══██╗    ██║     ██╔══╝  ██║   ██║
-# // ██║  ██║██╔╝ ██╗██║    ██║  ██║██║  ██║██████╔╝    ╚██████╗██║     ╚██████╔╝
-# // ╚═╝  ╚═╝╚═╝  ╚═╝╚═╝    ╚═╝  ╚═╝╚═╝  ╚═╝╚═════╝      ╚═════╝╚═╝      ╚═════╝
-# //
-# //
-# // Author: Pirmin Vogel - vogelpi@iis.ee.ethz.ch
-# //
-# // Purpose : AXI4-Lite configuration and miss handling interface for RAB
-# //
-# // --=========================================================================--
-#
-# //import CfMath::log2;
-#
-# module axi_rab_cfg
-#  #(
-#    parameter N_PORTS         =   3,
-#    parameter N_REGS          = 196,
-#    parameter N_L2_SETS       =  32,
-#    parameter N_L2_SET_ENTRIES=  32,
-#    parameter ADDR_WIDTH_PHYS =  40,
-#    parameter ADDR_WIDTH_VIRT =  32,
-#    parameter N_FLAGS         =   4,
-#    parameter AXI_DATA_WIDTH  =  64,
-#    parameter AXI_ADDR_WIDTH  =  32,
-#    parameter MISS_META_WIDTH =  10,  // <= FIFO_WIDTH
-#    parameter MH_FIFO_DEPTH   =  16
-#    )
-#   (
-#    input  logic                                    Clk_CI,
-#    input  logic                                    Rst_RBI,
-#
-#    // AXI Lite interface
-#    input  logic [AXI_ADDR_WIDTH-1:0]               s_axi_awaddr,
-#    input  logic                                    s_axi_awvalid,
-#    output logic                                    s_axi_awready,
-#    input  logic [AXI_DATA_WIDTH/8-1:0][7:0]        s_axi_wdata,
-#    input  logic [AXI_DATA_WIDTH/8-1:0]             s_axi_wstrb,
-#    input  logic                                    s_axi_wvalid,
-#    output logic                                    s_axi_wready,
-#    output logic [1:0]                              s_axi_bresp,
-#    output logic                                    s_axi_bvalid,
-#    input  logic                                    s_axi_bready,
-#    input  logic [AXI_ADDR_WIDTH-1:0]               s_axi_araddr,
-#    input  logic                                    s_axi_arvalid,
-#    output logic                                    s_axi_arready,
-#    output logic [AXI_DATA_WIDTH-1:0]               s_axi_rdata,
-#    output logic [1:0]                              s_axi_rresp,
-#    output logic                                    s_axi_rvalid,
-#    input  logic                                    s_axi_rready,
-#
-#    // Slice configuration
-#    output logic [N_REGS-1:0][63:0]                 L1Cfg_DO,
-#    output logic                                    L1AllowMultiHit_SO,
-#
-#    // Miss handling
-#    input  logic [ADDR_WIDTH_VIRT-1:0]              MissAddr_DI,
-#    input  logic [MISS_META_WIDTH-1:0]              MissMeta_DI,
-#    input  logic                                    Miss_SI,
-#    output logic                                    MhFifoFull_SO,
-#
-#    // L2 TLB
-#    output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] wdata_l2,
-#    output logic [N_PORTS-1:0] [AXI_ADDR_WIDTH-1:0] waddr_l2,
-#    output logic [N_PORTS-1:0]                      wren_l2
-#  );
-#
-"""  #docstring_begin
-
-  localparam ADDR_LSB = log2(64/8); // 64 even if the AXI Lite interface is 32,
-                                    // because RAB slices are 64 bit wide.
-  localparam ADDR_MSB = log2(N_REGS)+ADDR_LSB-1;
-
-  localparam L2SINGLE_AMAP_SIZE = 16'h4000; // Maximum 2048 TLB entries in L2
-
-  localparam integer N_L2_ENTRIES = N_L2_SETS * N_L2_SET_ENTRIES;
-
-  localparam logic [AXI_ADDR_WIDTH-1:0] L2_VA_MAX_ADDR = (N_L2_ENTRIES-1) << 2;
-
-  logic [AXI_DATA_WIDTH/8-1:0][7:0] L1Cfg_DP[N_REGS]; // [Byte][Bit]
-  genvar j;
-
-  //  █████╗ ██╗  ██╗██╗██╗  ██╗      ██╗     ██╗████████╗███████╗
-  // ██╔══██╗╚██╗██╔╝██║██║  ██║      ██║     ██║╚══██╔══╝██╔════╝
-  // ███████║ ╚███╔╝ ██║███████║█████╗██║     ██║   ██║   █████╗
-  // ██╔══██║ ██╔██╗ ██║╚════██║╚════╝██║     ██║   ██║   ██╔══╝
-  // ██║  ██║██╔╝ ██╗██║     ██║      ███████╗██║   ██║   ███████╗
-  // ╚═╝  ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝      ╚══════╝╚═╝   ╚═╝   ╚══════╝
-  //
-  logic [AXI_ADDR_WIDTH-1:0]        awaddr_reg;
-  logic                             awaddr_done_rise;
-  logic                             awaddr_done_reg;
-  logic                             awaddr_done_reg_dly;
-
-  logic [AXI_DATA_WIDTH/8-1:0][7:0] wdata_reg;
-  logic [AXI_DATA_WIDTH/8-1:0]      wstrb_reg;
-  logic                             wdata_done_rise;
-  logic                             wdata_done_reg;
-  logic                             wdata_done_reg_dly;
-
-  logic                             wresp_done_reg;
-  logic                             wresp_running_reg;
-
-  logic [AXI_ADDR_WIDTH-1:0]        araddr_reg;
-  logic                             araddr_done_reg;
-
-  logic [AXI_DATA_WIDTH-1:0]        rdata_reg;
-  logic                             rresp_done_reg;
-  logic                             rresp_running_reg;
-
-  logic                             awready;
-  logic                             wready;
-  logic                             bvalid;
-
-  logic                             arready;
-  logic                             rvalid;
-
-  logic                             wren;
-  logic                             wren_l1;
-
-  assign wren = ( wdata_done_rise & awaddr_done_reg ) | ( awaddr_done_rise & wdata_done_reg );
-  assign wdata_done_rise  = wdata_done_reg  & ~wdata_done_reg_dly;
-  assign awaddr_done_rise = awaddr_done_reg & ~awaddr_done_reg_dly;
-
-  // reg_dly
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            wdata_done_reg_dly  <= 1'b0;
-            awaddr_done_reg_dly <= 1'b0;
-         end
-       else
-         begin
-            wdata_done_reg_dly  <= wdata_done_reg;
-            awaddr_done_reg_dly <= awaddr_done_reg;
-         end
-    end
-
-  // AW Channel
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            awaddr_done_reg <= 1'b0;
-            awaddr_reg      <= '0;
-            awready         <= 1'b1;
-         end
-       else
-         begin
-            if (awready && s_axi_awvalid)
-              begin
-                 awready         <= 1'b0;
-                 awaddr_done_reg <= 1'b1;
-                 awaddr_reg      <= s_axi_awaddr;
-              end
-            else if (awaddr_done_reg && wresp_done_reg)
-              begin
-                 awready         <= 1'b1;
-                 awaddr_done_reg <= 1'b0;
-              end
-         end
-    end
-
-  // W Channel
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            wdata_done_reg <= 1'b0;
-            wready         <= 1'b1;
-            wdata_reg      <= '0;
-            wstrb_reg      <= '0;
-         end
-       else
-         begin
-            if (wready && s_axi_wvalid)
-              begin
-                 wready         <= 1'b0;
-                 wdata_done_reg <= 1'b1;
-                 wdata_reg      <= s_axi_wdata;
-                 wstrb_reg      <= s_axi_wstrb;
-              end
-            else if (wdata_done_reg && wresp_done_reg)
-              begin
-                 wready         <= 1'b1;
-                 wdata_done_reg <= 1'b0;
-              end
-         end
-    end
-
-  // B Channel
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            bvalid            <= 1'b0;
-            wresp_done_reg    <= 1'b0;
-            wresp_running_reg <= 1'b0;
-         end
-       else
-         begin
-            if (awaddr_done_reg && wdata_done_reg && !wresp_done_reg)
-              begin
-                 if (!wresp_running_reg)
-                   begin
-                      bvalid            <= 1'b1;
-                      wresp_running_reg <= 1'b1;
-                   end
-                 else if (s_axi_bready)
-                   begin
-                      bvalid            <= 1'b0;
-                      wresp_done_reg    <= 1'b1;
-                      wresp_running_reg <= 1'b0;
-                   end
-              end
-            else
-              begin
-                 bvalid            <= 1'b0;
-                 wresp_done_reg    <= 1'b0;
-                 wresp_running_reg <= 1'b0;
-              end
-         end
-    end
-
-  // AR Channel
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            araddr_done_reg <= 1'b0;
-            arready         <= 1'b1;
-            araddr_reg       <= '0;
-         end
-       else
-         begin
-            if (arready && s_axi_arvalid)
-              begin
-                 arready         <= 1'b0;
-                 araddr_done_reg <= 1'b1;
-                 araddr_reg      <= s_axi_araddr;
-              end
-            else if (araddr_done_reg && rresp_done_reg)
-              begin
-                 arready         <= 1'b1;
-                 araddr_done_reg <= 1'b0;
-              end
-         end
-    end
-
-  // R Channel
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin
-       if (!Rst_RBI)
-         begin
-            rresp_done_reg    <= 1'b0;
-            rvalid            <= 1'b0;
-            rresp_running_reg <= 1'b0;
-         end
-       else
-         begin
-            if (araddr_done_reg && !rresp_done_reg)
-              begin
-                 if (!rresp_running_reg)
-                   begin
-                      rvalid            <= 1'b1;
-                      rresp_running_reg <= 1'b1;
-                   end
-                 else if (s_axi_rready)
-                   begin
-                      rvalid            <= 1'b0;
-                      rresp_done_reg    <= 1'b1;
-                      rresp_running_reg <= 1'b0;
-                   end
-              end
-            else
-              begin
-                 rvalid            <= 1'b0;
-                 rresp_done_reg    <= 1'b0;
-                 rresp_running_reg <= 1'b0;
-              end
-         end
-    end
-
-  // ██╗     ██╗     ██████╗███████╗ ██████╗     ██████╗ ███████╗ ██████╗
-  // ██║    ███║    ██╔════╝██╔════╝██╔════╝     ██╔══██╗██╔════╝██╔════╝
-  // ██║    ╚██║    ██║     █████╗  ██║  ███╗    ██████╔╝█████╗  ██║  ███╗
-  // ██║     ██║    ██║     ██╔══╝  ██║   ██║    ██╔══██╗██╔══╝  ██║   ██║
-  // ███████╗██║    ╚██████╗██║     ╚██████╔╝    ██║  ██║███████╗╚██████╔╝
-  // ╚══════╝╚═╝     ╚═════╝╚═╝      ╚═════╝     ╚═╝  ╚═╝╚══════╝ ╚═════╝
-  //
-  assign wren_l1 = wren && (awaddr_reg < L2SINGLE_AMAP_SIZE);
-
-  always @( posedge Clk_CI or negedge Rst_RBI )
-    begin
-      var integer idx_reg, idx_byte;
-      if ( Rst_RBI == 1'b0 )
-        begin
-          for ( idx_reg = 0; idx_reg < N_REGS; idx_reg++ )
-            L1Cfg_DP[idx_reg] <= '0;
-        end
-      else if ( wren_l1 )
-          begin
-            if ( awaddr_reg[ADDR_LSB+1] == 1'b0 ) begin                     // VIRT_ADDR
-              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
-                if ( (idx_byte < ADDR_WIDTH_VIRT/8) ) begin
-                  if ( wstrb_reg[idx_byte] ) begin
-                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
-                  end
-                end
-                else begin  // Let synthesizer optimize away unused registers.
-                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
-                end
-              end
-            end
-            else if ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b10 ) begin      // PHYS_ADDR
-              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
-                if ( (idx_byte < ADDR_WIDTH_PHYS/8) ) begin
-                  if ( wstrb_reg[idx_byte] ) begin
-                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
-                  end
-                end
-                else begin  // Let synthesizer optimize away unused registers.
-                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
-                end
-              end
-            end
-            else begin // ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b11 )      // FLAGS
-              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
-                if ( (idx_byte < 1) ) begin
-                  if ( wstrb_reg[idx_byte] ) begin
-                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte] & { {{8-N_FLAGS}{1'b0}}, {{N_FLAGS}{1'b1}} };
-                  end
-                end
-                else begin  // Let synthesizer optimize away unused registers.
-                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
-                end
-              end
-            end
-          end
-    end // always @ ( posedge Clk_CI or negedge Rst_RBI )
-
-  generate
-    // Mask unused bits -> Synthesizer should optimize away unused registers
-    for( j=0; j<N_REGS; j++ ) begin
-      if ( j[1] == 1'b0 ) // VIRT_ADDR
-        assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_VIRT}{1'b0}},{ADDR_WIDTH_VIRT{1'b1}} } & L1Cfg_DP[j];
-      else if ( j[1:0] == 2'b10 ) // PHYS_ADDR
-        assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_PHYS}{1'b0}},{ADDR_WIDTH_PHYS{1'b1}} } & L1Cfg_DP[j];
-      else // if ( j[1:0] == 2'b11 ) // FLAGS
-        assign L1Cfg_DO[j] = { {{64-N_FLAGS}{1'b0}},{N_FLAGS{1'b1}} } & L1Cfg_DP[j];
-    end
-  endgenerate
-
-  always_comb
-    begin
-      if ( araddr_reg[ADDR_LSB-1] == 1'b1 ) // read upper 32 bit, for debugging over 32-bit interface
-        rdata_reg = { {32'h00000000},{L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]][63:32]} };
-      else
-        rdata_reg = L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]];
-    end
-
-  assign s_axi_awready = awready;
-  assign s_axi_wready  = wready;
-
-  assign s_axi_bresp   = 2'b00;
-  assign s_axi_bvalid  = bvalid;
-
-  assign s_axi_arready = arready;
-  assign s_axi_rresp   = 2'b00;
-  assign s_axi_rvalid  = rvalid;
-
-  // ██╗     ██████╗      ██████╗███████╗ ██████╗
-  // ██║     ╚════██╗    ██╔════╝██╔════╝██╔════╝
-  // ██║      █████╔╝    ██║     █████╗  ██║  ███╗
-  // ██║     ██╔═══╝     ██║     ██╔══╝  ██║   ██║
-  // ███████╗███████╗    ╚██████╗██║     ╚██████╔╝
-  // ╚══════╝╚══════╝     ╚═════╝╚═╝      ╚═════╝
-  //
-  logic [N_PORTS-1:0] l2_addr_is_in_va_rams;
-  logic [N_PORTS-1:0] upper_word_is_written;
-  logic [N_PORTS-1:0] lower_word_is_written;
-  generate
-    for( j=0; j< N_PORTS; j++)
-      begin
-        if (AXI_DATA_WIDTH == 64) begin
-          assign l2_addr_is_in_va_rams[j] = (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg[log2(L2SINGLE_AMAP_SIZE)-1:0] <= L2_VA_MAX_ADDR);
-          assign upper_word_is_written[j] = (wstrb_reg[7:4] != 4'b0000);
-          assign lower_word_is_written[j] = (wstrb_reg[3:0] != 4'b0000);
-        end else begin
-          assign l2_addr_is_in_va_rams[j] = 1'b0;
-          assign upper_word_is_written[j] = 1'b0;
-          assign lower_word_is_written[j] = 1'b0;
-        end
-
-        always @( posedge Clk_CI or negedge Rst_RBI ) begin
-          var integer idx_byte, off_byte;
-          if ( Rst_RBI == 1'b0 )
-            begin
-              wren_l2[j]  <= 1'b0;
-              wdata_l2[j] <= '0;
-            end
-          else if (wren)
-            begin
-              if ( (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg < (j+2)*L2SINGLE_AMAP_SIZE) && (|wstrb_reg) )
-                wren_l2[j] <= 1'b1;
-              if      (AXI_DATA_WIDTH == 32) begin
-                for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ )
-                  wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte] & {8{wstrb_reg[idx_byte]}};
-              end
-              else if (AXI_DATA_WIDTH == 64) begin
-                if (lower_word_is_written[j] == 1'b1)
-                  off_byte = 0;
-                else
-                  off_byte = 4;
-                // always put the payload in the lower word and set upper word to 0
-                for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8/2; idx_byte++ )
-                    wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte+off_byte] & {8{wstrb_reg[idx_byte+off_byte]}};
-                wdata_l2[j][AXI_DATA_WIDTH-1:AXI_DATA_WIDTH/2] <= 'b0;
-              end
-              // pragma translate_off
-              else
-                $fatal(1, "Unsupported AXI_DATA_WIDTH!");
-              // pragma translate_on
-            end
-          else
-            wren_l2[j] <= '0;
-        end // always @ ( posedge Clk_CI or negedge Rst_RBI )
-
-        // Properly align the 32-bit word address when writing from 64-bit interface:
-        // Depending on the system, the incoming address is (non-)aligned to the 64-bit
-        // word when writing the upper 32-bit word.
-        always_comb begin
-          waddr_l2[j] = (awaddr_reg -(j+1)*L2SINGLE_AMAP_SIZE)/4;
-          if (wren_l2[j]) begin
-            if (AXI_DATA_WIDTH == 64) begin
-              if (upper_word_is_written[j] == 1'b1) begin
-                // address must be non-aligned
-                waddr_l2[j][0] = 1'b1;
-              end
-            end
-            // pragma translate_off
-            else if (AXI_DATA_WIDTH != 32) begin
-              $fatal(1, "Unsupported AXI_DATA_WIDTH!");
-            end
-            // pragma translate_on
-          end
-        end
-
-        // Assert that only one 32-bit word is ever written at a time to VA RAMs on 64-bit data
-        // systems.
-        // pragma translate_off
-        always_ff @ (posedge Clk_CI) begin
-          if (AXI_DATA_WIDTH == 64) begin
-            if  (l2_addr_is_in_va_rams[j]) begin
-              if (upper_word_is_written[j]) begin
-                assert (!lower_word_is_written[j])
-                  else $error("Unsupported write across two 32-bit words to VA RAMs!");
-              end
-              else if (lower_word_is_written[j]) begin
-                assert (!upper_word_is_written[j])
-                  else $error("Unsupported write across two 32-bit words to VA RAMs!");
-              end
-            end
-          end
-        end
-        // pragma translate_on
-
-      end // for (j=0; j< N_PORTS; j++)
-   endgenerate
-
-  // ███╗   ███╗██╗  ██╗    ███████╗██╗███████╗ ██████╗ ███████╗
-  // ████╗ ████║██║  ██║    ██╔════╝██║██╔════╝██╔═══██╗██╔════╝
-  // ██╔████╔██║███████║    █████╗  ██║█████╗  ██║   ██║███████╗
-  // ██║╚██╔╝██║██╔══██║    ██╔══╝  ██║██╔══╝  ██║   ██║╚════██║
-  // ██║ ╚═╝ ██║██║  ██║    ██║     ██║██║     ╚██████╔╝███████║
-  // ╚═╝     ╚═╝╚═╝  ╚═╝    ╚═╝     ╚═╝╚═╝      ╚═════╝ ╚══════╝
-  //
-  logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDin_D;
-  logic                       AddrFifoWen_S;
-  logic                       AddrFifoRen_S;
-  logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDout_D;
-  logic                       AddrFifoFull_S;
-  logic                       AddrFifoEmpty_S;
-  logic                       AddrFifoEmpty_SB;
-  logic                       AddrFifoFull_SB;
-
-  logic [MISS_META_WIDTH-1:0] MetaFifoDin_D;
-  logic                       MetaFifoWen_S;
-  logic                       MetaFifoRen_S;
-  logic [MISS_META_WIDTH-1:0] MetaFifoDout_D;
-  logic                       MetaFifoFull_S;
-  logic                       MetaFifoEmpty_S;
-  logic                       MetaFifoEmpty_SB;
-  logic                       MetaFifoFull_SB;
-
-  logic                       FifosDisabled_S;
-  logic                       ConfRegWen_S;
-  logic                 [1:0] ConfReg_DN;
-  logic                 [1:0] ConfReg_DP;
-
-  logic [AXI_DATA_WIDTH-1:0] wdata_reg_vec;
-
-  assign FifosDisabled_S    = ConfReg_DP[0];
-  assign L1AllowMultiHit_SO = ConfReg_DP[1];
-
-  assign AddrFifoEmpty_S = ~AddrFifoEmpty_SB;
-  assign MetaFifoEmpty_S = ~MetaFifoEmpty_SB;
-
-  assign AddrFifoFull_S = ~AddrFifoFull_SB;
-  assign MetaFifoFull_S = ~MetaFifoFull_SB;
-
-  assign MhFifoFull_SO = (AddrFifoWen_S & AddrFifoFull_S) | (MetaFifoWen_S & MetaFifoFull_S);
-
-  generate
-     for ( j=0; j<AXI_DATA_WIDTH/8; j++ )
-       assign wdata_reg_vec[(j+1)*8-1:j*8] = wdata_reg[j];
-  endgenerate
-
-  // write address FIFO
-  always_comb
-    begin
-       AddrFifoWen_S = 1'b0;
-       AddrFifoDin_D = 'b0;
-       if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
-         begin
-            AddrFifoWen_S = 1'b1;
-            AddrFifoDin_D = MissAddr_DI;
-         end
-       else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 'b0) && (FifosDisabled_S == 1'b0)) // write request from AXI interface
-         begin
-            AddrFifoWen_S = 1'b1;
-            AddrFifoDin_D = wdata_reg_vec[ADDR_WIDTH_VIRT-1:0];
-         end
-    end
-
-  // write meta FIFO
-  always_comb
-    begin
-       MetaFifoWen_S = 1'b0;
-       MetaFifoDin_D = 'b0;
-       if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
-         begin
-            MetaFifoWen_S                      = 1'b1;
-            MetaFifoDin_D[MISS_META_WIDTH-1:0] = MissMeta_DI;
-         end
-       else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 4'h8) && (FifosDisabled_S == 1'b0) ) // write request from AXI interface
-         begin
-            MetaFifoWen_S = 1'b1;
-            MetaFifoDin_D = wdata_reg_vec[MISS_META_WIDTH-1:0];
-         end
-    end
-
-  // write configuration register
-  always_comb
-    begin
-       ConfRegWen_S = 1'b0;
-       ConfReg_DN   = 1'b0;
-       if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 8'h10) ) // write request from AXI interface
-         begin
-            ConfRegWen_S = 1'b1;
-            ConfReg_DN   = wdata_reg_vec[$high(ConfReg_DN):0];
-         end
-    end
-
-  // AXI read data
-  always_comb
-    begin
-       s_axi_rdata   = rdata_reg; // read L1 config
-       AddrFifoRen_S = 1'b0;
-       MetaFifoRen_S = 1'b0;
-       if ( rvalid == 1'b1 )
-         begin
-            // read address FIFO
-            if ( araddr_reg[ADDR_MSB:0] == 'b0 )
-              begin
-                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
-                s_axi_rdata[ADDR_WIDTH_VIRT-1:0] = AddrFifoDout_D;
-                if ( AddrFifoEmpty_S == 1'b0 )
-                  AddrFifoRen_S = 1'b1;
-              end
-            // read meta FIFO
-            else if ( araddr_reg[ADDR_MSB:0] == 4'h8 )
-              begin
-                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
-                s_axi_rdata[31]                  = MetaFifoEmpty_S;
-                s_axi_rdata[MISS_META_WIDTH-1:0] = MetaFifoDout_D;
-                if ( MetaFifoEmpty_S == 1'b0 )
-                  MetaFifoRen_S = 1'b1;
-              end
-            // read configuration register
-            else if ( araddr_reg[ADDR_MSB:0] == 8'h10 )
-              begin
-                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
-                s_axi_rdata[$high(ConfReg_DP):0] = ConfReg_DP;
-              end
-         end // if ( rvalid == 1'b1 )
-    end // always_comb begin
-
-  // configuration register
-  always_ff @(posedge Clk_CI or negedge Rst_RBI) begin
-    if (Rst_RBI == 1'b0)
-      begin
-        ConfReg_DP <= 'b0;
-      end
-    else if (ConfRegWen_S == 1'b1)
-      begin
-        ConfReg_DP <= ConfReg_DN;
-      end
-  end
-
-  generic_fifo
-    #(
-      .DATA_WIDTH ( ADDR_WIDTH_VIRT ),
-      .DATA_DEPTH ( MH_FIFO_DEPTH   )
-      )
-    fifo_addr_i
-    (
-      .clk         ( Clk_CI                          ),
-      .rst_n       ( Rst_RBI                         ),
-      .data_i      ( AddrFifoDin_D                   ),
-      .valid_i     ( AddrFifoWen_S & AddrFifoFull_SB ),
-      .grant_o     ( AddrFifoFull_SB                 ),
-      .data_o      ( AddrFifoDout_D                  ),
-      .valid_o     ( AddrFifoEmpty_SB                ),
-      .grant_i     ( AddrFifoRen_S                   ),
-      .test_mode_i ( 1'b0                            )
-    );
-
-  generic_fifo
-    #(
-      .DATA_WIDTH ( MISS_META_WIDTH ),
-      .DATA_DEPTH ( MH_FIFO_DEPTH   )
-      )
-    fifo_meta_i
-    (
-      .clk         ( Clk_CI                          ),
-      .rst_n       ( Rst_RBI                         ),
-      .data_i      ( MetaFifoDin_D                   ),
-      .valid_i     ( MetaFifoWen_S & MetaFifoFull_SB ),
-      .grant_o     ( MetaFifoFull_SB                 ),
-      .data_o      ( MetaFifoDout_D                  ),
-      .valid_o     ( MetaFifoEmpty_SB                ),
-      .grant_i     ( MetaFifoRen_S                   ),
-      .test_mode_i ( 1'b0                            )
-    );
-"""
-#
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/axi_rab_top.py b/src/unused/iommu/axi_rab/axi_rab_top.py
deleted file mode 100644 (file)
index ea1a802..0000000
+++ /dev/null
@@ -1,2642 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class axi_rab_top(Elaboratable):
-
-    def __init__(self):
-        self.Clk_CI = Signal()  # input
-        self.NonGatedClk_CI = Signal()  # input
-        self.Rst_RBI = Signal()  # input
-        self.s_axi4_awid = Signal()  # input
-        self.s_axi4_awaddr = Signal()  # input
-        self.s_axi4_awvalid = Signal(N_PORTS)  # input
-        self.s_axi4_awready = Signal(N_PORTS)  # output
-        self.s_axi4_awlen = Signal()  # input
-        self.s_axi4_awsize = Signal()  # input
-        self.s_axi4_awburst = Signal()  # input
-        self.s_axi4_awlock = Signal(N_PORTS)  # input
-        self.s_axi4_awprot = Signal()  # input
-        self.s_axi4_awcache = Signal()  # input
-        self.s_axi4_awregion = Signal()  # input
-        self.s_axi4_awqos = Signal()  # input
-        self.s_axi4_awuser = Signal()  # input
-        self.s_axi4_wdata = Signal()  # input
-        self.s_axi4_wvalid = Signal(N_PORTS)  # input
-        self.s_axi4_wready = Signal(N_PORTS)  # output
-        self.s_axi4_wstrb = Signal()  # input
-        self.s_axi4_wlast = Signal(N_PORTS)  # input
-        self.s_axi4_wuser = Signal()  # input
-        self.s_axi4_bid = Signal()  # output
-        self.s_axi4_bresp = Signal()  # output
-        self.s_axi4_bvalid = Signal(N_PORTS)  # output
-        self.s_axi4_buser = Signal()  # output
-        self.s_axi4_bready = Signal(N_PORTS)  # input
-        self.s_axi4_arid = Signal()  # input
-        self.s_axi4_araddr = Signal()  # input
-        self.s_axi4_arvalid = Signal(N_PORTS)  # input
-        self.s_axi4_arready = Signal(N_PORTS)  # output
-        self.s_axi4_arlen = Signal()  # input
-        self.s_axi4_arsize = Signal()  # input
-        self.s_axi4_arburst = Signal()  # input
-        self.s_axi4_arlock = Signal(N_PORTS)  # input
-        self.s_axi4_arprot = Signal()  # input
-        self.s_axi4_arcache = Signal()  # input
-        self.s_axi4_aruser = Signal()  # input
-        self.s_axi4_rid = Signal()  # output
-        self.s_axi4_rdata = Signal()  # output
-        self.s_axi4_rresp = Signal()  # output
-        self.s_axi4_rvalid = Signal(N_PORTS)  # output
-        self.s_axi4_rready = Signal(N_PORTS)  # input
-        self.s_axi4_rlast = Signal(N_PORTS)  # output
-        self.s_axi4_ruser = Signal()  # output
-        self.m0_axi4_awid = Signal()  # output
-        self.m0_axi4_awaddr = Signal()  # output
-        self.m0_axi4_awvalid = Signal(N_PORTS)  # output
-        self.m0_axi4_awready = Signal(N_PORTS)  # input
-        self.m0_axi4_awlen = Signal()  # output
-        self.m0_axi4_awsize = Signal()  # output
-        self.m0_axi4_awburst = Signal()  # output
-        self.m0_axi4_awlock = Signal(N_PORTS)  # output
-        self.m0_axi4_awprot = Signal()  # output
-        self.m0_axi4_awcache = Signal()  # output
-        self.m0_axi4_awregion = Signal()  # output
-        self.m0_axi4_awqos = Signal()  # output
-        self.m0_axi4_awuser = Signal()  # output
-        self.m0_axi4_wdata = Signal()  # output
-        self.m0_axi4_wvalid = Signal(N_PORTS)  # output
-        self.m0_axi4_wready = Signal(N_PORTS)  # input
-        self.m0_axi4_wstrb = Signal()  # output
-        self.m0_axi4_wlast = Signal(N_PORTS)  # output
-        self.m0_axi4_wuser = Signal()  # output
-        self.m0_axi4_bid = Signal()  # input
-        self.m0_axi4_bresp = Signal()  # input
-        self.m0_axi4_bvalid = Signal(N_PORTS)  # input
-        self.m0_axi4_buser = Signal()  # input
-        self.m0_axi4_bready = Signal(N_PORTS)  # output
-        self.m0_axi4_arid = Signal()  # output
-        self.m0_axi4_araddr = Signal()  # output
-        self.m0_axi4_arvalid = Signal(N_PORTS)  # output
-        self.m0_axi4_arready = Signal(N_PORTS)  # input
-        self.m0_axi4_arlen = Signal()  # output
-        self.m0_axi4_arsize = Signal()  # output
-        self.m0_axi4_arburst = Signal()  # output
-        self.m0_axi4_arlock = Signal(N_PORTS)  # output
-        self.m0_axi4_arprot = Signal()  # output
-        self.m0_axi4_arcache = Signal()  # output
-        self.m0_axi4_aruser = Signal()  # output
-        self.m0_axi4_rid = Signal()  # input
-        self.m0_axi4_rdata = Signal()  # input
-        self.m0_axi4_rresp = Signal()  # input
-        self.m0_axi4_rvalid = Signal(N_PORTS)  # input
-        self.m0_axi4_rready = Signal(N_PORTS)  # output
-        self.m0_axi4_rlast = Signal(N_PORTS)  # input
-        self.m0_axi4_ruser = Signal()  # input
-        self.m1_axi4_awid = Signal()  # output
-        self.m1_axi4_awaddr = Signal()  # output
-        self.m1_axi4_awvalid = Signal(N_PORTS)  # output
-        self.m1_axi4_awready = Signal(N_PORTS)  # input
-        self.m1_axi4_awlen = Signal()  # output
-        self.m1_axi4_awsize = Signal()  # output
-        self.m1_axi4_awburst = Signal()  # output
-        self.m1_axi4_awlock = Signal(N_PORTS)  # output
-        self.m1_axi4_awprot = Signal()  # output
-        self.m1_axi4_awcache = Signal()  # output
-        self.m1_axi4_awregion = Signal()  # output
-        self.m1_axi4_awqos = Signal()  # output
-        self.m1_axi4_awuser = Signal()  # output
-        self.m1_axi4_wdata = Signal()  # output
-        self.m1_axi4_wvalid = Signal(N_PORTS)  # output
-        self.m1_axi4_wready = Signal(N_PORTS)  # input
-        self.m1_axi4_wstrb = Signal()  # output
-        self.m1_axi4_wlast = Signal(N_PORTS)  # output
-        self.m1_axi4_wuser = Signal()  # output
-        self.m1_axi4_bid = Signal()  # input
-        self.m1_axi4_bresp = Signal()  # input
-        self.m1_axi4_bvalid = Signal(N_PORTS)  # input
-        self.m1_axi4_buser = Signal()  # input
-        self.m1_axi4_bready = Signal(N_PORTS)  # output
-        self.m1_axi4_arid = Signal()  # output
-        self.m1_axi4_araddr = Signal()  # output
-        self.m1_axi4_arvalid = Signal(N_PORTS)  # output
-        self.m1_axi4_arready = Signal(N_PORTS)  # input
-        self.m1_axi4_arlen = Signal()  # output
-        self.m1_axi4_arsize = Signal()  # output
-        self.m1_axi4_arburst = Signal()  # output
-        self.m1_axi4_arlock = Signal(N_PORTS)  # output
-        self.m1_axi4_arprot = Signal()  # output
-        self.m1_axi4_arcache = Signal()  # output
-        self.m1_axi4_aruser = Signal()  # output
-        self.m1_axi4_rid = Signal()  # input
-        self.m1_axi4_rdata = Signal()  # input
-        self.m1_axi4_rresp = Signal()  # input
-        self.m1_axi4_rvalid = Signal(N_PORTS)  # input
-        self.m1_axi4_rready = Signal(N_PORTS)  # output
-        self.m1_axi4_rlast = Signal(N_PORTS)  # input
-        self.m1_axi4_ruser = Signal()  # input
-        self.s_axi4lite_awaddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
-        self.s_axi4lite_awvalid = Signal()  # input
-        self.s_axi4lite_awready = Signal()  # output
-        self.s_axi4lite_wdata = Signal(AXI_LITE_DATA_WIDTH)  # input
-        self.s_axi4lite_wvalid = Signal()  # input
-        self.s_axi4lite_wready = Signal()  # output
-        self.s_axi4lite_wstrb = Signal(1+ERROR p_expression_25)  # input
-        self.s_axi4lite_bresp = Signal(2)  # output
-        self.s_axi4lite_bvalid = Signal()  # output
-        self.s_axi4lite_bready = Signal()  # input
-        self.s_axi4lite_araddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
-        self.s_axi4lite_arvalid = Signal()  # input
-        self.s_axi4lite_arready = Signal()  # output
-        self.s_axi4lite_rdata = Signal(AXI_LITE_DATA_WIDTH)  # output
-        self.s_axi4lite_rresp = Signal(2)  # output
-        self.s_axi4lite_rvalid = Signal()  # output
-        self.s_axi4lite_rready = Signal()  # input
-        self.int_miss = Signal(N_PORTS)  # output
-        self.int_multi = Signal(N_PORTS)  # output
-        self.int_prot = Signal(N_PORTS)  # output
-        self.int_mhf_full = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# // --=========================================================================--
-# //
-# //  █████╗ ██╗  ██╗██╗    ██████╗  █████╗ ██████╗     ████████╗ ██████╗ ██████╗
-# // ██╔══██╗╚██╗██╔╝██║    ██╔══██╗██╔══██╗██╔══██╗    ╚══██╔══╝██╔═══██╗██╔══██╗
-# // ███████║ ╚███╔╝ ██║    ██████╔╝███████║██████╔╝       ██║   ██║   ██║██████╔╝
-# // ██╔══██║ ██╔██╗ ██║    ██╔══██╗██╔══██║██╔══██╗       ██║   ██║   ██║██╔═══╝
-# // ██║  ██║██╔╝ ██╗██║    ██║  ██║██║  ██║██████╔╝       ██║   ╚██████╔╝██║
-# // ╚═╝  ╚═╝╚═╝  ╚═╝╚═╝    ╚═╝  ╚═╝╚═╝  ╚═╝╚═════╝        ╚═╝    ╚═════╝ ╚═╝
-# //
-# // --=========================================================================--
-# /*
-# * axi_rab_top
-# *
-# * The remapping address block (RAB) performs address translation for AXI
-# * transactions arriving at the input port and forwards them to different
-# * downstream AXI ports.
-# *
-# * The five axi channels are each buffered on the input side using a FIFO,
-# * described in axi4_XX_buffer. The RAB lookup result is merged into the
-# * AXI transaction via the axi4_XX_sender instances, which manages upstream
-# * error signaling for failed lookups.
-# *
-# * Address translation is performed based on data stored in up to two
-# * translation lookaside buffers (TLBs), which are private per RAB port (each
-# * of which having two AXI master ports and one AXI slave port). These TLBs
-# * are managed in software through the AXI-Lite interface.
-# *
-# * If ACP is enabled, the `cache_coherent` flag in the TLBs is used to
-# * multiplex between the two ports. If ACP is disabled, only the first master
-# * port is used. In this case, the `cache_coherent` flag is used to set the
-# * AxCACHE signals of the AXI bus accordingly.
-# *
-# * Authors:
-# * Antonio Pullini <pullinia@iis.ee.ethz.ch>
-# * Conrad Burchert <bconrad@ethz.ch>
-# * Maheshwara Sharma <msharma@student.ethz.ch>
-# * Andreas Kurth <akurth@iis.ee.ethz.ch>
-# * Johannes Weinbuch <jweinbuch@student.ethz.ch>
-# * Pirmin Vogel <vogelpi@iis.ee.ethz.ch>
-# */
-#
-# //`include "pulp_soc_defines.sv"
-#
-# ////import CfMath::log2;
-#
-# module axi_rab_top
-#
-#  // Parameters {{{
-#  #(
-#    parameter N_PORTS             =  2,
-#    parameter N_L2_SETS           = 32,
-#    parameter N_L2_SET_ENTRIES    = 32,
-#    parameter AXI_DATA_WIDTH      = 64,
-#    parameter AXI_S_ADDR_WIDTH    = 32,
-#    parameter AXI_M_ADDR_WIDTH    = 40,
-#    parameter AXI_LITE_DATA_WIDTH = 64,
-#    parameter AXI_LITE_ADDR_WIDTH = 32,
-#    parameter AXI_ID_WIDTH        = 10,
-#    parameter AXI_USER_WIDTH      =  6,
-#    parameter MH_FIFO_DEPTH       = 16
-#  )
-#  // }}}
-#
-#  // Ports {{{
-#  (
-#
-#    input logic                                            Clk_CI,  // This clock may be gated.
-#    input logic                                            NonGatedClk_CI,
-#    input logic                                            Rst_RBI,
-#
-#    // For every slave port there are two master ports. The master
-#    // port to use can be set using the master_select flag of the protection
-#    // bits of a slice
-#
-#    // AXI4 Slave {{{
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_awid,
-#    input  logic    [N_PORTS-1:0]   [AXI_S_ADDR_WIDTH-1:0] s_axi4_awaddr,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_awvalid,
-#    output logic    [N_PORTS-1:0]                          s_axi4_awready,
-#    input  logic    [N_PORTS-1:0]                    [7:0] s_axi4_awlen,
-#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_awsize,
-#    input  logic    [N_PORTS-1:0]                    [1:0] s_axi4_awburst,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_awlock,
-#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_awprot,
-#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awcache,
-#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awregion,
-#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awqos,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_awuser,
-#
-#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_wvalid,
-#    output logic    [N_PORTS-1:0]                          s_axi4_wready,
-#    input  logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_wlast,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_wuser,
-#
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_bid,
-#    output logic    [N_PORTS-1:0]                    [1:0] s_axi4_bresp,
-#    output logic    [N_PORTS-1:0]                          s_axi4_bvalid,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_buser,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_bready,
-#
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_arid,
-#    input  logic    [N_PORTS-1:0]   [AXI_S_ADDR_WIDTH-1:0] s_axi4_araddr,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_arvalid,
-#    output logic    [N_PORTS-1:0]                          s_axi4_arready,
-#    input  logic    [N_PORTS-1:0]                    [7:0] s_axi4_arlen,
-#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_arsize,
-#    input  logic    [N_PORTS-1:0]                    [1:0] s_axi4_arburst,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_arlock,
-#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_arprot,
-#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_arcache,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_aruser,
-#
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_rid,
-#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
-#    output logic    [N_PORTS-1:0]                    [1:0] s_axi4_rresp,
-#    output logic    [N_PORTS-1:0]                          s_axi4_rvalid,
-#    input  logic    [N_PORTS-1:0]                          s_axi4_rready,
-#    output logic    [N_PORTS-1:0]                          s_axi4_rlast,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_ruser,
-#    // }}}
-#
-#    // AXI4 Master 0 {{{
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_awid,
-#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m0_axi4_awaddr,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_awvalid,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_awready,
-#    output logic    [N_PORTS-1:0]                    [7:0] m0_axi4_awlen,
-#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_awsize,
-#    output logic    [N_PORTS-1:0]                    [1:0] m0_axi4_awburst,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_awlock,
-#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_awprot,
-#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awcache,
-#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awregion,
-#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awqos,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_awuser,
-#
-#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m0_axi4_wdata,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_wvalid,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_wready,
-#    output logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] m0_axi4_wstrb,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_wlast,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_wuser,
-#
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_bid,
-#    input  logic    [N_PORTS-1:0]                    [1:0] m0_axi4_bresp,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_bvalid,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_buser,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_bready,
-#
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_arid,
-#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m0_axi4_araddr,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_arvalid,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_arready,
-#    output logic    [N_PORTS-1:0]                    [7:0] m0_axi4_arlen,
-#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_arsize,
-#    output logic    [N_PORTS-1:0]                    [1:0] m0_axi4_arburst,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_arlock,
-#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_arprot,
-#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_arcache,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_aruser,
-#
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_rid,
-#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m0_axi4_rdata,
-#    input  logic    [N_PORTS-1:0]                    [1:0] m0_axi4_rresp,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_rvalid,
-#    output logic    [N_PORTS-1:0]                          m0_axi4_rready,
-#    input  logic    [N_PORTS-1:0]                          m0_axi4_rlast,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_ruser,
-#    // }}}
-#
-#    // AXI4 Master 1 {{{
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_awid,
-#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m1_axi4_awaddr,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_awvalid,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_awready,
-#    output logic    [N_PORTS-1:0]                    [7:0] m1_axi4_awlen,
-#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_awsize,
-#    output logic    [N_PORTS-1:0]                    [1:0] m1_axi4_awburst,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_awlock,
-#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_awprot,
-#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awcache,
-#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awregion,
-#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awqos,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_awuser,
-#
-#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m1_axi4_wdata,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_wvalid,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_wready,
-#    output logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] m1_axi4_wstrb,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_wlast,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_wuser,
-#
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_bid,
-#    input  logic    [N_PORTS-1:0]                    [1:0] m1_axi4_bresp,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_bvalid,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_buser,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_bready,
-#
-#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_arid,
-#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m1_axi4_araddr,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_arvalid,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_arready,
-#    output logic    [N_PORTS-1:0]                    [7:0] m1_axi4_arlen,
-#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_arsize,
-#    output logic    [N_PORTS-1:0]                    [1:0] m1_axi4_arburst,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_arlock,
-#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_arprot,
-#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_arcache,
-#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_aruser,
-#
-#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_rid,
-#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m1_axi4_rdata,
-#    input  logic    [N_PORTS-1:0]                    [1:0] m1_axi4_rresp,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_rvalid,
-#    output logic    [N_PORTS-1:0]                          m1_axi4_rready,
-#    input  logic    [N_PORTS-1:0]                          m1_axi4_rlast,
-#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_ruser,
-#    // }}}
-#
-#    // AXI 4 Lite Slave (Configuration Interface) {{{
-#    // AXI4-Lite port to setup the rab slices
-#    // use this to program the configuration registers
-#    input  logic                 [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_awaddr,
-#    input  logic                                           s_axi4lite_awvalid,
-#    output logic                                           s_axi4lite_awready,
-#
-#    input  logic                 [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_wdata,
-#    input  logic                                           s_axi4lite_wvalid,
-#    output logic                                           s_axi4lite_wready,
-#    input  logic               [AXI_LITE_DATA_WIDTH/8-1:0] s_axi4lite_wstrb,
-#
-#    output logic                                     [1:0] s_axi4lite_bresp,
-#    output logic                                           s_axi4lite_bvalid,
-#    input  logic                                           s_axi4lite_bready,
-#
-#    input  logic                 [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_araddr,
-#    input  logic                                           s_axi4lite_arvalid,
-#    output logic                                           s_axi4lite_arready,
-#
-#    output logic                 [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_rdata,
-#    output logic                                     [1:0] s_axi4lite_rresp,
-#    output logic                                           s_axi4lite_rvalid,
-#    input  logic                                           s_axi4lite_rready,
-#    // }}}
-#
-#    // BRAMs {{{
-# //`ifdef RAB_AX_LOG_EN
-# //    BramPort.Slave                                         ArBram_PS,
-# //    BramPort.Slave                                         AwBram_PS,
-# //`endif
-#    // }}}
-#
-#    // Logger Control {{{
-# //`ifdef RAB_AX_LOG_EN
-# //   input  logic                                           LogEn_SI,
-# //   input  logic                                           ArLogClr_SI,
-# //   input  logic                                           AwLogClr_SI,
-#  //  output logic                                           ArLogRdy_SO,
-#  //  output logic                                           AwLogRdy_SO,
-# //`endif
-#    // }}}
-#
-#    // Interrupt Outputs {{{
-#    // Interrupt lines to handle misses, collisions of slices/multiple hits,
-#    // protection faults and overflow of the miss handling fifo
-# //`ifdef RAB_AX_LOG_EN
-# //   output logic                                           int_ar_log_full,
-# //   output logic                                           int_aw_log_full,
-# //`endif
-#    output logic                             [N_PORTS-1:0] int_miss,
-#    output logic                             [N_PORTS-1:0] int_multi,
-#    output logic                             [N_PORTS-1:0] int_prot,
-#    output logic                                           int_mhf_full
-#    // }}}
-#
-#  );
-#
-"""#docstring_begin
-
-  // }}}
-
-  // Signals {{{
-  // ███████╗██╗ ██████╗ ███╗   ██╗ █████╗ ██╗     ███████╗
-  // ██╔════╝██║██╔════╝ ████╗  ██║██╔══██╗██║     ██╔════╝
-  // ███████╗██║██║  ███╗██╔██╗ ██║███████║██║     ███████╗
-  // ╚════██║██║██║   ██║██║╚██╗██║██╔══██║██║     ╚════██║
-  // ███████║██║╚██████╔╝██║ ╚████║██║  ██║███████╗███████║
-  // ╚══════╝╚═╝ ╚═════╝ ╚═╝  ╚═══╝╚═╝  ╚═╝╚══════╝╚══════╝
-  //
-
-  // Internal AXI4 lines, these connect buffers on the slave side to the rab core and
-  // multiplexers which switch between the two master outputs
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_awid;
-  logic [N_PORTS-1:0]  [AXI_S_ADDR_WIDTH-1:0] int_awaddr;
-  logic [N_PORTS-1:0]                         int_awvalid;
-  logic [N_PORTS-1:0]                         int_awready;
-  logic [N_PORTS-1:0]                   [7:0] int_awlen;
-  logic [N_PORTS-1:0]                   [2:0] int_awsize;
-  logic [N_PORTS-1:0]                   [1:0] int_awburst;
-  logic [N_PORTS-1:0]                         int_awlock;
-  logic [N_PORTS-1:0]                   [2:0] int_awprot;
-  logic [N_PORTS-1:0]                   [3:0] int_awcache;
-  logic [N_PORTS-1:0]                   [3:0] int_awregion;
-  logic [N_PORTS-1:0]                   [3:0] int_awqos;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_awuser;
-
-  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_wdata;
-  logic [N_PORTS-1:0]                         int_wvalid;
-  logic [N_PORTS-1:0]                         int_wready;
-  logic [N_PORTS-1:0]  [AXI_DATA_WIDTH/8-1:0] int_wstrb;
-  logic [N_PORTS-1:0]                         int_wlast;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_wuser;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_bid;
-  logic [N_PORTS-1:0]                   [1:0] int_bresp;
-  logic [N_PORTS-1:0]                         int_bvalid;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_buser;
-  logic [N_PORTS-1:0]                         int_bready;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_arid;
-  logic [N_PORTS-1:0]  [AXI_S_ADDR_WIDTH-1:0] int_araddr;
-  logic [N_PORTS-1:0]                         int_arvalid;
-  logic [N_PORTS-1:0]                         int_arready;
-  logic [N_PORTS-1:0]                   [7:0] int_arlen;
-  logic [N_PORTS-1:0]                   [2:0] int_arsize;
-  logic [N_PORTS-1:0]                   [1:0] int_arburst;
-  logic [N_PORTS-1:0]                         int_arlock;
-  logic [N_PORTS-1:0]                   [2:0] int_arprot;
-  logic [N_PORTS-1:0]                   [3:0] int_arcache;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_aruser;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_rid;
-  logic [N_PORTS-1:0]                   [1:0] int_rresp;
-  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_rdata;
-  logic [N_PORTS-1:0]                         int_rlast;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_ruser;
-  logic [N_PORTS-1:0]                         int_rvalid;
-  logic [N_PORTS-1:0]                         int_rready;
-
-  // rab_core outputs
-  logic [N_PORTS-1:0]  [AXI_M_ADDR_WIDTH-1:0] int_wtrans_addr;
-  logic [N_PORTS-1:0]                         int_wtrans_accept;
-  logic [N_PORTS-1:0]                         int_wtrans_drop;
-  logic [N_PORTS-1:0]                         int_wtrans_miss;
-  logic [N_PORTS-1:0]                         int_wtrans_sent;
-  logic [N_PORTS-1:0]                         int_wtrans_cache_coherent;
-  logic [N_PORTS-1:0]                         int_wmaster_select;
-
-  logic [N_PORTS-1:0]  [AXI_M_ADDR_WIDTH-1:0] int_rtrans_addr;
-  logic [N_PORTS-1:0]                         int_rtrans_accept;
-  logic [N_PORTS-1:0]                         int_rtrans_drop;
-  logic [N_PORTS-1:0]                         int_rtrans_miss;
-  logic [N_PORTS-1:0]                         int_rtrans_sent;
-  logic [N_PORTS-1:0]                         int_rtrans_cache_coherent;
-  logic [N_PORTS-1:0]                         int_rmaster_select;
-
-  logic [N_PORTS-1:0]                         w_master_select;
-
-  // Internal master0 AXI4 lines. These connect the first master port to the
-  // multiplexers
-  // For channels read address, write address and write data the other lines
-  // are ignored if valid is not set, therefore we only need to multiplex those
-  logic [N_PORTS-1:0]                         int_m0_awvalid;
-  logic [N_PORTS-1:0]                         int_m0_awready;
-
-  logic [N_PORTS-1:0]                         int_m0_wvalid;
-  logic [N_PORTS-1:0]                         int_m0_wready;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m0_bid;
-  logic [N_PORTS-1:0]                   [1:0] int_m0_bresp;
-  logic [N_PORTS-1:0]                         int_m0_bvalid;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m0_buser;
-  logic [N_PORTS-1:0]                         int_m0_bready;
-
-  logic [N_PORTS-1:0]                         int_m0_arvalid;
-  logic [N_PORTS-1:0]                         int_m0_arready;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m0_rid;
-  logic [N_PORTS-1:0]                   [1:0] int_m0_rresp;
-  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_m0_rdata;
-  logic [N_PORTS-1:0]                         int_m0_rlast;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m0_ruser;
-  logic [N_PORTS-1:0]                         int_m0_rready;
-  logic [N_PORTS-1:0]                         int_m0_rvalid;
-
-  logic [N_PORTS-1:0]                         l1_m0_ar_accept;
-  logic [N_PORTS-1:0]                         l1_m0_ar_drop;
-  logic [N_PORTS-1:0]                         l1_m0_ar_save;
-  logic [N_PORTS-1:0]                         l1_m0_ar_done;
-  logic [N_PORTS-1:0]                         l2_m0_ar_accept;
-  logic [N_PORTS-1:0]                         l2_m0_ar_drop;
-  logic [N_PORTS-1:0]                         l2_m0_ar_done;
-  logic [N_PORTS-1:0]                         l2_m0_ar_sending;
-
-  logic [N_PORTS-1:0]                         l1_m0_aw_accept;
-  logic [N_PORTS-1:0]                         l1_m0_aw_drop;
-  logic [N_PORTS-1:0]                         l1_m0_aw_save;
-  logic [N_PORTS-1:0]                         l1_m0_aw_done;
-  logic [N_PORTS-1:0]                         l2_m0_aw_accept;
-  logic [N_PORTS-1:0]                         l2_m0_aw_drop;
-  logic [N_PORTS-1:0]                         l2_m0_aw_done;
-  logic [N_PORTS-1:0]                         l2_m0_aw_sending;
-
-  // Internal master1 AXI4 lines. These connect the second master port to the
-  // multiplexers
-  // For channels read address, write address and write data the other lines
-  // are ignored if valid is not set, therefore we only need to multiplex those
-  logic [N_PORTS-1:0]                         int_m1_awvalid;
-  logic [N_PORTS-1:0]                         int_m1_awready;
-
-  logic [N_PORTS-1:0]                         int_m1_wvalid;
-  logic [N_PORTS-1:0]                         int_m1_wready;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m1_bid;
-  logic [N_PORTS-1:0]                   [1:0] int_m1_bresp;
-  logic [N_PORTS-1:0]                         int_m1_bvalid;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m1_buser;
-  logic [N_PORTS-1:0]                         int_m1_bready;
-
-  logic [N_PORTS-1:0]                         int_m1_arvalid;
-  logic [N_PORTS-1:0]                         int_m1_arready;
-
-  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m1_rid;
-  logic [N_PORTS-1:0]                   [1:0] int_m1_rresp;
-  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_m1_rdata;
-  logic [N_PORTS-1:0]                         int_m1_rlast;
-  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m1_ruser;
-  logic [N_PORTS-1:0]                         int_m1_rvalid;
-  logic [N_PORTS-1:0]                         int_m1_rready;
-
-  logic [N_PORTS-1:0]                         l1_m1_ar_accept;
-  logic [N_PORTS-1:0]                         l1_m1_ar_drop;
-  logic [N_PORTS-1:0]                         l1_m1_ar_save;
-  logic [N_PORTS-1:0]                         l1_m1_ar_done;
-  logic [N_PORTS-1:0]                         l2_m1_ar_accept;
-  logic [N_PORTS-1:0]                         l2_m1_ar_drop;
-  logic [N_PORTS-1:0]                         l2_m1_ar_done;
-
-  logic [N_PORTS-1:0]                         l1_m1_aw_accept;
-  logic [N_PORTS-1:0]                         l1_m1_aw_drop;
-  logic [N_PORTS-1:0]                         l1_m1_aw_save;
-  logic [N_PORTS-1:0]                         l1_m1_aw_done;
-  logic [N_PORTS-1:0]                         l2_m1_aw_accept;
-  logic [N_PORTS-1:0]                         l2_m1_aw_drop;
-  logic [N_PORTS-1:0]                         l2_m1_aw_done;
-
-  // L1 outputs
-  logic [N_PORTS-1:0]                         rab_miss; // L1 RAB miss
-  logic [N_PORTS-1:0]                         rab_prot;
-  logic [N_PORTS-1:0]                         rab_multi;
-  logic [N_PORTS-1:0]                         rab_prefetch;
-
-  //
-  // Signals used to support L2 TLB
-  //
-  // L2 RAM configuration signals
-  logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] L2CfgWData_D;
-  logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] L2CfgWAddr_D;
-  logic [N_PORTS-1:0]                           L2CfgWE_S;
-
-  // L1 output and drop Buffer
-  logic [N_PORTS-1:0]                           L1OutRwType_D, L1DropRwType_DP;
-  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L1OutUser_D, L1DropUser_DP;
-  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L1OutId_D, L1DropId_DP;
-  logic [N_PORTS-1:0]                     [7:0] L1OutLen_D, L1DropLen_DP;
-  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L1OutAddr_D, L1DropAddr_DP;
-  logic [N_PORTS-1:0]                           L1OutProt_D, L1DropProt_DP;
-  logic [N_PORTS-1:0]                           L1OutMulti_D, L1DropMulti_DP;
-  logic [N_PORTS-1:0]                           L1DropEn_S;
-  logic [N_PORTS-1:0]                           L1DropPrefetch_S;
-
-  logic [N_PORTS-1:0]                           L1DropValid_SN, L1DropValid_SP;
-
-  // L2 input Buffer
-  logic [N_PORTS-1:0]                           L2InRwType_DP;
-  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L2InUser_DP;
-  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L2InId_DP;
-  logic [N_PORTS-1:0]                     [7:0] L2InLen_DP;
-  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L2InAddr_DP;
-  logic [N_PORTS-1:0]                           L2InEn_S;
-
-  // L2 output Buffer
-  logic [N_PORTS-1:0]                           L2OutRwType_DP;
-  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L2OutUser_DP;
-  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L2OutId_DP;
-  logic [N_PORTS-1:0]                     [7:0] L2OutLen_DP;
-  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L2OutInAddr_DP;
-
-  logic [N_PORTS-1:0]                           L2OutHit_SN, L2OutHit_SP;
-  logic [N_PORTS-1:0]                           L2OutMiss_SN, L2OutMiss_SP;
-  logic [N_PORTS-1:0]                           L2OutProt_SN, L2OutProt_SP;
-  logic [N_PORTS-1:0]                           L2OutMulti_SN, L2OutMulti_SP;
-  logic [N_PORTS-1:0]                           L2OutCC_SN, L2OutCC_SP;
-  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] L2OutAddr_DN, L2OutAddr_DP;
-
-  logic [N_PORTS-1:0]                           L2OutValid_SN, L2OutValid_SP;
-  logic [N_PORTS-1:0]                           L2OutPrefetch_S;
-  logic [N_PORTS-1:0]                           L2OutReady_S;
-  logic [N_PORTS-1:0]                           L2OutEn_S;
-
-   // L2 outputs
-  logic [N_PORTS-1:0]                           L2Busy_S;
-  logic [N_PORTS-1:0]                           L2OutValid_S;
-
-  logic [N_PORTS-1:0]                           L2Miss_S;
-
-  // Signals for interfacing the AXI modules
-  logic [N_PORTS-1:0]                           l1_ar_accept;
-  logic [N_PORTS-1:0]                           l1_aw_accept;
-  logic [N_PORTS-1:0]                           l1_w_accept;
-  logic [N_PORTS-1:0]                           l1_xw_accept;
-
-  logic [N_PORTS-1:0]                           l1_ar_drop;
-  logic [N_PORTS-1:0]                           l1_aw_drop;
-  logic [N_PORTS-1:0]                           l1_w_drop;
-  logic [N_PORTS-1:0]                           l1_xw_drop;
-
-  logic [N_PORTS-1:0]                           l1_ar_save;
-  logic [N_PORTS-1:0]                           l1_aw_save;
-  logic [N_PORTS-1:0]                           l1_w_save;
-  logic [N_PORTS-1:0]                           l1_xw_save;
-
-  logic [N_PORTS-1:0]                           l1_ar_done;
-  logic [N_PORTS-1:0]                           l1_r_done;
-  logic [N_PORTS-1:0]                           l1_r_drop;
-  logic [N_PORTS-1:0]                           lx_r_drop;
-  logic [N_PORTS-1:0]                           lx_r_done;
-
-  logic [N_PORTS-1:0]                           l1_aw_done;
-  logic [N_PORTS-1:0]                           l1_w_done;
-  logic [N_PORTS-1:0]                           l1_xw_done;
-  logic [N_PORTS-1:0]                           l1_aw_done_SP;
-  logic [N_PORTS-1:0]                           l1_w_done_SP;
-
-  logic [N_PORTS-1:0]                           l2_ar_accept;
-  logic [N_PORTS-1:0]                           l2_aw_accept;
-  logic [N_PORTS-1:0]                           l2_w_accept;
-  logic [N_PORTS-1:0]                           l2_xw_accept;
-
-  logic [N_PORTS-1:0]                           l2_ar_drop;
-  logic [N_PORTS-1:0]                           l2_r_drop;
-  logic [N_PORTS-1:0]                           l2_xr_drop;
-  logic [N_PORTS-1:0]                           l2_aw_drop;
-  logic [N_PORTS-1:0]                           l2_w_drop;
-  logic [N_PORTS-1:0]                           l2_xw_drop;
-
-  logic [N_PORTS-1:0]                           l2_aw_done;
-  logic [N_PORTS-1:0]                           l2_w_done;
-  logic [N_PORTS-1:0]                           l2_xw_done;
-  logic [N_PORTS-1:0]                           l2_aw_done_SP;
-  logic [N_PORTS-1:0]                           l2_w_done_SP;
-
-  logic [N_PORTS-1:0]                           l2_ar_done;
-  logic [N_PORTS-1:0]                           l2_r_done;
-  logic [N_PORTS-1:0]                           l2_xr_done;
-  logic [N_PORTS-1:0]                           l2_ar_done_SP;
-  logic [N_PORTS-1:0]                           l2_r_done_SP;
-
-  logic [N_PORTS-1:0]                           l1_mx_aw_done;
-  logic [N_PORTS-1:0]                           l1_mx_ar_done;
-  logic [N_PORTS-1:0]                           l1_m0_aw_done_SP;
-  logic [N_PORTS-1:0]                           l1_m0_ar_done_SP;
-  logic [N_PORTS-1:0]                           l1_m1_aw_done_SP;
-  logic [N_PORTS-1:0]                           l1_m1_ar_done_SP;
-
-  logic [N_PORTS-1:0]                           l2_mx_aw_done;
-  logic [N_PORTS-1:0]                           l2_mx_ar_done;
-  logic [N_PORTS-1:0]                           l2_m0_aw_done_SP;
-  logic [N_PORTS-1:0]                           l2_m0_ar_done_SP;
-  logic [N_PORTS-1:0]                           l2_m1_aw_done_SP;
-  logic [N_PORTS-1:0]                           l2_m1_ar_done_SP;
-
-  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] l1_id_drop, lx_id_drop, b_id_drop;
-  logic [N_PORTS-1:0]                     [7:0] l1_len_drop, lx_len_drop;
-  logic [N_PORTS-1:0]                           l1_prefetch_drop, lx_prefetch_drop, b_prefetch_drop;
-  logic [N_PORTS-1:0]                           l1_hit_drop, lx_hit_drop, b_hit_drop;
-
-  logic [N_PORTS-1:0]                           b_drop;
-  logic [N_PORTS-1:0]                           b_done;
-
-  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] l2_aw_addr;
-  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] l2_ar_addr;
-
-  logic [N_PORTS-1:0]                           l2_cache_coherent;
-  logic [N_PORTS-1:0]                           l2_master_select;
-
-  logic [N_PORTS-1:0]                           aw_in_stall;
-  logic [N_PORTS-1:0]                           aw_out_stall;
-
-  genvar                                        i;
-
-  // RRESP FSM
-  typedef enum logic                    {IDLE, BUSY} r_resp_mux_ctrl_state_t;
-  r_resp_mux_ctrl_state_t [N_PORTS-1:0] RRespMuxCtrl_SN, RRespMuxCtrl_SP;
-  logic                   [N_PORTS-1:0] RRespSel_SN, RRespSel_SP;
-  logic                   [N_PORTS-1:0] RRespBurst_S;
-  logic                   [N_PORTS-1:0] RRespSelIm_S;
-
-  // }}}
-
-  // Local parameters {{{
-
-  // Enable L2 for select ports
-  localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
-
-  // L2TLB parameters
-  localparam integer HUM_BUFFER_DEPTH = (N_L2_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS)+13;
-
-  // }}}
-
-  // Derive `master_select` from cache coherency flag. {{{
-  `ifdef EN_ACP
-    assign int_wmaster_select = int_wtrans_cache_coherent;
-    assign int_rmaster_select = int_rtrans_cache_coherent;
-    assign l2_master_select   = l2_cache_coherent;
-  `else
-    assign int_wmaster_select = '0;
-    assign int_rmaster_select = '0;
-    assign l2_master_select   = '0;
-  `endif
-  // }}}
-
-  // Buf and Send {{{
-  // ██████╗ ██╗   ██╗███████╗       ██╗       ███████╗███████╗███╗   ██╗██████╗
-  // ██╔══██╗██║   ██║██╔════╝       ██║       ██╔════╝██╔════╝████╗  ██║██╔══██╗
-  // ██████╔╝██║   ██║█████╗      ████████╗    ███████╗█████╗  ██╔██╗ ██║██║  ██║
-  // ██╔══██╗██║   ██║██╔══╝      ██╔═██╔═╝    ╚════██║██╔══╝  ██║╚██╗██║██║  ██║
-  // ██████╔╝╚██████╔╝██║         ██████║      ███████║███████╗██║ ╚████║██████╔╝
-  // ╚═════╝  ╚═════╝ ╚═╝         ╚═════╝      ╚══════╝╚══════╝╚═╝  ╚═══╝╚═════╝
-  //
-  logic[N_PORTS-1:0] m0_write_is_burst, m0_read_is_burst;
-  logic[N_PORTS-1:0] m1_write_is_burst, m1_read_is_burst;
-
-  generate for (i = 0; i < N_PORTS; i++) begin : BUF_AND_SEND
-
-  // Write Address channel (aw) {{{
-  /*
-   * write address channel (aw)
-   *
-   * ██╗    ██╗██████╗ ██╗████████╗███████╗     █████╗ ██████╗ ██████╗ ██████╗
-   * ██║    ██║██╔══██╗██║╚══██╔══╝██╔════╝    ██╔══██╗██╔══██╗██╔══██╗██╔══██╗
-   * ██║ █╗ ██║██████╔╝██║   ██║   █████╗      ███████║██║  ██║██║  ██║██████╔╝
-   * ██║███╗██║██╔══██╗██║   ██║   ██╔══╝      ██╔══██║██║  ██║██║  ██║██╔══██╗
-   * ╚███╔███╔╝██║  ██║██║   ██║   ███████╗    ██║  ██║██████╔╝██████╔╝██║  ██║
-   *  ╚══╝╚══╝ ╚═╝  ╚═╝╚═╝   ╚═╝   ╚══════╝    ╚═╝  ╚═╝╚═════╝ ╚═════╝ ╚═╝  ╚═╝
-   *
-   */
-
-  axi4_aw_buffer
-    #(
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_aw_buffer
-    (
-      .axi4_aclk       ( Clk_CI             ),
-      .axi4_arstn      ( Rst_RBI            ),
-      .s_axi4_awid     ( s_axi4_awid[i]     ),
-      .s_axi4_awaddr   ( s_axi4_awaddr[i]   ),
-      .s_axi4_awvalid  ( s_axi4_awvalid[i]  ),
-      .s_axi4_awready  ( s_axi4_awready[i]  ),
-      .s_axi4_awlen    ( s_axi4_awlen[i]    ),
-      .s_axi4_awsize   ( s_axi4_awsize[i]   ),
-      .s_axi4_awburst  ( s_axi4_awburst[i]  ),
-      .s_axi4_awlock   ( s_axi4_awlock[i]   ),
-      .s_axi4_awprot   ( s_axi4_awprot[i]   ),
-      .s_axi4_awcache  ( s_axi4_awcache[i]  ),
-      .s_axi4_awregion ( s_axi4_awregion[i] ),
-      .s_axi4_awqos    ( s_axi4_awqos[i]    ),
-      .s_axi4_awuser   ( s_axi4_awuser[i]   ),
-      .m_axi4_awid     ( int_awid[i]        ),
-      .m_axi4_awaddr   ( int_awaddr[i]      ),
-      .m_axi4_awvalid  ( int_awvalid[i]     ),
-      .m_axi4_awready  ( int_awready[i]     ),
-      .m_axi4_awlen    ( int_awlen[i]       ),
-      .m_axi4_awsize   ( int_awsize[i]      ),
-      .m_axi4_awburst  ( int_awburst[i]     ),
-      .m_axi4_awlock   ( int_awlock[i]      ),
-      .m_axi4_awprot   ( int_awprot[i]      ),
-      .m_axi4_awcache  ( int_awcache[i]     ),
-      .m_axi4_awregion ( int_awregion[i]    ),
-      .m_axi4_awqos    ( int_awqos[i]       ),
-      .m_axi4_awuser   ( int_awuser[i]      )
-    );
-
-  axi4_aw_sender
-    #(
-      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
-      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
-      )
-    u_aw_sender_m0
-    (
-      .axi4_aclk       ( Clk_CI                ),
-      .axi4_arstn      ( Rst_RBI               ),
-      .l1_done_o       ( l1_m0_aw_done[i]      ),
-      .l1_accept_i     ( l1_m0_aw_accept[i]    ),
-      .l1_drop_i       ( l1_m0_aw_drop[i]      ),
-      .l1_save_i       ( l1_m0_aw_save[i]      ),
-      .l2_done_o       ( l2_m0_aw_done[i]      ),
-      .l2_accept_i     ( l2_m0_aw_accept[i]    ),
-      .l2_drop_i       ( l2_m0_aw_drop[i]      ),
-      .l2_sending_o    ( l2_m0_aw_sending[i]   ),
-      .l1_awaddr_i     ( int_wtrans_addr[i]    ),
-      .l2_awaddr_i     ( l2_aw_addr[i]         ),
-      .s_axi4_awid     ( int_awid[i]           ),
-      .s_axi4_awvalid  ( int_m0_awvalid[i]     ),
-      .s_axi4_awready  ( int_m0_awready[i]     ),
-      .s_axi4_awlen    ( int_awlen[i]          ),
-      .s_axi4_awsize   ( int_awsize[i]         ),
-      .s_axi4_awburst  ( int_awburst[i]        ),
-      .s_axi4_awlock   ( int_awlock[i]         ),
-      .s_axi4_awprot   ( int_awprot[i]         ),
-      .s_axi4_awcache  ( int_awcache[i]        ),
-      .s_axi4_awregion ( int_awregion[i]       ),
-      .s_axi4_awqos    ( int_awqos[i]          ),
-      .s_axi4_awuser   ( int_awuser[i]         ),
-      .m_axi4_awid     ( m0_axi4_awid[i]       ),
-      .m_axi4_awaddr   ( m0_axi4_awaddr[i]     ),
-      .m_axi4_awvalid  ( m0_axi4_awvalid[i]    ),
-      .m_axi4_awready  ( m0_axi4_awready[i]    ),
-      .m_axi4_awlen    ( m0_axi4_awlen[i]      ),
-      .m_axi4_awsize   ( m0_axi4_awsize[i]     ),
-      .m_axi4_awburst  ( m0_axi4_awburst[i]    ),
-      .m_axi4_awlock   ( m0_axi4_awlock[i]     ),
-      .m_axi4_awprot   ( m0_axi4_awprot[i]     ),
-      .m_axi4_awcache  (                       ),
-      .m_axi4_awregion ( m0_axi4_awregion[i]   ),
-      .m_axi4_awqos    ( m0_axi4_awqos[i]      ),
-      .m_axi4_awuser   ( m0_axi4_awuser[i]     )
-    );
-
-  // The AXCACHE signals are set according to burstiness and cache coherence or statically
-  // when not connected to ACP on Zynq (implemented below).
-    assign m0_write_is_burst[i] = (m0_axi4_awlen[i] != {8{1'b0}}) && (m0_axi4_awburst[i] != 2'b00);
-  `ifndef EN_ACP
-    always_comb begin
-      if ( (l2_m0_aw_sending[i] & l2_cache_coherent[i]) | int_wtrans_cache_coherent[i]) begin
-        if (m0_write_is_burst[i]) begin
-          m0_axi4_awcache[i]  = 4'b0111;
-        end else begin
-          m0_axi4_awcache[i]  = 4'b1111;
-        end
-      end else begin
-        m0_axi4_awcache[i]    = 4'b0011;
-      end
-    end
-  `else
-    assign m0_axi4_awcache[i] = 4'b0011;
-  `endif
-
-  axi4_aw_sender
-    #(
-      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
-      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
-      )
-    u_aw_sender_m1
-    (
-      .axi4_aclk       ( Clk_CI                ),
-      .axi4_arstn      ( Rst_RBI               ),
-      .l1_accept_i     ( l1_m1_aw_accept[i]    ),
-      .l1_drop_i       ( l1_m1_aw_drop[i]      ),
-      .l1_save_i       ( l1_m1_aw_save[i]      ),
-      .l1_done_o       ( l1_m1_aw_done[i]      ),
-      .l2_accept_i     ( l2_m1_aw_accept[i]    ),
-      .l2_drop_i       ( l2_m1_aw_drop[i]      ),
-      .l2_done_o       ( l2_m1_aw_done[i]      ),
-      .l2_sending_o    (                       ), // just helps to set axcache
-      .l1_awaddr_i     ( int_wtrans_addr[i]    ),
-      .l2_awaddr_i     ( l2_aw_addr[i]         ),
-      .s_axi4_awid     ( int_awid[i]           ),
-      .s_axi4_awvalid  ( int_m1_awvalid[i]     ),
-      .s_axi4_awready  ( int_m1_awready[i]     ),
-      .s_axi4_awlen    ( int_awlen[i]          ),
-      .s_axi4_awsize   ( int_awsize[i]         ),
-      .s_axi4_awburst  ( int_awburst[i]        ),
-      .s_axi4_awlock   ( int_awlock[i]         ),
-      .s_axi4_awprot   ( int_awprot[i]         ),
-      .s_axi4_awcache  ( int_awcache[i]        ),
-      .s_axi4_awregion ( int_awregion[i]       ),
-      .s_axi4_awqos    ( int_awqos[i]          ),
-      .s_axi4_awuser   ( int_awuser[i]         ),
-      .m_axi4_awid     ( m1_axi4_awid[i]       ),
-      .m_axi4_awaddr   ( m1_axi4_awaddr[i]     ),
-      .m_axi4_awvalid  ( m1_axi4_awvalid[i]    ),
-      .m_axi4_awready  ( m1_axi4_awready[i]    ),
-      .m_axi4_awlen    ( m1_axi4_awlen[i]      ),
-      .m_axi4_awsize   ( m1_axi4_awsize[i]     ),
-      .m_axi4_awburst  ( m1_axi4_awburst[i]    ),
-      .m_axi4_awlock   ( m1_axi4_awlock[i]     ),
-      .m_axi4_awprot   ( m1_axi4_awprot[i]     ),
-      .m_axi4_awcache  (                       ),
-      .m_axi4_awregion ( m1_axi4_awregion[i]   ),
-      .m_axi4_awqos    ( m1_axi4_awqos[i]      ),
-      .m_axi4_awuser   ( m1_axi4_awuser[i]     )
-    );
-
-    // The AXCACHE signals are set according to burstiness and cache coherence or statically
-    // when not connected to ACP on Zynq (implemented below).
-      assign m1_write_is_burst[i] = (m1_axi4_awlen[i] != {8{1'b0}}) && (m1_axi4_awburst[i] != 2'b00);
-    `ifdef EN_ACP
-      always_comb begin
-        if (m1_write_is_burst[i]) begin
-          m1_axi4_awcache[i]    = 4'b1011;
-        end else begin
-          m1_axi4_awcache[i]    = 4'b1111;
-        end
-      end
-    `else
-      assign m1_axi4_awcache[i] = 4'b0011;
-    `endif
-
-  // }}}
-
-  // Write Data channel (w) {{{
-  /*
-   * write data channel (w)
-   *
-   * ██╗    ██╗██████╗ ██╗████████╗███████╗    ██████╗  █████╗ ████████╗ █████╗
-   * ██║    ██║██╔══██╗██║╚══██╔══╝██╔════╝    ██╔══██╗██╔══██╗╚══██╔══╝██╔══██╗
-   * ██║ █╗ ██║██████╔╝██║   ██║   █████╗      ██║  ██║███████║   ██║   ███████║
-   * ██║███╗██║██╔══██╗██║   ██║   ██╔══╝      ██║  ██║██╔══██║   ██║   ██╔══██║
-   * ╚███╔███╔╝██║  ██║██║   ██║   ███████╗    ██████╔╝██║  ██║   ██║   ██║  ██║
-   *  ╚══╝╚══╝ ╚═╝  ╚═╝╚═╝   ╚═╝   ╚══════╝    ╚═════╝ ╚═╝  ╚═╝   ╚═╝   ╚═╝  ╚═╝
-   *
-   */
-  axi4_w_buffer
-    #(
-      .AXI_DATA_WIDTH   ( AXI_DATA_WIDTH   ),
-      .AXI_ID_WIDTH     ( AXI_ID_WIDTH     ),
-      .AXI_USER_WIDTH   ( AXI_USER_WIDTH   ),
-      .ENABLE_L2TLB     ( ENABLE_L2TLB[i]  ),
-      .HUM_BUFFER_DEPTH ( HUM_BUFFER_DEPTH )
-      )
-    u_w_buffer
-    (
-      .axi4_aclk       ( Clk_CI                ),
-      .axi4_arstn      ( Rst_RBI               ),
-
-      // L1 interface
-      .l1_done_o       ( l1_w_done[i]          ),
-      .l1_accept_i     ( l1_w_accept[i]        ),
-      .l1_save_i       ( l1_w_save[i]          ),
-      .l1_drop_i       ( l1_w_drop[i]          ),
-      .l1_master_i     ( int_wmaster_select[i] ),
-      .l1_id_i         ( l1_id_drop[i]         ),
-      .l1_len_i        ( l1_len_drop[i]        ),
-      .l1_prefetch_i   ( l1_prefetch_drop[i]   ),
-      .l1_hit_i        ( l1_hit_drop[i]        ),
-
-      // L2 interface
-      .l2_done_o       ( l2_w_done[i]          ),
-      .l2_accept_i     ( l2_w_accept[i]        ),
-      .l2_drop_i       ( l2_w_drop[i]          ),
-      .l2_master_i     ( l2_master_select[i]   ),
-      .l2_id_i         ( lx_id_drop[i]         ),
-      .l2_len_i        ( lx_len_drop[i]        ),
-      .l2_prefetch_i   ( lx_prefetch_drop[i]   ),
-      .l2_hit_i        ( lx_hit_drop[i]        ),
-
-      // Top-level control outputs
-      .master_select_o ( w_master_select[i]    ),
-      .input_stall_o   ( aw_in_stall[i]        ), // stall L1 AW input if request buffers full
-      .output_stall_o  ( aw_out_stall[i]       ), // stall L1 AW hit forwarding if bypass not possible
-
-      // B sender interface
-      .b_drop_o        ( b_drop[i]             ),
-      .b_done_i        ( b_done[i]             ),
-      .id_o            ( b_id_drop[i]          ),
-      .prefetch_o      ( b_prefetch_drop[i]    ),
-      .hit_o           ( b_hit_drop[i]         ),
-
-      // AXI W channel interfaces
-      .s_axi4_wdata    ( s_axi4_wdata[i]       ),
-      .s_axi4_wvalid   ( s_axi4_wvalid[i]      ),
-      .s_axi4_wready   ( s_axi4_wready[i]      ),
-      .s_axi4_wstrb    ( s_axi4_wstrb[i]       ),
-      .s_axi4_wlast    ( s_axi4_wlast[i]       ),
-      .s_axi4_wuser    ( s_axi4_wuser[i]       ),
-      .m_axi4_wdata    ( int_wdata[i]          ),
-      .m_axi4_wvalid   ( int_wvalid[i]         ),
-      .m_axi4_wready   ( int_wready[i]         ),
-      .m_axi4_wstrb    ( int_wstrb[i]          ),
-      .m_axi4_wlast    ( int_wlast[i]          ),
-      .m_axi4_wuser    ( int_wuser[i]          )
-    );
-
-  axi4_w_sender
-    #(
-      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_w_sender_m0
-    (
-      .axi4_aclk       ( Clk_CI            ),
-      .axi4_arstn      ( Rst_RBI           ),
-      .s_axi4_wdata    ( int_wdata[i]      ),
-      .s_axi4_wvalid   ( int_m0_wvalid[i]  ),
-      .s_axi4_wready   ( int_m0_wready[i]  ),
-      .s_axi4_wstrb    ( int_wstrb[i]      ),
-      .s_axi4_wlast    ( int_wlast[i]      ),
-      .s_axi4_wuser    ( int_wuser[i]      ),
-      .m_axi4_wdata    ( m0_axi4_wdata[i]  ),
-      .m_axi4_wvalid   ( m0_axi4_wvalid[i] ),
-      .m_axi4_wready   ( m0_axi4_wready[i] ),
-      .m_axi4_wstrb    ( m0_axi4_wstrb[i]  ),
-      .m_axi4_wlast    ( m0_axi4_wlast[i]  ),
-      .m_axi4_wuser    ( m0_axi4_wuser[i]  )
-    );
-
-  axi4_w_sender
-    #(
-      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-
-      )
-    u_w_sender_m1
-    (
-      .axi4_aclk       ( Clk_CI            ),
-      .axi4_arstn      ( Rst_RBI           ),
-      .s_axi4_wdata    ( int_wdata[i]      ),
-      .s_axi4_wvalid   ( int_m1_wvalid[i]  ),
-      .s_axi4_wready   ( int_m1_wready[i]  ),
-      .s_axi4_wstrb    ( int_wstrb[i]      ),
-      .s_axi4_wlast    ( int_wlast[i]      ),
-      .s_axi4_wuser    ( int_wuser[i]      ),
-      .m_axi4_wdata    ( m1_axi4_wdata[i]  ),
-      .m_axi4_wvalid   ( m1_axi4_wvalid[i] ),
-      .m_axi4_wready   ( m1_axi4_wready[i] ),
-      .m_axi4_wstrb    ( m1_axi4_wstrb[i]  ),
-      .m_axi4_wlast    ( m1_axi4_wlast[i]  ),
-      .m_axi4_wuser    ( m1_axi4_wuser[i]  )
-    );
-
-  /*
-   * Multiplexer to switch between the two output master ports on the write data (w) channel
-   */
-  always_comb begin
-    /* Only one output can be selected at any time */
-    if (w_master_select[i] == 1'b0) begin
-      int_m0_wvalid[i] = int_wvalid[i];
-      int_m1_wvalid[i] = 1'b0;
-      int_wready[i]    = int_m0_wready[i];
-    end else begin
-      int_m0_wvalid[i] = 1'b0;
-      int_m1_wvalid[i] = int_wvalid[i];
-      int_wready[i]    = int_m1_wready[i];
-    end
-  end
-
-  // }}}
-
-  // Write Response channel (b) {{{
-  /*
-   * write response channel (b)
-   *
-   * ██╗    ██╗██████╗ ██╗████████╗███████╗    ██████╗ ███████╗███████╗██████╗
-   * ██║    ██║██╔══██╗██║╚══██╔══╝██╔════╝    ██╔══██╗██╔════╝██╔════╝██╔══██╗
-   * ██║ █╗ ██║██████╔╝██║   ██║   █████╗      ██████╔╝█████╗  ███████╗██████╔╝
-   * ██║███╗██║██╔══██╗██║   ██║   ██╔══╝      ██╔══██╗██╔══╝  ╚════██║██╔═══╝
-   * ╚███╔███╔╝██║  ██║██║   ██║   ███████╗    ██║  ██║███████╗███████║██║
-   *  ╚══╝╚══╝ ╚═╝  ╚═╝╚═╝   ╚═╝   ╚══════╝    ╚═╝  ╚═╝╚══════╝╚══════╝╚═╝
-   *
-   */
-  axi4_b_buffer
-    #(
-        .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-        .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_b_buffer_m0
-    (
-      .axi4_aclk     ( Clk_CI            ),
-      .axi4_arstn    ( Rst_RBI           ),
-      .s_axi4_bid    ( int_m0_bid[i]     ),
-      .s_axi4_bresp  ( int_m0_bresp[i]   ),
-      .s_axi4_bvalid ( int_m0_bvalid[i]  ),
-      .s_axi4_buser  ( int_m0_buser[i]   ),
-      .s_axi4_bready ( int_m0_bready[i]  ),
-      .m_axi4_bid    ( m0_axi4_bid[i]    ),
-      .m_axi4_bresp  ( m0_axi4_bresp[i]  ),
-      .m_axi4_bvalid ( m0_axi4_bvalid[i] ),
-      .m_axi4_buser  ( m0_axi4_buser[i]  ),
-      .m_axi4_bready ( m0_axi4_bready[i] )
-    );
-
-  axi4_b_buffer
-    #(
-        .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-        .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_b_buffer_m1
-    (
-      .axi4_aclk      ( Clk_CI            ),
-      .axi4_arstn     ( Rst_RBI           ),
-      .s_axi4_bid     ( int_m1_bid[i]     ),
-      .s_axi4_bresp   ( int_m1_bresp[i]   ),
-      .s_axi4_bvalid  ( int_m1_bvalid[i]  ),
-      .s_axi4_buser   ( int_m1_buser[i]   ),
-      .s_axi4_bready  ( int_m1_bready[i]  ),
-      .m_axi4_bid     ( m1_axi4_bid[i]    ),
-      .m_axi4_bresp   ( m1_axi4_bresp[i]  ),
-      .m_axi4_bvalid  ( m1_axi4_bvalid[i] ),
-      .m_axi4_buser   ( m1_axi4_buser[i]  ),
-      .m_axi4_bready  ( m1_axi4_bready[i] )
-    );
-
-  axi4_b_sender
-    #(
-        .AXI_ID_WIDTH   ( AXI_ID_WIDTH    ),
-        .AXI_USER_WIDTH ( AXI_USER_WIDTH  )
-      )
-    u_b_sender
-    (
-      .axi4_aclk      ( Clk_CI             ),
-      .axi4_arstn     ( Rst_RBI            ),
-      .drop_i         ( b_drop[i]          ),
-      .done_o         ( b_done[i]          ),
-      .id_i           ( b_id_drop[i]       ),
-      .prefetch_i     ( b_prefetch_drop[i] ),
-      .hit_i          ( b_hit_drop[i]      ),
-      .s_axi4_bid     ( s_axi4_bid[i]      ),
-      .s_axi4_bresp   ( s_axi4_bresp[i]    ),
-      .s_axi4_bvalid  ( s_axi4_bvalid[i]   ),
-      .s_axi4_buser   ( s_axi4_buser[i]    ),
-      .s_axi4_bready  ( s_axi4_bready[i]   ),
-      .m_axi4_bid     ( int_bid[i]         ),
-      .m_axi4_bresp   ( int_bresp[i]       ),
-      .m_axi4_bvalid  ( int_bvalid[i]      ),
-      .m_axi4_buser   ( int_buser[i]       ),
-      .m_axi4_bready  ( int_bready[i]      )
-    );
-
-  /*
-   * Multiplexer to switch between the two output master ports on the write response (b) channel
-   */
-  always_comb begin
-     /* Output 1 always gets priority, so if it has something to send connect
-      it and let output 0 wait using rready = 0 */
-    if (int_m1_bvalid[i] == 1'b1) begin
-      int_m0_bready[i] = 1'b0;
-      int_m1_bready[i] = int_bready[i];
-
-      int_bid[i]       = int_m1_bid[i];
-      int_bresp[i]     = int_m1_bresp[i];
-      int_buser[i]     = int_m1_buser[i];
-      int_bvalid[i]    = int_m1_bvalid[i];
-    end else begin
-      int_m0_bready[i] = int_bready[i];
-      int_m1_bready[i] = 1'b0;
-
-      int_bid[i]       = int_m0_bid[i];
-      int_bresp[i]     = int_m0_bresp[i];
-      int_buser[i]     = int_m0_buser[i];
-      int_bvalid[i]    = int_m0_bvalid[i];
-    end
-  end
-
-  // }}}
-
-  // Read Address channel (ar) {{{
-  /*
-   * read address channel (ar)
-   *
-   * ██████╗ ███████╗ █████╗ ██████╗      █████╗ ██████╗ ██████╗ ██████╗
-   * ██╔══██╗██╔════╝██╔══██╗██╔══██╗    ██╔══██╗██╔══██╗██╔══██╗██╔══██╗
-   * ██████╔╝█████╗  ███████║██║  ██║    ███████║██║  ██║██║  ██║██████╔╝
-   * ██╔══██╗██╔══╝  ██╔══██║██║  ██║    ██╔══██║██║  ██║██║  ██║██╔══██╗
-   * ██║  ██║███████╗██║  ██║██████╔╝    ██║  ██║██████╔╝██████╔╝██║  ██║
-   * ╚═╝  ╚═╝╚══════╝╚═╝  ╚═╝╚═════╝     ╚═╝  ╚═╝╚═════╝ ╚═════╝ ╚═╝  ╚═╝
-   *
-   */
-  axi4_ar_buffer
-    #(
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_ar_buffer
-    (
-      .axi4_aclk      ( Clk_CI            ),
-      .axi4_arstn     ( Rst_RBI           ),
-      .s_axi4_arid    ( s_axi4_arid[i]    ),
-      .s_axi4_araddr  ( s_axi4_araddr[i]  ),
-      .s_axi4_arvalid ( s_axi4_arvalid[i] ),
-      .s_axi4_arready ( s_axi4_arready[i] ),
-      .s_axi4_arlen   ( s_axi4_arlen[i]   ),
-      .s_axi4_arsize  ( s_axi4_arsize[i]  ),
-      .s_axi4_arburst ( s_axi4_arburst[i] ),
-      .s_axi4_arlock  ( s_axi4_arlock[i]  ),
-      .s_axi4_arprot  ( s_axi4_arprot[i]  ),
-      .s_axi4_arcache ( s_axi4_arcache[i] ),
-      .s_axi4_aruser  ( s_axi4_aruser[i]  ),
-      .m_axi4_arid    ( int_arid[i]       ),
-      .m_axi4_araddr  ( int_araddr[i]     ),
-      .m_axi4_arvalid ( int_arvalid[i]    ),
-      .m_axi4_arready ( int_arready[i]    ),
-      .m_axi4_arlen   ( int_arlen[i]      ),
-      .m_axi4_arsize  ( int_arsize[i]     ),
-      .m_axi4_arburst ( int_arburst[i]    ),
-      .m_axi4_arlock  ( int_arlock[i]     ),
-      .m_axi4_arprot  ( int_arprot[i]     ),
-      .m_axi4_arcache ( int_arcache[i]    ),
-      .m_axi4_aruser  ( int_aruser[i]     )
-    );
-
-  axi4_ar_sender
-    #(
-      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
-      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
-      )
-    u_ar_sender_m0
-    (
-      .axi4_aclk       ( Clk_CI                ),
-      .axi4_arstn      ( Rst_RBI               ),
-      .l1_done_o       ( l1_m0_ar_done[i]      ),
-      .l1_accept_i     ( l1_m0_ar_accept[i]    ),
-      .l1_drop_i       ( l1_m0_ar_drop[i]      ),
-      .l1_save_i       ( l1_m0_ar_save[i]      ),
-      .l2_done_o       ( l2_m0_ar_done[i]      ),
-      .l2_accept_i     ( l2_m0_ar_accept[i]    ),
-      .l2_drop_i       ( l2_m0_ar_drop[i]      ),
-      .l2_sending_o    ( l2_m0_ar_sending[i]   ),
-      .l1_araddr_i     ( int_rtrans_addr[i]    ),
-      .l2_araddr_i     ( l2_ar_addr[i]         ),
-      .s_axi4_arid     ( int_arid[i]           ),
-      .s_axi4_arvalid  ( int_m0_arvalid[i]     ),
-      .s_axi4_arready  ( int_m0_arready[i]     ),
-      .s_axi4_arlen    ( int_arlen[i]          ),
-      .s_axi4_arsize   ( int_arsize[i]         ),
-      .s_axi4_arburst  ( int_arburst[i]        ),
-      .s_axi4_arlock   ( int_arlock[i]         ),
-      .s_axi4_arprot   ( int_arprot[i]         ),
-      .s_axi4_arcache  ( int_arcache[i]        ),
-      .s_axi4_aruser   ( int_aruser[i]         ),
-      .m_axi4_arid     ( m0_axi4_arid[i]       ),
-      .m_axi4_araddr   ( m0_axi4_araddr[i]     ),
-      .m_axi4_arvalid  ( m0_axi4_arvalid[i]    ),
-      .m_axi4_arready  ( m0_axi4_arready[i]    ),
-      .m_axi4_arlen    ( m0_axi4_arlen[i]      ),
-      .m_axi4_arsize   ( m0_axi4_arsize[i]     ),
-      .m_axi4_arburst  ( m0_axi4_arburst[i]    ),
-      .m_axi4_arlock   ( m0_axi4_arlock[i]     ),
-      .m_axi4_arprot   ( m0_axi4_arprot[i]     ),
-      .m_axi4_arcache  (                       ),
-      .m_axi4_aruser   ( m0_axi4_aruser[i]     )
-    );
-
-    // The AXCACHE signals are set according to burstiness and cache coherence or statically
-    // when not connected to ACP on Zynq (implemented below).
-      assign m0_read_is_burst[i] = (m0_axi4_arlen[i] != {8{1'b0}}) && (m0_axi4_arburst[i] != 2'b00);
-    `ifndef EN_ACP
-      always_comb begin
-        if ( (l2_m0_ar_sending[i] & l2_cache_coherent[i]) | int_rtrans_cache_coherent[i]) begin
-          if (m0_read_is_burst[i]) begin
-            m0_axi4_arcache[i]  = 4'b1011;
-          end else begin
-            m0_axi4_arcache[i]  = 4'b1111;
-          end
-        end else begin
-          m0_axi4_arcache[i]    = 4'b0011;
-        end
-      end
-    `else
-      assign m0_axi4_arcache[i] = 4'b0011;
-    `endif
-
-  axi4_ar_sender
-    #(
-      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
-      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
-      )
-    u_ar_sender_m1
-    (
-      .axi4_aclk       ( Clk_CI                ),
-      .axi4_arstn      ( Rst_RBI               ),
-      .l1_done_o       ( l1_m1_ar_done[i]      ),
-      .l1_accept_i     ( l1_m1_ar_accept[i]    ),
-      .l1_drop_i       ( l1_m1_ar_drop[i]      ),
-      .l1_save_i       ( l1_m1_ar_save[i]      ),
-      .l2_done_o       ( l2_m1_ar_done[i]      ),
-      .l2_accept_i     ( l2_m1_ar_accept[i]    ),
-      .l2_drop_i       ( l2_m1_ar_drop[i]      ),
-      .l2_sending_o    (                       ), // just helps to set axcache
-      .l1_araddr_i     ( int_rtrans_addr[i]    ),
-      .l2_araddr_i     ( l2_ar_addr[i]         ),
-      .s_axi4_arid     ( int_arid[i]           ),
-      .s_axi4_arvalid  ( int_m1_arvalid[i]     ),
-      .s_axi4_arready  ( int_m1_arready[i]     ),
-      .s_axi4_arlen    ( int_arlen[i]          ),
-      .s_axi4_arsize   ( int_arsize[i]         ),
-      .s_axi4_arburst  ( int_arburst[i]        ),
-      .s_axi4_arlock   ( int_arlock[i]         ),
-      .s_axi4_arprot   ( int_arprot[i]         ),
-      .s_axi4_arcache  ( int_arcache[i]        ),
-      .s_axi4_aruser   ( int_aruser[i]         ),
-      .m_axi4_arid     ( m1_axi4_arid[i]       ),
-      .m_axi4_araddr   ( m1_axi4_araddr[i]     ),
-      .m_axi4_arvalid  ( m1_axi4_arvalid[i]    ),
-      .m_axi4_arready  ( m1_axi4_arready[i]    ),
-      .m_axi4_arlen    ( m1_axi4_arlen[i]      ),
-      .m_axi4_arsize   ( m1_axi4_arsize[i]     ),
-      .m_axi4_arburst  ( m1_axi4_arburst[i]    ),
-      .m_axi4_arlock   ( m1_axi4_arlock[i]     ),
-      .m_axi4_arprot   ( m1_axi4_arprot[i]     ),
-      .m_axi4_arcache  (                       ),
-      .m_axi4_aruser   ( m1_axi4_aruser[i]     )
-    );
-
-    // The AXCACHE signals are set according to burstiness and cache coherence or statically
-    // when not connected to ACP on Zynq (implemented below).
-      assign m1_read_is_burst[i] = (m1_axi4_arlen[i] != {8{1'b0}}) && (m1_axi4_arburst[i] != 2'b00);
-    `ifdef EN_ACP
-      always_comb begin
-        if (m1_read_is_burst[i]) begin
-          m1_axi4_arcache[i]    = 4'b1011;
-        end else begin
-          m1_axi4_arcache[i]    = 4'b1111;
-        end
-      end
-    `else
-      assign m1_axi4_arcache[i] = 4'b0011;
-    `endif
-
-  // }}}
-
-  // Read Response channel (r) {{{
-  /*
-   * read response channel (r)
-   *
-   * ██████╗ ███████╗ █████╗ ██████╗     ██████╗ ███████╗███████╗██████╗
-   * ██╔══██╗██╔════╝██╔══██╗██╔══██╗    ██╔══██╗██╔════╝██╔════╝██╔══██╗
-   * ██████╔╝█████╗  ███████║██║  ██║    ██████╔╝█████╗  ███████╗██████╔╝
-   * ██╔══██╗██╔══╝  ██╔══██║██║  ██║    ██╔══██╗██╔══╝  ╚════██║██╔═══╝
-   * ██║  ██║███████╗██║  ██║██████╔╝    ██║  ██║███████╗███████║██║
-   * ╚═╝  ╚═╝╚══════╝╚═╝  ╚═╝╚═════╝     ╚═╝  ╚═╝╚══════╝╚══════╝╚═╝
-   *
-   */
-  axi4_r_buffer
-    #(
-      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_r_buffer_m0
-    (
-      .axi4_aclk     ( Clk_CI            ),
-      .axi4_arstn    ( Rst_RBI           ),
-      .s_axi4_rid    ( int_m0_rid[i]     ),
-      .s_axi4_rresp  ( int_m0_rresp[i]   ),
-      .s_axi4_rdata  ( int_m0_rdata[i]   ),
-      .s_axi4_rlast  ( int_m0_rlast[i]   ),
-      .s_axi4_rvalid ( int_m0_rvalid[i]  ),
-      .s_axi4_ruser  ( int_m0_ruser[i]   ),
-      .s_axi4_rready ( int_m0_rready[i]  ),
-      .m_axi4_rid    ( m0_axi4_rid[i]    ),
-      .m_axi4_rresp  ( m0_axi4_rresp[i]  ),
-      .m_axi4_rdata  ( m0_axi4_rdata[i]  ),
-      .m_axi4_rlast  ( m0_axi4_rlast[i]  ),
-      .m_axi4_rvalid ( m0_axi4_rvalid[i] ),
-      .m_axi4_ruser  ( m0_axi4_ruser[i]  ),
-      .m_axi4_rready ( m0_axi4_rready[i] )
-    );
-
-  axi4_r_buffer
-    #(
-      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
-      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
-      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
-      )
-    u_r_buffer_m1
-    (
-      .axi4_aclk     ( Clk_CI            ),
-      .axi4_arstn    ( Rst_RBI           ),
-      .s_axi4_rid    ( int_m1_rid[i]     ),
-      .s_axi4_rresp  ( int_m1_rresp[i]   ),
-      .s_axi4_rdata  ( int_m1_rdata[i]   ),
-      .s_axi4_rlast  ( int_m1_rlast[i]   ),
-      .s_axi4_rvalid ( int_m1_rvalid[i]  ),
-      .s_axi4_ruser  ( int_m1_ruser[i]   ),
-      .s_axi4_rready ( int_m1_rready[i]  ),
-      .m_axi4_rid    ( m1_axi4_rid[i]    ),
-      .m_axi4_rresp  ( m1_axi4_rresp[i]  ),
-      .m_axi4_rdata  ( m1_axi4_rdata[i]  ),
-      .m_axi4_rlast  ( m1_axi4_rlast[i]  ),
-      .m_axi4_rvalid ( m1_axi4_rvalid[i] ),
-      .m_axi4_ruser  ( m1_axi4_ruser[i]  ),
-      .m_axi4_rready ( m1_axi4_rready[i] )
-    );
-
-  axi4_r_sender
-    #(
-      .AXI_DATA_WIDTH  ( AXI_DATA_WIDTH ),
-      .AXI_ID_WIDTH    ( AXI_ID_WIDTH   ),
-      .AXI_USER_WIDTH  ( AXI_USER_WIDTH )
-      )
-    u_r_sender
-    (
-      .axi4_aclk     ( Clk_CI              ),
-      .axi4_arstn    ( Rst_RBI             ),
-      .drop_i        ( lx_r_drop[i]        ),
-      .drop_len_i    ( lx_len_drop[i]      ),
-      .done_o        ( lx_r_done[i]        ),
-      .id_i          ( lx_id_drop[i]       ),
-      .prefetch_i    ( lx_prefetch_drop[i] ),
-      .hit_i         ( lx_hit_drop[i]      ),
-      .s_axi4_rid    ( s_axi4_rid[i]       ),
-      .s_axi4_rresp  ( s_axi4_rresp[i]     ),
-      .s_axi4_rdata  ( s_axi4_rdata[i]     ),
-      .s_axi4_rlast  ( s_axi4_rlast[i]     ),
-      .s_axi4_rvalid ( s_axi4_rvalid[i]    ),
-      .s_axi4_ruser  ( s_axi4_ruser[i]     ),
-      .s_axi4_rready ( s_axi4_rready[i]    ),
-      .m_axi4_rid    ( int_rid[i]          ),
-      .m_axi4_rresp  ( int_rresp[i]        ),
-      .m_axi4_rdata  ( int_rdata[i]        ),
-      .m_axi4_rlast  ( int_rlast[i]        ),
-      .m_axi4_rvalid ( int_rvalid[i]       ),
-      .m_axi4_ruser  ( int_ruser[i]        ),
-      .m_axi4_rready ( int_rready[i]       )
-    );
-
-  /*
-   * Multiplexer to switch between the two output master ports on the read response(r) channel
-   *
-   * Do not perform read burst interleaving as the DMA does not support it. This means we can only
-   * switch between the two masters upon sending rlast or when idle.
-   *
-   * However, if the downstream already performs burst interleaving, this cannot be undone here.
-   * Also, the downstream may interleave a burst reponse with a single-beat transaction. In this
-   * case, the FSM below falls out of the burst mode. To avoid it performing burst interleaving
-   * after such an event, it gives priority to the master which received the last burst in case
-   * both have a have a burst ready (rvalid).
-   *
-   * Order of priority:
-   * 1. Ongoing burst transaction
-   * 2. Single-beat transaction on Master 1.
-   * 3. Single-beat transaction on Master 0.
-   * 4. Burst transaction on master that received the last burst.
-   */
-  // Select signal
-  always_ff @(posedge Clk_CI) begin
-    if (Rst_RBI == 0) begin
-      RRespSel_SP[i] <= 1'b0;
-    end else begin
-      RRespSel_SP[i] <= RRespSel_SN[i];
-    end
-  end
-
-  // FSM
-  always_comb begin : RRespMuxFsm
-    RRespMuxCtrl_SN[i] = RRespMuxCtrl_SP[i];
-    RRespSel_SN[i]     = RRespSel_SP[i];
-
-    RRespBurst_S[i]    = 1'b0;
-    RRespSelIm_S[i]    = 1'b0;
-
-    unique case (RRespMuxCtrl_SP[i])
-
-      IDLE: begin
-        // immediately forward single-beat transactions
-        if      (int_m1_rvalid[i] && int_m1_rlast[i])
-          RRespSelIm_S[i] = 1'b1;
-        else if (int_m0_rvalid[i] && int_m0_rlast[i])
-          RRespSelIm_S[i] = 1'b0;
-
-        // bursts - they also start immediately
-        else if (int_m1_rvalid[i] || int_m0_rvalid[i]) begin
-          RRespMuxCtrl_SN[i] = BUSY;
-
-          // in case both are ready, continue with the master that had the last burst
-          if    (int_m1_rvalid[i] && int_m0_rvalid[i]) begin
-            RRespSel_SN[i]  = RRespSel_SP[i];
-            RRespSelIm_S[i] = RRespSel_SP[i];
-          end else if (int_m1_rvalid[i]) begin
-            RRespSel_SN[i]  = 1'b1;
-            RRespSelIm_S[i] = 1'b1;
-          end else begin
-            RRespSel_SN[i]  = 1'b0;
-            RRespSelIm_S[i] = 1'b0;
-          end
-        end
-      end
-
-      BUSY: begin
-        RRespBurst_S[i] = 1'b1;
-        // detect last handshake of currently ongoing transfer
-        if (int_rvalid[i] && int_rready[i] && int_rlast[i])
-          RRespMuxCtrl_SN[i] = IDLE;
-      end
-
-      default: begin
-        RRespMuxCtrl_SN[i] = IDLE;
-      end
-
-    endcase
-  end
-
-  // FSM state
-  always_ff @(posedge Clk_CI) begin
-    if (Rst_RBI == 0) begin
-      RRespMuxCtrl_SP[i] <= IDLE;
-    end else begin
-      RRespMuxCtrl_SP[i] <= RRespMuxCtrl_SN[i];
-    end
-  end
-
-  // Actual multiplexer
-  always_comb begin
-    if ( (RRespBurst_S[i] && RRespSel_SP[i]) || (!RRespBurst_S[i] && RRespSelIm_S[i]) ) begin
-      int_m0_rready[i] = 1'b0;
-      int_m1_rready[i] = int_rready[i];
-
-      int_rid[i]    = int_m1_rid[i];
-      int_rresp[i]  = int_m1_rresp[i];
-      int_rdata[i]  = int_m1_rdata[i];
-      int_rlast[i]  = int_m1_rlast[i];
-      int_ruser[i]  = int_m1_ruser[i];
-      int_rvalid[i] = int_m1_rvalid[i];
-    end else begin
-      int_m0_rready[i] = int_rready[i];
-      int_m1_rready[i] = 1'b0;
-
-      int_rid[i]    = int_m0_rid[i];
-      int_rresp[i]  = int_m0_rresp[i];
-      int_rdata[i]  = int_m0_rdata[i];
-      int_rlast[i]  = int_m0_rlast[i];
-      int_ruser[i]  = int_m0_ruser[i];
-      int_rvalid[i] = int_m0_rvalid[i];
-    end
-  end
-
-  end // BUF & SEND
-
-  // }}}
-
-  endgenerate // BUF & SEND }}}
-
-  // Log {{{
-
-`ifdef RAB_AX_LOG_EN
-  AxiBramLogger
-    #(
-      .AXI_ID_BITW     ( AXI_ID_WIDTH        ),
-      .AXI_ADDR_BITW   ( AXI_S_ADDR_WIDTH    ),
-      .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
-    )
-    u_aw_logger
-    (
-      .Clk_CI          ( NonGatedClk_CI    ),
-      .TimestampClk_CI ( Clk_CI            ),
-      .Rst_RBI         ( Rst_RBI           ),
-      .AxiValid_SI     ( s_axi4_awvalid[1] ),
-      .AxiReady_SI     ( s_axi4_awready[1] ),
-      .AxiId_DI        ( s_axi4_awid[1]    ),
-      .AxiAddr_DI      ( s_axi4_awaddr[1]  ),
-      .AxiLen_DI       ( s_axi4_awlen[1]   ),
-      .Clear_SI        ( AwLogClr_SI       ),
-      .LogEn_SI        ( LogEn_SI          ),
-      .Full_SO         ( int_aw_log_full   ),
-      .Ready_SO        ( AwLogRdy_SO       ),
-      .Bram_PS         ( AwBram_PS         )
-    );
-
-  AxiBramLogger
-    #(
-      .AXI_ID_BITW     ( AXI_ID_WIDTH        ),
-      .AXI_ADDR_BITW   ( AXI_S_ADDR_WIDTH    ),
-      .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
-    )
-    u_ar_logger
-    (
-      .Clk_CI          ( NonGatedClk_CI    ),
-      .TimestampClk_CI ( Clk_CI            ),
-      .Rst_RBI         ( Rst_RBI           ),
-      .AxiValid_SI     ( s_axi4_arvalid[1] ),
-      .AxiReady_SI     ( s_axi4_arready[1] ),
-      .AxiId_DI        ( s_axi4_arid[1]    ),
-      .AxiAddr_DI      ( s_axi4_araddr[1]  ),
-      .AxiLen_DI       ( s_axi4_arlen[1]   ),
-      .Clear_SI        ( ArLogClr_SI       ),
-      .LogEn_SI        ( LogEn_SI          ),
-      .Full_SO         ( int_ar_log_full   ),
-      .Ready_SO        ( ArLogRdy_SO       ),
-      .Bram_PS         ( ArBram_PS         )
-    );
-`endif
-
-  // }}}
-
-  // RAB Core {{{
-  // ██████╗  █████╗ ██████╗      ██████╗ ██████╗ ██████╗ ███████╗
-  // ██╔══██╗██╔══██╗██╔══██╗    ██╔════╝██╔═══██╗██╔══██╗██╔════╝
-  // ██████╔╝███████║██████╔╝    ██║     ██║   ██║██████╔╝█████╗
-  // ██╔══██╗██╔══██║██╔══██╗    ██║     ██║   ██║██╔══██╗██╔══╝
-  // ██║  ██║██║  ██║██████╔╝    ╚██████╗╚██████╔╝██║  ██║███████╗
-  // ╚═╝  ╚═╝╚═╝  ╚═╝╚═════╝      ╚═════╝ ╚═════╝ ╚═╝  ╚═╝╚══════╝
-  //
-  /*
-   * rab_core
-   *
-   * The rab core translates addresses. It has two ports, which can be used
-   * independently, however they will compete for time internally, as lookups
-   * are serialized.
-   *
-   * type is the read(0) or write(1) used to check the protection flags. If they
-   * don't match an interrupt is created on the int_prot line.
-   */
-
-  rab_core
-    #(
-      .N_PORTS             ( N_PORTS             ),
-      .N_L2_SETS           ( N_L2_SETS           ),
-      .N_L2_SET_ENTRIES    ( N_L2_SET_ENTRIES    ),
-      .AXI_DATA_WIDTH      ( AXI_DATA_WIDTH      ),
-      .AXI_S_ADDR_WIDTH    ( AXI_S_ADDR_WIDTH    ),
-      .AXI_M_ADDR_WIDTH    ( AXI_M_ADDR_WIDTH    ),
-      .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ),
-      .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ),
-      .AXI_ID_WIDTH        ( AXI_ID_WIDTH        ),
-      .AXI_USER_WIDTH      ( AXI_USER_WIDTH      ),
-      .MH_FIFO_DEPTH       ( MH_FIFO_DEPTH       )
-    )
-    u_rab_core
-    (
-      .Clk_CI               ( Clk_CI                     ),
-      .Rst_RBI              ( Rst_RBI                    ),
-
-      // Config IF
-      .s_axi_awaddr         ( s_axi4lite_awaddr          ),
-      .s_axi_awvalid        ( s_axi4lite_awvalid         ),
-      .s_axi_awready        ( s_axi4lite_awready         ),
-      .s_axi_wdata          ( s_axi4lite_wdata           ),
-      .s_axi_wstrb          ( s_axi4lite_wstrb           ),
-      .s_axi_wvalid         ( s_axi4lite_wvalid          ),
-      .s_axi_wready         ( s_axi4lite_wready          ),
-      .s_axi_bresp          ( s_axi4lite_bresp           ),
-      .s_axi_bvalid         ( s_axi4lite_bvalid          ),
-      .s_axi_bready         ( s_axi4lite_bready          ),
-      .s_axi_araddr         ( s_axi4lite_araddr          ),
-      .s_axi_arvalid        ( s_axi4lite_arvalid         ),
-      .s_axi_arready        ( s_axi4lite_arready         ),
-      .s_axi_rready         ( s_axi4lite_rready          ),
-      .s_axi_rdata          ( s_axi4lite_rdata           ),
-      .s_axi_rresp          ( s_axi4lite_rresp           ),
-      .s_axi_rvalid         ( s_axi4lite_rvalid          ),
-
-      // L1 miss info outputs -> L2 TLB arbitration
-      .int_miss             ( rab_miss                   ),
-      .int_multi            ( rab_multi                  ),
-      .int_prot             ( rab_prot                   ),
-      .int_prefetch         ( rab_prefetch               ),
-      .int_mhf_full         ( int_mhf_full               ),
-
-      // L1 transaction info outputs -> L2 TLB arbitration
-      .int_axaddr_o         ( L1OutAddr_D                ),
-      .int_axid_o           ( L1OutId_D                  ),
-      .int_axlen_o          ( L1OutLen_D                 ),
-      .int_axuser_o         ( L1OutUser_D                ),
-
-      // Write Req IF
-      .port1_addr           ( int_awaddr                 ),
-      .port1_id             ( int_awid                   ),
-      .port1_len            ( int_awlen                  ),
-      .port1_size           ( int_awsize                 ),
-      .port1_addr_valid     ( int_awvalid & ~aw_in_stall ), // avoid the FSM accepting new AW requests
-      .port1_type           ( {N_PORTS{1'b1}}            ),
-      .port1_user           ( int_awuser                 ),
-      .port1_sent           ( int_wtrans_sent            ), // signal done to L1 FSM
-      .port1_out_addr       ( int_wtrans_addr            ),
-      .port1_cache_coherent ( int_wtrans_cache_coherent  ),
-      .port1_accept         ( int_wtrans_accept          ),
-      .port1_drop           ( int_wtrans_drop            ),
-      .port1_miss           ( int_wtrans_miss            ),
-
-      // Read Req IF
-      .port2_addr           ( int_araddr                 ),
-      .port2_id             ( int_arid                   ),
-      .port2_len            ( int_arlen                  ),
-      .port2_size           ( int_arsize                 ),
-      .port2_addr_valid     ( int_arvalid                ),
-      .port2_type           ( {N_PORTS{1'b0}}            ),
-      .port2_user           ( int_aruser                 ),
-      .port2_sent           ( int_rtrans_sent            ), // signal done to L1 FSM
-      .port2_out_addr       ( int_rtrans_addr            ),
-      .port2_cache_coherent ( int_rtrans_cache_coherent  ),
-      .port2_accept         ( int_rtrans_accept          ),
-      .port2_drop           ( int_rtrans_drop            ),
-      .port2_miss           ( int_rtrans_miss            ),
-
-      // L2 miss info inputs -> axi_rab_cfg
-      .miss_l2_i            ( L2Miss_S                   ),
-      .miss_l2_addr_i       ( L2OutInAddr_DP             ),
-      .miss_l2_id_i         ( L2OutId_DP                 ),
-      .miss_l2_user_i       ( L2OutUser_DP               ),
-
-      // L2 config outputs
-      .wdata_l2_o           ( L2CfgWData_D               ),
-      .waddr_l2_o           ( L2CfgWAddr_D               ),
-      .wren_l2_o            ( L2CfgWE_S                  )
-    );
-
-  // }}}
-
-  // AX SPLITS {{{
-  //  █████╗ ██╗  ██╗    ███████╗██████╗ ██╗     ██╗████████╗
-  // ██╔══██╗╚██╗██╔╝    ██╔════╝██╔══██╗██║     ██║╚══██╔══╝
-  // ███████║ ╚███╔╝     ███████╗██████╔╝██║     ██║   ██║
-  // ██╔══██║ ██╔██╗     ╚════██║██╔═══╝ ██║     ██║   ██║
-  // ██║  ██║██╔╝ ██╗    ███████║██║     ███████╗██║   ██║
-  // ╚═╝  ╚═╝╚═╝  ╚═╝    ╚══════╝╚═╝     ╚══════╝╚═╝   ╚═╝
-  //
-  /**
-   * Multiplex the two output master ports of the Read Address and Write Address (AR/AW) channels.
-   *
-   * Use the `int_xmaster_select` signal to route the signals to either Master 0 (to memory) or
-   * Master 1 (to ACP). In case of an L1 miss: Route the signals to both masters. They shall be
-   * saved until the L2 outputs are available.
-   */
-  generate for (i = 0; i < N_PORTS; i++) begin : AX_SPLIT
-
-    /*
-     * When accepting L1 transactions, we must just do so on the selected master. Drop requests must
-     * be performed on any one of the two masters. Save requests must be performed by both masters.
-     */
-    always_comb begin : AW_L1_SPLIT
-
-      // TLB handshake
-      l1_m0_aw_accept[i] = 1'b0;
-      l1_m1_aw_accept[i] = 1'b0;
-      l1_m0_aw_drop[i]   = 1'b0;
-      l1_m1_aw_drop[i]   = 1'b0;
-      l1_m0_aw_save[i]   = 1'b0;
-      l1_m1_aw_save[i]   = 1'b0;
-
-      l1_mx_aw_done[i]   = 1'b0;
-
-      // AXI sender input handshake
-      int_m0_awvalid[i]  = 1'b0;
-      int_m1_awvalid[i]  = 1'b0;
-      int_awready[i]     = 1'b0;
-
-      // accept on selected master only
-      if (l1_aw_accept[i]) begin
-        if (int_wmaster_select[i]) begin
-          l1_m1_aw_accept[i] = 1'b1;
-          l1_mx_aw_done[i]   = l1_m1_aw_done[i];
-
-          int_m1_awvalid[i]  = int_awvalid[i];
-          int_awready[i]     = int_m1_awready[i];
-
-        end else begin
-          l1_m0_aw_accept[i] = 1'b1;
-          l1_mx_aw_done[i]   = l1_m0_aw_done[i];
-
-          int_m0_awvalid[i]  = int_awvalid[i];
-          int_awready[i]     = int_m0_awready[i];
-        end
-
-      // drop on Master 0 only
-      end else if (l1_aw_drop[i]) begin
-        l1_m0_aw_drop[i]     = 1'b1;
-        l1_mx_aw_done[i]     = l1_m0_aw_done[i];
-
-        int_m0_awvalid[i]    = int_awvalid[i];
-        int_awready[i]       = l1_m0_aw_done[i];
-
-      // save on both masters
-      end else if (l1_aw_save[i]) begin
-        // split save
-        l1_m0_aw_save[i]     = ~l1_m0_aw_done_SP[i];
-        l1_m1_aw_save[i]     = ~l1_m1_aw_done_SP[i];
-
-        // combine done
-        l1_mx_aw_done[i]     = l1_m0_aw_done_SP[i] & l1_m1_aw_done_SP[i];
-
-        int_m0_awvalid[i]    = int_awvalid[i];
-        int_m1_awvalid[i]    = int_awvalid[i];
-        int_awready[i]       = l1_mx_aw_done[i];
-      end
-    end
-
-    // signal back to handshake splitter
-    assign l1_aw_done[i]     = l1_mx_aw_done[i];
-
-    always_ff @(posedge Clk_CI) begin : L1_MX_AW_DONE_REG
-      if (Rst_RBI == 0) begin
-        l1_m0_aw_done_SP[i] <= 1'b0;
-        l1_m1_aw_done_SP[i] <= 1'b0;
-      end else if (l1_mx_aw_done[i]) begin
-        l1_m0_aw_done_SP[i] <= 1'b0;
-        l1_m1_aw_done_SP[i] <= 1'b0;
-      end else begin
-        l1_m0_aw_done_SP[i] <= l1_m0_aw_done_SP[i] | l1_m0_aw_done[i];
-        l1_m1_aw_done_SP[i] <= l1_m1_aw_done_SP[i] | l1_m1_aw_done[i];
-      end
-    end
-
-    /*
-     * When accepting L2 transactions, we must drop the corresponding transaction from the other
-     * master to make it available again for save requests from L1_DROP_SAVE.
-     */
-    always_comb begin : AW_L2_SPLIT
-
-      l2_m0_aw_accept[i] = 1'b0;
-      l2_m1_aw_accept[i] = 1'b0;
-      l2_m0_aw_drop[i]   = 1'b0;
-      l2_m1_aw_drop[i]   = 1'b0;
-
-      // de-assert request signals individually upon handshakes
-      if (l2_aw_accept[i]) begin
-        if (l2_master_select[i]) begin
-          l2_m1_aw_accept[i] = ~l2_m1_aw_done_SP[i];
-          l2_m0_aw_drop[i]   = ~l2_m0_aw_done_SP[i];
-
-        end else begin
-          l2_m0_aw_accept[i] = ~l2_m0_aw_done_SP[i];
-          l2_m1_aw_drop[i]   = ~l2_m1_aw_done_SP[i];
-
-        end
-      end else begin
-        l2_m0_aw_drop[i]     = ~l2_m0_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
-        l2_m1_aw_drop[i]     = ~l2_m1_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
-
-      end
-
-      // combine done
-      l2_mx_aw_done[i] = l2_m0_aw_done_SP[i] & l2_m1_aw_done_SP[i];
-
-      l2_aw_done[i]    = l2_mx_aw_done[i];
-    end
-
-    always_ff @(posedge Clk_CI) begin : L2_MX_AW_DONE_REG
-      if (Rst_RBI == 0) begin
-        l2_m0_aw_done_SP[i] <= 1'b0;
-        l2_m1_aw_done_SP[i] <= 1'b0;
-      end else if (l2_mx_aw_done[i]) begin
-        l2_m0_aw_done_SP[i] <= 1'b0;
-        l2_m1_aw_done_SP[i] <= 1'b0;
-      end else begin
-        l2_m0_aw_done_SP[i] <= l2_m0_aw_done_SP[i] | l2_m0_aw_done[i];
-        l2_m1_aw_done_SP[i] <= l2_m1_aw_done_SP[i] | l2_m1_aw_done[i];
-      end
-    end
-
-    /*
-     * When accepting L1 transactions, we must just do so on the selected master. Drop requests must
-     * be performed on any one of the two masters. Save requests must be performed by both masters.
-     */
-    always_comb begin : AR_L1_SPLIT
-
-      // TLB handshake
-      l1_m0_ar_accept[i] = 1'b0;
-      l1_m1_ar_accept[i] = 1'b0;
-      l1_m0_ar_drop[i]   = 1'b0;
-      l1_m1_ar_drop[i]   = 1'b0;
-      l1_m0_ar_save[i]   = 1'b0;
-      l1_m1_ar_save[i]   = 1'b0;
-
-      l1_mx_ar_done[i]   = 1'b0;
-
-      // AXI sender input handshake
-      int_m0_arvalid[i]  = 1'b0;
-      int_m1_arvalid[i]  = 1'b0;
-      int_arready[i]     = 1'b0;
-
-      // accept on selected master only
-      if (l1_ar_accept[i]) begin
-        if (int_rmaster_select[i]) begin
-          l1_m1_ar_accept[i] = 1'b1;
-          l1_mx_ar_done[i]   = l1_m1_ar_done[i];
-
-          int_m1_arvalid[i]  = int_arvalid[i];
-          int_arready[i]     = int_m1_arready[i];
-
-        end else begin
-          l1_m0_ar_accept[i] = 1'b1;
-          l1_mx_ar_done[i]   = l1_m0_ar_done[i];
-
-          int_m0_arvalid[i]  = int_arvalid[i];
-          int_arready[i]     = int_m0_arready[i];
-        end
-
-      // drop on Master 0 only
-      end else if (l1_ar_drop[i]) begin
-        l1_m0_ar_drop[i]     = 1'b1;
-        l1_mx_ar_done[i]     = l1_m0_ar_done[i];
-
-        int_m0_arvalid[i]    = int_arvalid[i];
-        int_arready[i]       = l1_m0_ar_done[i];
-
-      // save on both masters
-      end else if (l1_ar_save[i]) begin
-        // split save
-        l1_m0_ar_save[i]     = ~l1_m0_ar_done_SP[i];
-        l1_m1_ar_save[i]     = ~l1_m1_ar_done_SP[i];
-
-        // combine done
-        l1_mx_ar_done[i]     = l1_m0_ar_done_SP[i] & l1_m1_ar_done_SP[i];
-
-        int_m0_arvalid[i]    = int_arvalid[i];
-        int_m1_arvalid[i]    = int_arvalid[i];
-        int_arready[i]       = l1_mx_ar_done[i];
-      end
-    end
-
-    // signal back to handshake splitter
-    assign l1_ar_done[i]     = l1_mx_ar_done[i];
-
-    always_ff @(posedge Clk_CI) begin : L1_MX_AR_DONE_REG
-      if (Rst_RBI == 0) begin
-        l1_m0_ar_done_SP[i] <= 1'b0;
-        l1_m1_ar_done_SP[i] <= 1'b0;
-      end else if (l1_mx_ar_done[i]) begin
-        l1_m0_ar_done_SP[i] <= 1'b0;
-        l1_m1_ar_done_SP[i] <= 1'b0;
-      end else begin
-        l1_m0_ar_done_SP[i] <= l1_m0_ar_done_SP[i] | l1_m0_ar_done[i];
-        l1_m1_ar_done_SP[i] <= l1_m1_ar_done_SP[i] | l1_m1_ar_done[i];
-      end
-    end
-
-    /*
-     * When accepting L2 transactions, we must drop the corresponding transaction from the other
-     * master to make it available again for save requests from L1_DROP_SAVE.
-     */
-    always_comb begin : AR_L2_SPLIT
-
-      l2_m0_ar_accept[i] = 1'b0;
-      l2_m1_ar_accept[i] = 1'b0;
-      l2_m0_ar_drop[i]   = 1'b0;
-      l2_m1_ar_drop[i]   = 1'b0;
-
-      // de-assert request signals individually upon handshakes
-      if (l2_ar_accept[i]) begin
-        if (l2_master_select[i]) begin
-          l2_m1_ar_accept[i] = ~l2_m1_ar_done_SP[i];
-          l2_m0_ar_drop[i]   = ~l2_m0_ar_done_SP[i];
-
-        end else begin
-          l2_m0_ar_accept[i] = ~l2_m0_ar_done_SP[i];
-          l2_m1_ar_drop[i]   = ~l2_m1_ar_done_SP[i];
-
-        end
-      end else if (l2_ar_drop[i]) begin
-        l2_m0_ar_drop[i]     = ~l2_m0_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
-        l2_m1_ar_drop[i]     = ~l2_m1_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
-
-      end
-
-      // combine done
-      l2_mx_ar_done[i] = l2_m0_ar_done_SP[i] & l2_m1_ar_done_SP[i];
-
-      l2_ar_done[i]    = l2_mx_ar_done[i];
-    end
-
-    always_ff @(posedge Clk_CI) begin : L2_MX_AR_DONE_REG
-      if (Rst_RBI == 0) begin
-        l2_m0_ar_done_SP[i] <= 1'b0;
-        l2_m1_ar_done_SP[i] <= 1'b0;
-      end else if (l2_mx_ar_done[i]) begin
-        l2_m0_ar_done_SP[i] <= 1'b0;
-        l2_m1_ar_done_SP[i] <= 1'b0;
-      end else begin
-        l2_m0_ar_done_SP[i] <= l2_m0_ar_done_SP[i] | l2_m0_ar_done[i];
-        l2_m1_ar_done_SP[i] <= l2_m1_ar_done_SP[i] | l2_m1_ar_done[i];
-      end
-    end
-
-  end // AX_SPLIT
-  endgenerate // AX_SPLIT
-
-  // }}}
-
-  // HANDSHAKE SPLITS {{{
-  // ██╗  ██╗███████╗    ███████╗██████╗ ██╗     ██╗████████╗
-  // ██║  ██║██╔════╝    ██╔════╝██╔══██╗██║     ██║╚══██╔══╝
-  // ███████║███████╗    ███████╗██████╔╝██║     ██║   ██║
-  // ██╔══██║╚════██║    ╚════██║██╔═══╝ ██║     ██║   ██║
-  // ██║  ██║███████║    ███████║██║     ███████╗██║   ██║
-  // ╚═╝  ╚═╝╚══════╝    ╚══════╝╚═╝     ╚══════╝╚═╝   ╚═╝
-  //
-  /*
-   * We need to perform combined handshakes with multiple AXI modules
-   * upon transactions drops, accepts, saves etc. from two TLBs.
-   */
-  generate for (i = 0; i < N_PORTS; i++) begin : HANDSHAKE_SPLIT
-
-    assign l1_xw_accept[i]    = int_wtrans_accept[i] & ~aw_out_stall[i];
-    assign int_wtrans_sent[i] = l1_xw_done[i];
-
-    assign l1_ar_accept[i]    = int_rtrans_accept[i];
-    assign int_rtrans_sent[i] = l1_ar_done[i];
-
-    /*
-     * L1 AW sender + W buffer handshake split
-     */
-    // forward
-    assign l1_aw_accept[i] = l1_xw_accept[i] & ~l1_aw_done_SP[i];
-    assign l1_w_accept[i]  = l1_xw_accept[i] & ~l1_w_done_SP[i];
-
-    assign l1_aw_save[i]   = l1_xw_save[i]   & ~l1_aw_done_SP[i];
-    assign l1_w_save[i]    = l1_xw_save[i]   & ~l1_w_done_SP[i];
-
-    assign l1_aw_drop[i]   = l1_xw_drop[i]   & ~l1_aw_done_SP[i];
-    assign l1_w_drop[i]    = l1_xw_drop[i]   & ~l1_w_done_SP[i];
-
-    // backward
-    assign l1_xw_done[i]   = l1_aw_done_SP[i] & l1_w_done_SP[i];
-
-    always_ff @(posedge Clk_CI) begin : L1_XW_HS_SPLIT
-      if (Rst_RBI == 0) begin
-        l1_aw_done_SP[i] <= 1'b0;
-        l1_w_done_SP[i]  <= 1'b0;
-      end else if (l1_xw_done[i]) begin
-        l1_aw_done_SP[i] <= 1'b0;
-        l1_w_done_SP[i]  <= 1'b0;
-      end else begin
-        l1_aw_done_SP[i] <= l1_aw_done_SP[i] | l1_aw_done[i];
-        l1_w_done_SP[i]  <= l1_w_done_SP[i]  | l1_w_done[i];
-      end
-    end
-
-    if (ENABLE_L2TLB[i] == 1) begin : L2_HS_SPLIT
-
-      /*
-       * L1 AR sender + R sender handshake split
-       *
-       * AR and R do not need to be strictly in sync. We thus use separate handshakes.
-       * But the handshake signals for the R sender are multiplexed with the those for
-       * the L2. However, L2_ACCEPT_DROP_SAVE has always higher priority.
-       */
-      assign lx_r_drop[i] = l2_r_drop[i] | l1_r_drop[i];
-      assign l1_r_done[i] = l2_r_drop[i] ? 1'b0         : lx_r_done[i];
-      assign l2_r_done[i] = l2_r_drop[i] ? lx_r_done[i] : 1'b0;
-
-      /*
-       * L2 AW sender + W buffer handshake split
-       */
-      // forward
-      assign l2_aw_accept[i] = l2_xw_accept[i] & ~l2_aw_done_SP[i];
-      assign l2_w_accept[i]  = l2_xw_accept[i] & ~l2_w_done_SP[i];
-
-      assign l2_aw_drop[i]   = l2_xw_drop[i]   & ~l2_aw_done_SP[i];
-      assign l2_w_drop[i]    = l2_xw_drop[i]   & ~l2_w_done_SP[i];
-
-      // backward
-      assign l2_xw_done[i]   = l2_aw_done_SP[i] & l2_w_done_SP[i];
-
-      always_ff @(posedge Clk_CI) begin : L2_XW_HS_SPLIT
-        if (Rst_RBI == 0) begin
-          l2_aw_done_SP[i] <= 1'b0;
-          l2_w_done_SP[i]  <= 1'b0;
-        end else if (l2_xw_done[i]) begin
-          l2_aw_done_SP[i] <= 1'b0;
-          l2_w_done_SP[i]  <= 1'b0;
-        end else begin
-          l2_aw_done_SP[i] <= l2_aw_done_SP[i] | l2_aw_done[i];
-          l2_w_done_SP[i]  <= l2_w_done_SP[i]  | l2_w_done[i];
-        end
-      end
-
-      /*
-       * L2 AR + R sender handshake split
-       */
-      // forward
-      assign l2_ar_drop[i]   = l2_xr_drop[i]   & ~l2_ar_done_SP[i];
-      assign l2_r_drop[i]    = l2_xr_drop[i]   & ~l2_r_done_SP[i];
-
-      // backward - make sure to always clear L2_XR_HS_SPLIT
-      always_comb begin
-        if (l2_xr_drop[i]) begin
-          l2_xr_done[i]      = l2_ar_done_SP[i] & l2_r_done_SP[i];
-        end else begin
-          l2_xr_done[i]      = l2_ar_done_SP[i];
-        end
-      end
-
-      always_ff @(posedge Clk_CI) begin : L2_XR_HS_SPLIT
-        if (Rst_RBI == 0) begin
-          l2_ar_done_SP[i] <= 1'b0;
-          l2_r_done_SP[i]  <= 1'b0;
-        end else if (l2_xr_done[i]) begin
-          l2_ar_done_SP[i] <= 1'b0;
-          l2_r_done_SP[i]  <= 1'b0;
-        end else begin
-          l2_ar_done_SP[i] <= l2_ar_done_SP[i] | l2_ar_done[i];
-          l2_r_done_SP[i]  <= l2_r_done_SP[i]  | l2_r_done[i];
-        end
-      end
-
-    end else begin // if (ENABLE_L2TLB[i] == 1)
-
-      assign lx_r_drop[i]     = l1_r_drop[i];
-      assign l1_r_done[i]     = lx_r_done[i];
-
-      assign l2_aw_accept[i]  = 1'b0;
-      assign l2_w_accept[i]   = 1'b0;
-      assign l2_aw_drop[i]    = 1'b0;
-      assign l2_w_drop[i]     = 1'b0;
-      assign l2_xw_done[i]    = 1'b0;
-      assign l2_aw_done_SP[i] = 1'b0;
-      assign l2_w_done_SP[i]  = 1'b0;
-
-      assign l2_ar_accept[i]  = 1'b0;
-      assign l2_ar_drop[i]    = 1'b0;
-      assign l2_r_drop[i]     = 1'b0;
-      assign l2_xr_done[i]    = 1'b0;
-      assign l2_r_done[i]     = 1'b0;
-      assign l2_ar_done_SP[i] = 1'b0;
-      assign l2_r_done_SP[i]  = 1'b0;
-
-    end // if (ENABLE_L2TLB[i] == 1)
-
-  end // HANDSHAKE_SPLIT
-  endgenerate // HANDSHAKE_SPLIT
-
-  // }}}
-
-  // L2 TLB {{{
-  // ██╗     ██████╗     ████████╗██╗     ██████╗
-  // ██║     ╚════██╗    ╚══██╔══╝██║     ██╔══██╗
-  // ██║      █████╔╝       ██║   ██║     ██████╔╝
-  // ██║     ██╔═══╝        ██║   ██║     ██╔══██╗
-  // ███████╗███████╗       ██║   ███████╗██████╔╝
-  // ╚══════╝╚══════╝       ╚═╝   ╚══════╝╚═════╝
-  //
-  /*
-   * l2_tlb
-   *
-   * The L2 TLB translates addresses upon misses in the L1 TLB (rab_core).
-   *
-   * It supports one ongoing translation at a time. If an L1 miss occurs while the L2 is busy,
-   * the L1 is stalled untill the L2 is available again.
-   *
-   */
-  generate for (i = 0; i < N_PORTS; i++) begin : L2_TLB
-    if (ENABLE_L2TLB[i] == 1) begin : L2_TLB
-
-      /*
-       * L1 output selector
-       */
-      assign L1OutRwType_D[i] = int_wtrans_drop[i] ? 1'b1 : 1'b0;
-      assign L1OutProt_D[i]   = rab_prot[i];
-      assign L1OutMulti_D[i]  = rab_multi[i];
-
-      /*
-       * L1 output control + L1_DROP_BUF, L2_IN_BUF management
-       *
-       * Forward the L1 drop request to AR/AW sender modules if
-       * 1. the transactions needs to be dropped (L1 multi, prot, prefetch), or
-       * 2. if a lookup in the L2 TLB is required (L1 miss) and the input buffer is not full.
-       *
-       * The AR/AW senders do not support more than 1 oustanding L1 miss. The push back towards
-       * the upstream is realized by not accepting the save request (saving the L1 transaction)
-       * in the senders as long as the L2 TLB is busy or has valid output. This ultimately
-       * blocks the L1 TLB.
-       *
-       * Together with the AW drop/save, we also perform the W drop/save as AW and W need to
-       * absolutely remain in order. In contrast, the R drop is performed
-       */
-      always_comb begin : L1_DROP_SAVE
-
-        l1_ar_drop[i]       = 1'b0;
-        l1_ar_save[i]       = 1'b0;
-        l1_xw_drop[i]       = 1'b0;
-        l1_xw_save[i]       = 1'b0;
-
-        l1_id_drop[i]       = L1OutId_D[i];
-        l1_len_drop[i]      = L1OutLen_D[i];
-        l1_prefetch_drop[i] = rab_prefetch[i];
-        l1_hit_drop[i]      = 1'b1; // there are no drops for L1 misses
-
-        L1DropEn_S[i]       = 1'b0;
-        L2InEn_S[i]         = 1'b0;
-
-        if ( rab_prot[i] | rab_multi[i] | rab_prefetch[i] ) begin
-          // 1. Drop
-          l1_ar_drop[i] = int_rtrans_drop[i] & ~L1DropValid_SP[i];
-          l1_xw_drop[i] = int_wtrans_drop[i] & ~L1DropValid_SP[i];
-
-          // Store to L1_DROP_BUF upon handshake
-          L1DropEn_S[i] = (l1_ar_drop[i] & l1_ar_done[i]) |
-                          (l1_xw_drop[i] & l1_xw_done[i]);
-
-        end else if ( rab_miss[i] ) begin
-          // 2. Save - Make sure L2 is really available.
-          l1_ar_save[i] = int_rtrans_drop[i] & ~L2Busy_S[i];
-          l1_xw_save[i] = int_wtrans_drop[i] & ~L2Busy_S[i];
-
-          // Store to L2_IN_BUF upon handshake - triggers the L2 TLB
-          L2InEn_S[i]   = (l1_ar_save[i] & l1_ar_done[i]) |
-                          (l1_xw_save[i] & l1_xw_done[i]);
-        end
-      end
-
-      /*
-       * L2 output control + L2_OUT_BUF management + R/B sender control + W buffer control
-       *
-       * Perform L1 R transaction drops unless the L2 output buffer holds valid data. The AXI specs
-       * require the B response to be sent only after consuming/discarding the corresponding data
-       * in the W channel. Thus, we only send L2 drop request to the W buffer here. The drop
-       * request to the B sender is then sent by the W buffer autonomously.
-       *
-       * L1 AW/W drop requests are managed by L1_DROP_SAVE.
-       */
-      always_comb begin : L2_ACCEPT_DROP_SAVE
-
-        l2_ar_addr[i]       =  'b0;
-        l2_aw_addr[i]       =  'b0;
-        l2_ar_accept[i]     = 1'b0;
-        l2_xr_drop[i]       = 1'b0;
-        l2_xw_accept[i]     = 1'b0;
-        l2_xw_drop[i]       = 1'b0;
-
-        l1_r_drop[i]        = 1'b0;
-
-        lx_id_drop[i]       =  'b0;
-        lx_len_drop[i]      =  'b0;
-        lx_prefetch_drop[i] = 1'b0;
-        lx_hit_drop[i]      = 1'b0;
-
-        L1DropValid_SN[i]   = L1DropValid_SP[i] | L1DropEn_S[i];
-        L2OutValid_SN[i]    = L2OutValid_SP[i];
-        L2OutReady_S[i]     = 1'b0;
-        L2OutEn_S[i]        = 1'b0;
-
-        L2Miss_S[i]         = 1'b0;
-        int_multi[i]        = 1'b0;
-        int_prot[i]         = 1'b0;
-
-        if (L2OutValid_SP[i] == 1'b0) begin
-
-          // Drop L1 from R senders
-          if (L1DropValid_SP[i] == 1'b1) begin
-
-            // Only perform the R sender drop here.
-            if (~L1DropRwType_DP[i]) begin
-
-              l1_r_drop[i]        = 1'b1;
-              lx_id_drop[i]       = L1DropId_DP[i];
-              lx_len_drop[i]      = L1DropLen_DP[i];
-              lx_prefetch_drop[i] = L1DropPrefetch_S[i];
-              lx_hit_drop[i]      = 1'b1; // there are no drops for L1 misses
-
-              // Invalidate L1_DROP_BUF upon handshake
-              if ( l1_r_drop[i] & l1_r_done[i] ) begin
-
-                L1DropValid_SN[i] = 1'b0;
-                int_prot[i]       = L1DropProt_DP[i];
-                int_multi[i]      = L1DropMulti_DP[i];
-              end
-
-            end else begin
-              // Invalidate L1_DROP_BUF
-              L1DropValid_SN[i]   = 1'b0;
-              int_prot[i]         = L1DropProt_DP[i];
-              int_multi[i]        = L1DropMulti_DP[i];
-            end
-          end
-
-        end else begin // L2_OUT_BUF has valid data
-
-          if ( L2OutHit_SP[i] & ~(L2OutPrefetch_S[i] | L2OutProt_SP[i] | L2OutMulti_SP[i]) ) begin
-
-            l2_ar_addr[i]       = L2OutAddr_DP[i];
-            l2_aw_addr[i]       = L2OutAddr_DP[i];
-
-            l2_ar_accept[i]     = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
-            l2_xw_accept[i]     = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
-
-            // Invalidate L2_OUT_BUF upon handshake
-            L2OutValid_SN[i] = ~( (l2_ar_accept[i] & l2_ar_done[i]) |
-                                  (l2_xw_accept[i] & l2_xw_done[i]) );
-          end else begin
-
-            lx_id_drop[i]       = L2OutId_DP[i];
-            lx_len_drop[i]      = L2OutLen_DP[i];
-            lx_prefetch_drop[i] = L2OutPrefetch_S[i];
-            lx_hit_drop[i]      = L2OutHit_SP[i];
-
-            // The l2_xr_drop will also perform the handshake with the R sender
-            l2_xr_drop[i]       = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
-            l2_xw_drop[i]       = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
-
-            // Invalidate L1_DROP_BUF upon handshake
-            if ( (l2_xr_drop[i] & l2_xr_done[i]) | (l2_xw_drop[i] & l2_xw_done[i]) ) begin
-
-              L2OutValid_SN[i]  = 1'b0;
-              L2Miss_S[i]       = ~L2OutHit_SP[i];
-              int_prot[i]       = L2OutProt_SP[i];
-              int_multi[i]      = L2OutMulti_SP[i];
-            end
-          end
-        end
-
-        // Only accept new L2 output after ongoing drops have finished.
-        if ( (l2_xr_drop[i] == l2_xr_done[i]) &
-             (l2_xw_drop[i] == l2_xw_done[i]) &
-             (l1_r_drop[i]  == l1_r_done[i] ) ) begin
-          // Store to L2_OUT_BUF upon handshake with L2 TLB module
-          if ( (L2OutValid_SP[i] == 1'b0) && (L2OutValid_S[i] == 1'b1) ) begin
-            L2OutValid_SN[i]   = 1'b1;
-            L2OutReady_S[i]    = 1'b1;
-            L2OutEn_S[i]       = 1'b1;
-          end
-        end
-      end
-
-      /*
-       * L1 drop buffer
-       *
-       * Used in case of multi, prot and prefetch hits in the L1 TLB.
-       */
-      always_ff @(posedge Clk_CI) begin : L1_DROP_BUF
-         if (Rst_RBI == 0) begin
-            L1DropProt_DP[i]   <= 1'b0;
-            L1DropMulti_DP[i]  <= 1'b0;
-            L1DropRwType_DP[i] <= 1'b0;
-            L1DropUser_DP[i]   <=  'b0;
-            L1DropId_DP[i]     <=  'b0;
-            L1DropLen_DP[i]    <=  'b0;
-            L1DropAddr_DP[i]   <=  'b0;
-         end else if (L1DropEn_S[i] == 1'b1) begin
-            L1DropProt_DP[i]   <= L1OutProt_D[i]  ;
-            L1DropMulti_DP[i]  <= L1OutMulti_D[i] ;
-            L1DropRwType_DP[i] <= L1OutRwType_D[i];
-            L1DropUser_DP[i]   <= L1OutUser_D[i]  ;
-            L1DropId_DP[i]     <= L1OutId_D[i]    ;
-            L1DropLen_DP[i]    <= L1OutLen_D[i]   ;
-            L1DropAddr_DP[i]   <= L1OutAddr_D[i]  ;
-         end
-      end // always_ff @ (posedge Clk_CI)
-
-      /*
-       * L2 input buffer
-       *
-       * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
-       */
-      always_ff @(posedge Clk_CI) begin : L2_IN_BUF
-         if (Rst_RBI == 0) begin
-            L2InRwType_DP[i] <= 1'b0;
-            L2InUser_DP[i]   <=  'b0;
-            L2InId_DP[i]     <=  'b0;
-            L2InLen_DP[i]    <=  'b0;
-            L2InAddr_DP[i]   <=  'b0;
-         end else if (L2InEn_S[i] == 1'b1) begin
-            L2InRwType_DP[i] <= L1OutRwType_D[i];
-            L2InUser_DP[i]   <= L1OutUser_D[i]  ;
-            L2InId_DP[i]     <= L1OutId_D[i]    ;
-            L2InLen_DP[i]    <= L1OutLen_D[i]   ;
-            L2InAddr_DP[i]   <= L1OutAddr_D[i]  ;
-         end
-      end // always_ff @ (posedge Clk_CI)
-
-      l2_tlb
-        #(
-          .AXI_S_ADDR_WIDTH       ( AXI_S_ADDR_WIDTH                                    ),
-          .AXI_M_ADDR_WIDTH       ( AXI_M_ADDR_WIDTH                                    ),
-          .AXI_LITE_DATA_WIDTH    ( AXI_LITE_DATA_WIDTH                                 ),
-          .AXI_LITE_ADDR_WIDTH    ( AXI_LITE_ADDR_WIDTH                                 ),
-          .N_SETS                 ( `RAB_L2_N_SETS                                      ),
-          .N_OFFSETS              ( `RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS       ),
-          .N_PAR_VA_RAMS          ( `RAB_L2_N_PAR_VA_RAMS                               ),
-          .HIT_OFFSET_STORE_WIDTH ( log2(`RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS) )
-          )
-      u_l2_tlb
-        (
-          .clk_i              ( Clk_CI           ),
-          .rst_ni             ( Rst_RBI          ),
-
-          // Config inputs
-          .we_i               ( L2CfgWE_S[i]     ),
-          .waddr_i            ( L2CfgWAddr_D[i]  ),
-          .wdata_i            ( L2CfgWData_D[i]  ),
-
-          // Request input
-          .start_i            ( L2InEn_S[i]      ),
-          .busy_o             ( L2Busy_S[i]      ),
-          .rw_type_i          ( L2InRwType_DP[i] ),
-          .in_addr_i          ( L2InAddr_DP[i]   ),
-
-          // Response output
-          .out_ready_i        ( L2OutReady_S[i]  ),
-          .out_valid_o        ( L2OutValid_S[i]  ),
-          .hit_o              ( L2OutHit_SN[i]   ),
-          .miss_o             ( L2OutMiss_SN[i]  ),
-          .prot_o             ( L2OutProt_SN[i]  ),
-          .multi_o            ( L2OutMulti_SN[i] ),
-          .cache_coherent_o   ( L2OutCC_SN[i]    ),
-          .out_addr_o         ( L2OutAddr_DN[i]  )
-        );
-
-      /*
-       * L2 output buffer
-       *
-       * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
-       */
-      always_ff @(posedge Clk_CI) begin : L2_OUT_BUF
-         if (Rst_RBI == 0) begin
-            L2OutRwType_DP[i] <= 1'b0;
-            L2OutUser_DP[i]   <=  'b0;
-            L2OutLen_DP[i]    <=  'b0;
-            L2OutId_DP[i]     <=  'b0;
-            L2OutInAddr_DP[i] <=  'b0;
-
-            L2OutHit_SP[i]    <= 1'b0;
-            L2OutMiss_SP[i]   <= 1'b0;
-            L2OutProt_SP[i]   <= 1'b0;
-            L2OutMulti_SP[i]  <= 1'b0;
-            L2OutCC_SP[i]     <= 1'b0;
-            L2OutAddr_DP[i]   <=  'b0;
-         end else if (L2OutEn_S[i] == 1'b1) begin
-            L2OutRwType_DP[i] <= L2InRwType_DP[i];
-            L2OutUser_DP[i]   <= L2InUser_DP[i]  ;
-            L2OutLen_DP[i]    <= L2InLen_DP[i]   ;
-            L2OutId_DP[i]     <= L2InId_DP[i]    ;
-            L2OutInAddr_DP[i] <= L2InAddr_DP[i]  ;
-
-            L2OutHit_SP[i]    <= L2OutHit_SN[i]  ;
-            L2OutMiss_SP[i]   <= L2OutMiss_SN[i] ;
-            L2OutProt_SP[i]   <= L2OutProt_SN[i] ;
-            L2OutMulti_SP[i]  <= L2OutMulti_SN[i];
-            L2OutCC_SP[i]     <= L2OutCC_SN[i]   ;
-            L2OutAddr_DP[i]   <= L2OutAddr_DN[i] ;
-         end
-      end // always_ff @ (posedge Clk_CI)
-
-      always_ff @(posedge Clk_CI) begin : BUF_VALID
-        if (Rst_RBI == 0) begin
-          L1DropValid_SP[i] = 1'b0;
-          L2OutValid_SP[i]  = 1'b0;
-        end else begin
-          L1DropValid_SP[i] = L1DropValid_SN[i];
-          L2OutValid_SP[i]  = L2OutValid_SN[i];
-        end
-      end
-
-      always_comb begin : BUF_TO_PREFETCH
-        // L1 Drop Buf
-        if (L1DropUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
-          L1DropPrefetch_S[i] = 1'b1;
-        else
-          L1DropPrefetch_S[i] = 1'b0;
-
-        // L2 Out Buf
-        if (L2OutUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
-          L2OutPrefetch_S[i]  = 1'b1;
-        else
-          L2OutPrefetch_S[i]  = 1'b0;
-      end
-
-      assign l2_cache_coherent[i] = L2OutCC_SP[i];
-      assign int_miss[i]          = L2Miss_S[i];
-
-    end else begin : L2_TLB_STUB // if (ENABLE_L2TLB[i] == 1)
-
-      assign l1_ar_drop[i]        = int_rtrans_drop[i];
-      assign l1_r_drop[i]         = int_rtrans_drop[i];
-      assign l1_xw_drop[i]        = int_wtrans_drop[i];
-
-      assign l1_ar_save[i]        = 1'b0;
-      assign l1_xw_save[i]        = 1'b0;
-      assign l2_xw_accept[i]      = 1'b0;
-      assign l2_xr_drop[i]        = 1'b0;
-      assign l2_xw_drop[i]        = 1'b0;
-
-      assign l2_ar_addr[i]        =  'b0;
-      assign l2_aw_addr[i]        =  'b0;
-
-      assign l1_id_drop[i]        = int_wtrans_drop[i] ? int_awid[i] :
-                                    int_rtrans_drop[i] ? int_arid[i] :
-                                    '0;
-      assign l1_len_drop[i]       = int_wtrans_drop[i] ? int_awlen[i] :
-                                    int_rtrans_drop[i] ? int_arlen[i] :
-                                    '0;
-      assign l1_prefetch_drop[i]  = rab_prefetch[i];
-      assign l1_hit_drop[i]       = ~rab_miss[i];
-
-      assign lx_id_drop[i]        = int_wtrans_drop[i] ? int_awid[i] :
-                                    int_rtrans_drop[i] ? int_arid[i] :
-                                    '0;
-      assign lx_len_drop[i]       = int_wtrans_drop[i] ? int_awlen[i] :
-                                    int_rtrans_drop[i] ? int_arlen[i] :
-                                    '0;
-      assign lx_prefetch_drop[i]  = rab_prefetch[i];
-      assign lx_hit_drop[i]       = ~rab_miss[i];
-
-      assign l2_cache_coherent[i] = 1'b0;
-
-      assign int_miss[i]          = rab_miss[i];
-      assign int_prot[i]          = rab_prot[i];
-      assign int_multi[i]         = rab_multi[i];
-
-      // unused signals
-      assign L2Miss_S[i]          = 1'b0;
-
-      assign L1OutRwType_D[i]     = 1'b0;
-      assign L1OutProt_D[i]       = 1'b0;
-      assign L1OutMulti_D[i]      = 1'b0;
-
-      assign L1DropRwType_DP[i]   = 1'b0;
-      assign L1DropUser_DP[i]     =  'b0;
-      assign L1DropId_DP[i]       =  'b0;
-      assign L1DropLen_DP[i]      =  'b0;
-      assign L1DropAddr_DP[i]     =  'b0;
-      assign L1DropProt_DP[i]     = 1'b0;
-      assign L1DropMulti_DP[i]    = 1'b0;
-
-      assign L1DropEn_S[i]        = 1'b0;
-      assign L1DropPrefetch_S[i]  = 1'b0;
-      assign L1DropValid_SN[i]    = 1'b0;
-      assign L1DropValid_SP[i]    = 1'b0;
-
-      assign L2InRwType_DP[i]     = 1'b0;
-      assign L2InUser_DP[i]       =  'b0;
-      assign L2InId_DP[i]         =  'b0;
-      assign L2InLen_DP[i]        =  'b0;
-      assign L2InAddr_DP[i]       =  'b0;
-
-      assign L2InEn_S[i]          = 1'b0;
-
-      assign L2OutHit_SN[i]       = 1'b0;
-      assign L2OutMiss_SN[i]      = 1'b0;
-      assign L2OutProt_SN[i]      = 1'b0;
-      assign L2OutMulti_SN[i]     = 1'b0;
-      assign L2OutCC_SN[i]        = 1'b0;
-      assign L2OutAddr_DN[i]      =  'b0;
-
-      assign L2OutRwType_DP[i]    = 1'b0;
-      assign L2OutUser_DP[i]      =  'b0;
-      assign L2OutId_DP[i]        =  'b0;
-      assign L2OutLen_DP[i]       =  'b0;
-      assign L2OutInAddr_DP[i]    =  'b0;
-      assign L2OutHit_SP[i]       = 1'b0;
-      assign L2OutMiss_SP[i]      = 1'b0;
-      assign L2OutProt_SP[i]      = 1'b0;
-      assign L2OutMulti_SP[i]     = 1'b0;
-      assign L2OutCC_SP[i]        = 1'b0;
-      assign L2OutAddr_DP[i]      =  'b0;
-
-      assign L2OutEn_S[i]         = 1'b0;
-      assign L2OutPrefetch_S[i]   = 1'b0;
-      assign L2Busy_S[i]          = 1'b0;
-      assign L2OutValid_S[i]      = 1'b0;
-      assign L2OutValid_SN[i]     = 1'b0;
-      assign L2OutValid_SP[i]     = 1'b0;
-      assign L2OutReady_S[i]      = 1'b0;
-
-    end // !`ifdef ENABLE_L2TLB
-  end // for (i = 0; i < N_PORTS; i++)
-  endgenerate
-
-// }}}
-"""
-# endmodule
-#
-#
-# // vim: ts=2 sw=2 sts=2 et nosmartindent autoindent foldmethod=marker
-#
-#
diff --git a/src/unused/iommu/axi_rab/check_ram.py b/src/unused/iommu/axi_rab/check_ram.py
deleted file mode 100644 (file)
index 31bf32e..0000000
+++ /dev/null
@@ -1,240 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class check_ram(Elaboratable):
-
-    def __init__(self):
-        self.clk_i = Signal()  # input
-        self.rst_ni = Signal()  # input
-        self.in_addr = Signal(ADDR_WIDTH)  # input
-        self.rw_type = Signal()  # input
-        self.ram_we = Signal()  # input
-        self.port0_addr = Signal(1+ERROR p_expression_25)  # input
-        self.port1_addr = Signal(1+ERROR p_expression_25)  # input
-        self.ram_wdata = Signal(RAM_DATA_WIDTH)  # input
-        self.output_sent = Signal()  # input
-        self.output_valid = Signal()  # input
-        self.offset_addr_d = Signal(OFFSET_WIDTH)  # input
-        self.hit_addr = Signal(1+ERROR p_expression_25)  # output
-        self.master = Signal()  # output
-        self.hit = Signal()  # output
-        self.multi_hit = Signal()  # output
-        self.prot = Signal()  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //import CfMath::log2;
-#
-# //`define MULTI_HIT_FULL_SET
-#
-# module check_ram
-#  //#(
-#  //  parameter ADDR_WIDTH     = 32,
-#   // parameter RAM_DATA_WIDTH = 32,
-#   // parameter PAGE_SIZE      = 4096, // 4kB
-#  //  parameter SET_WIDTH      = 5,
-#   // parameter OFFSET_WIDTH   = 4
-#   // )
-#  (
-#   input  logic                                clk_i,
-#   input  logic                                rst_ni,
-#   input  logic [ADDR_WIDTH-1:0]               in_addr,
-#   input  logic                                rw_type, // 1 => write, 0=> read
-#   input  logic                                ram_we,
-#   input  logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr,
-#   input  logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr,
-#   input  logic [RAM_DATA_WIDTH-1:0]           ram_wdata,
-#   input  logic                                output_sent,
-#   input  logic                                output_valid,
-#   input  logic [OFFSET_WIDTH-1:0]             offset_addr_d,
-#   output logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr,
-#   output logic                                master,
-#   output logic                                hit,
-#   output logic                                multi_hit,
-#   output logic                                prot
-#   );
-#
-"""   #docstring_begin
-
-   localparam IGNORE_LSB = log2(PAGE_SIZE); // 12
-
-   logic [RAM_DATA_WIDTH-1:0]           port0_data_o, port1_data_o; // RAM read data outputs
-   logic                                port0_hit, port1_hit; // Ram output matches in_addr
-
-    logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr_saved, port1_addr_saved;
-
-   // Hit FSM Signals
-   typedef enum                         logic {SEARCH, HIT} hit_state_t;
-   hit_state_t                          hit_SP; // Hit FSM state
-   hit_state_t                          hit_SN; // Hit FSM next state
-
-   // Multi Hit FSM signals
-`ifdef MULTI_HIT_FULL_SET
-   typedef enum                         logic[1:0] {NO_HITS, ONE_HIT, MULTI_HIT} multi_state_t;
-   multi_state_t                        multi_SP; // Multi Hit FSM state
-   multi_state_t                        multi_SN; // Multi Hit FSM next state
-
-   logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_saved;
-   logic                                master_saved;
-`endif
-
-  //// --------------- Block RAM (Dual Port) -------------- ////
-
-  // The outputs of the BRAMs are only valid if in the previous cycle:
-  // 1. the inputs were valid, and
-  // 2. the BRAM was not written to.
-  // Otherwise, the outputs must be ignored which is controlled by the output_valid signal.
-  // This signal is driven by the uppler level L2 TLB module.
-  ram_tp_no_change #(
-      .ADDR_WIDTH( SET_WIDTH+OFFSET_WIDTH+1 ),
-      .DATA_WIDTH( RAM_DATA_WIDTH           )
-    )
-    ram_tp_no_change_0
-    (
-      .clk   ( clk_i         ),
-      .we    ( ram_we        ),
-      .addr0 ( port0_addr    ),
-      .addr1 ( port1_addr    ),
-      .d_i   ( ram_wdata     ),
-      .d0_o  ( port0_data_o  ),
-      .d1_o  ( port1_data_o  )
-    );
-
-   //// Check Ram Outputs
-   assign port0_hit = (port0_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port0_data_o[RAM_DATA_WIDTH-1:4]);
-   assign port1_hit = (port1_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port1_data_o[RAM_DATA_WIDTH-1:4]);
-   //// ----------------------------------------------------- /////
-
-   //// ------------------- Check if Hit ------------------------ ////
-   // FSM
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         hit_SP <= SEARCH;
-      end else begin
-         hit_SP <= hit_SN;
-      end
-   end
-
-   always_ff @(posedge clk_i, negedge rst_ni) begin
-       if (!rst_ni) begin
-           port0_addr_saved <= '0;
-           port1_addr_saved <= '0;
-       end else begin
-           port0_addr_saved <= port0_addr;
-           port1_addr_saved <= port1_addr;
-       end
-   end
-
-   always_comb begin
-      hit_SN   = hit_SP;
-      hit      = 1'b0;
-      hit_addr = 0;
-      master   = 1'b0;
-      unique case(hit_SP)
-        SEARCH :
-          if (output_valid)
-            if (port0_hit || port1_hit) begin
-               hit_SN   = HIT;
-               hit      = 1'b1;
-               hit_addr = port0_hit ? {port0_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
-                          port1_hit ? {port1_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
-                          0;
-               master   = port0_hit ? port0_data_o[3] :
-                          port1_hit ? port1_data_o[3] :
-                          1'b0;
-            end
-
-        HIT : begin
-`ifdef MULTI_HIT_FULL_SET // Since the search continues after the first hit, it needs to be saved to be accessed later.
-           hit      = 1'b1;
-           hit_addr = hit_addr_saved;
-           master   = master_saved;
-`endif
-           if (output_sent)
-             hit_SN = SEARCH;
-        end
-
-        default : begin
-           hit_SN = SEARCH;
-        end
-      endcase // case (hit_SP)
-   end // always_comb begin
-
-   //// ------------------------------------------- ////
-
-   assign prot = output_valid && port0_hit ? ((~port0_data_o[2] && rw_type) || (~port0_data_o[1] && ~rw_type)) :
-                 output_valid && port1_hit ? ((~port1_data_o[2] && rw_type) || (~port1_data_o[1] && ~rw_type)) :
-                 1'b0;
-
-   //// ------------------- Multi ------------------- ////
-`ifdef MULTI_HIT_FULL_SET
-
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         hit_addr_saved <= 0;
-         master_saved   <= 1'b0;
-      end else if (output_valid) begin
-         hit_addr_saved <= hit_addr;
-         master_saved   <= master;
-      end
-   end
-
-   // FSM
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         multi_SP <= NO_HITS;
-      end else begin
-         multi_SP <= multi_SN;
-      end
-   end
-
-   always_comb begin
-      multi_SN  = multi_SP;
-      multi_hit = 1'b0;
-      unique case(multi_SP)
-        NO_HITS :
-          if(output_valid && (port0_hit && port1_hit)) begin
-             multi_SN  = MULTI_HIT;
-             multi_hit = 1'b1;
-          end else if(output_valid && (port0_hit || port1_hit))
-            multi_SN = ONE_HIT;
-
-        ONE_HIT :
-          if(output_valid && (port0_hit || port1_hit)) begin
-             multi_SN  = MULTI_HIT;
-             multi_hit = 1'b1;
-          end else if (output_sent)
-            multi_SN = NO_HITS;
-
-        MULTI_HIT : begin
-          multi_hit = 1'b1;
-           if (output_sent)
-             multi_SN = NO_HITS;
-        end
-
-      endcase // case (multi_SP)
-   end // always_comb begin
-
-`else // !`ifdef MULTI_HIT_FULL_SET
-   assign multi_hit = output_valid && port0_hit && port1_hit;
-`endif // !`ifdef MULTI_HIT_FULL_SET
-   //// ------------------------------------------- ////
-"""
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/coreconfig.py b/src/unused/iommu/axi_rab/coreconfig.py
deleted file mode 100644 (file)
index 247d0ce..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-class CoreConfig:
-    def __init__(self):
-        self.N_SLICES = 16
-        self.N_REGS = 4*self.N_SLICES
-        self.ADDR_WIDTH_PHYS = 40
-        self.ADDR_WIDTH_VIRT = 32
diff --git a/src/unused/iommu/axi_rab/fsm.py b/src/unused/iommu/axi_rab/fsm.py
deleted file mode 100644 (file)
index d64b1cb..0000000
+++ /dev/null
@@ -1,243 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class fsm(Elaboratable):
-
-    def __init__(self):
-        self.Clk_CI = Signal()  # input
-        self.Rst_RBI = Signal()  # input
-        self.port1_addr_valid_i = Signal()  # input
-        self.port2_addr_valid_i = Signal()  # input
-        self.port1_sent_i = Signal()  # input
-        self.port2_sent_i = Signal()  # input
-        self.select_i = Signal()  # input
-        self.no_hit_i = Signal()  # input
-        self.multi_hit_i = Signal()  # input
-        self.no_prot_i = Signal()  # input
-        self.prefetch_i = Signal()  # input
-        self.out_addr_i = Signal(AXI_M_ADDR_WIDTH)  # input
-        self.cache_coherent_i = Signal()  # input
-        self.port1_accept_o = Signal()  # output
-        self.port1_drop_o = Signal()  # output
-        self.port1_miss_o = Signal()  # output
-        self.port2_accept_o = Signal()  # output
-        self.port2_drop_o = Signal()  # output
-        self.port2_miss_o = Signal()  # output
-        self.out_addr_o = Signal(AXI_M_ADDR_WIDTH)  # output
-        self.cache_coherent_o = Signal()  # output
-        self.miss_o = Signal()  # output
-        self.multi_o = Signal()  # output
-        self.prot_o = Signal()  # output
-        self.prefetch_o = Signal()  # output
-        self.in_addr_i = Signal(AXI_S_ADDR_WIDTH)  # input
-        self.in_id_i = Signal(AXI_ID_WIDTH)  # input
-        self.in_len_i = Signal(8)  # input
-        self.in_user_i = Signal(AXI_USER_WIDTH)  # input
-        self.in_addr_o = Signal(AXI_S_ADDR_WIDTH)  # output
-        self.in_id_o = Signal(AXI_ID_WIDTH)  # output
-        self.in_len_o = Signal(8)  # output
-        self.in_user_o = Signal(AXI_USER_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //`timescale 1ns / 1ps
-#
-# module fsm
-#  #(
-#    parameter AXI_M_ADDR_WIDTH = 40,
-#    parameter AXI_S_ADDR_WIDTH = 32,
-#    parameter AXI_ID_WIDTH     = 8,
-#    parameter AXI_USER_WIDTH   = 6
-#  )
-#  (
-#    input  logic                        Clk_CI,
-#    input  logic                        Rst_RBI,
-#
-#    input  logic                        port1_addr_valid_i,
-#    input  logic                        port2_addr_valid_i,
-#    input  logic                        port1_sent_i,
-#    input  logic                        port2_sent_i,
-#    input  logic                        select_i,
-#    input  logic                        no_hit_i,
-#    input  logic                        multi_hit_i,
-#    input  logic                        no_prot_i,
-#    input  logic                        prefetch_i,
-#    input  logic [AXI_M_ADDR_WIDTH-1:0] out_addr_i,
-#    input  logic                        cache_coherent_i,
-#    output logic                        port1_accept_o,
-#    output logic                        port1_drop_o,
-#    output logic                        port1_miss_o,
-#    output logic                        port2_accept_o,
-#    output logic                        port2_drop_o,
-#    output logic                        port2_miss_o,
-#    output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o,
-#    output logic                        cache_coherent_o,
-#    output logic                        miss_o,
-#    output logic                        multi_o,
-#    output logic                        prot_o,
-#    output logic                        prefetch_o,
-#    input  logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
-#    input  logic     [AXI_ID_WIDTH-1:0] in_id_i,
-#    input  logic                  [7:0] in_len_i,
-#    input  logic   [AXI_USER_WIDTH-1:0] in_user_i,
-#    output logic [AXI_S_ADDR_WIDTH-1:0] in_addr_o,
-#    output logic     [AXI_ID_WIDTH-1:0] in_id_o,
-#    output logic                  [7:0] in_len_o,
-#    output logic   [AXI_USER_WIDTH-1:0] in_user_o
-#  );
-#
-"""  #docstring_begin
-
-  //-------------Internal Signals----------------------
-
-  typedef enum logic           {IDLE, WAIT} state_t;
-  logic                        state_SP; // Present state
-  logic                        state_SN; // Next State
-
-  logic                        port1_accept_SN;
-  logic                        port1_drop_SN;
-  logic                        port1_miss_SN;
-  logic                        port2_accept_SN;
-  logic                        port2_drop_SN;
-  logic                        port2_miss_SN;
-  logic                        miss_SN;
-  logic                        multi_SN;
-  logic                        prot_SN;
-  logic                        prefetch_SN;
-  logic                        cache_coherent_SN;
-  logic [AXI_M_ADDR_WIDTH-1:0] out_addr_DN;
-
-  logic                        out_reg_en_S;
-
-  //----------FSM comb------------------------------
-
-  always_comb begin: FSM_COMBO
-    state_SN          = state_SP;
-
-    port1_accept_SN   = 1'b0;
-    port1_drop_SN     = 1'b0;
-    port1_miss_SN     = 1'b0;
-    port2_accept_SN   = 1'b0;
-    port2_drop_SN     = 1'b0;
-    port2_miss_SN     = 1'b0;
-    miss_SN           = 1'b0;
-    multi_SN          = 1'b0;
-    prot_SN           = 1'b0;
-    prefetch_SN       = 1'b0;
-    cache_coherent_SN = 1'b0;
-    out_addr_DN       =   '0;
-
-    out_reg_en_S      = 1'b0; // by default hold register output
-
-    unique case(state_SP)
-        IDLE :
-          if ( (port1_addr_valid_i & select_i) | (port2_addr_valid_i & ~select_i) ) begin
-            out_reg_en_S = 1'b1;
-            state_SN     = WAIT;
-
-            // Select inputs for output registers
-            if          (port1_addr_valid_i & select_i) begin
-              port1_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
-              port1_drop_SN   =  (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
-              port1_miss_SN   =   no_hit_i;
-              port2_accept_SN = 1'b0;
-              port2_drop_SN   = 1'b0;
-              port2_miss_SN   = 1'b0;
-            end else if (port2_addr_valid_i & ~select_i) begin
-              port1_accept_SN = 1'b0;
-              port1_drop_SN   = 1'b0;
-              port1_miss_SN   = 1'b0;
-              port2_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
-              port2_drop_SN   =  (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
-              port2_miss_SN   =   no_hit_i;
-            end
-
-            miss_SN           = port1_miss_SN | port2_miss_SN;
-            multi_SN          = multi_hit_i;
-            prot_SN           = ~no_prot_i;
-            prefetch_SN       = ~no_hit_i & prefetch_i;
-
-            cache_coherent_SN = cache_coherent_i;
-            out_addr_DN       = out_addr_i;
-          end
-
-        WAIT :
-          if ( port1_sent_i | port2_sent_i ) begin
-            out_reg_en_S = 1'b1; // "clear" the register
-            state_SN     = IDLE;
-          end
-
-        default : begin
-           state_SN      = IDLE;
-        end
-      endcase
-    end
-
-  //----------FSM seq-------------------------------
-
-  always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: FSM_SEQ
-    if (Rst_RBI == 1'b0)
-      state_SP <= IDLE;
-    else
-      state_SP <= state_SN;
-  end
-
-  //----------Output seq--------------------------
-
-  always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: OUTPUT_SEQ
-    if (Rst_RBI == 1'b0) begin
-      port1_accept_o   = 1'b0;
-      port1_drop_o     = 1'b0;
-      port1_miss_o     = 1'b0;
-      port2_accept_o   = 1'b0;
-      port2_drop_o     = 1'b0;
-      port2_miss_o     = 1'b0;
-      miss_o           = 1'b0;
-      multi_o          = 1'b0;
-      prot_o           = 1'b0;
-      prefetch_o       = 1'b0;
-      cache_coherent_o = 1'b0;
-      out_addr_o       =   '0;
-      in_addr_o        =   '0;
-      in_id_o          =   '0;
-      in_len_o         =   '0;
-      in_user_o        =   '0;
-    end else if (out_reg_en_S == 1'b1) begin
-      port1_accept_o   = port1_accept_SN;
-      port1_drop_o     = port1_drop_SN;
-      port1_miss_o     = port1_miss_SN;
-      port2_accept_o   = port2_accept_SN;
-      port2_drop_o     = port2_drop_SN;
-      port2_miss_o     = port2_miss_SN;
-      miss_o           = miss_SN;
-      multi_o          = multi_SN;
-      prot_o           = prot_SN;
-      prefetch_o       = prefetch_SN;
-      cache_coherent_o = cache_coherent_SN;
-      out_addr_o       = out_addr_DN;
-      in_addr_o        = in_addr_i;
-      in_id_o          = in_id_i;
-      in_len_o         = in_len_i;
-      in_user_o        = in_user_i;
-    end
-  end // block: OUTPUT_SEQ
-"""
-#
-# endmodule
-#
-#
diff --git a/src/unused/iommu/axi_rab/l2_tlb.py b/src/unused/iommu/axi_rab/l2_tlb.py
deleted file mode 100644 (file)
index 11983f6..0000000
+++ /dev/null
@@ -1,550 +0,0 @@
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class l2_tlb(Elaboratable):
-
-    def __init__(self):
-        self.clk_i = Signal()  # input
-        self.rst_ni = Signal()  # input
-        self.we_i = Signal()  # input
-        self.waddr_i = Signal(AXI_LITE_ADDR_WIDTH)  # input
-        self.wdata_i = Signal(AXI_LITE_DATA_WIDTH)  # input
-        self.start_i = Signal()  # input
-        self.busy_o = Signal()  # output
-        self.in_addr_i = Signal(AXI_S_ADDR_WIDTH)  # input
-        self.rw_type_i = Signal()  # input
-        self.out_ready_i = Signal()  # input
-        self.out_valid_o = Signal()  # output
-        self.hit_o = Signal()  # output
-        self.miss_o = Signal()  # output
-        self.prot_o = Signal()  # output
-        self.multi_o = Signal()  # output
-        self.cache_coherent_o = Signal()  # output
-        self.out_addr_o = Signal(AXI_M_ADDR_WIDTH)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# //`include "pulp_soc_defines.sv"
-#
-# ////import CfMath::log2;
-#
-# //`define MULTI_HIT_FULL_SET  // Enable full multi hit detection. Always the entire set is searched.
-# //`define MULTI_HIT_CUR_CYCLE // Enable partial multi hit detection. Only multi hits in the same search cycle are detected.
-#
-# //`ifdef MULTI_HIT_FULL_SET
-# //  `ifndef MULTI_HIT_CUR_CYCLE
-# //    `define MULTI_HIT_CUR_CYCLE
-# //  `endif
-# //`endif
-#
-# module l2_tlb
-#  //#(
-#  //  parameter AXI_S_ADDR_WIDTH       = 32,
-#   // parameter AXI_M_ADDR_WIDTH       = 40,
-#  //  parameter AXI_LITE_DATA_WIDTH    = 64,
-#   // parameter AXI_LITE_ADDR_WIDTH    = 32,
-#   // parameter N_SETS                 = 32,
-#   // parameter N_OFFSETS              = 4, //per port. There are 2 ports.
-#  //  parameter PAGE_SIZE              = 4096, // 4kB
-#  //  parameter N_PAR_VA_RAMS          = 4,
-#  //  parameter HIT_OFFSET_STORE_WIDTH = 2 // Num of bits of VA RAM offset stored. This should not be greater than OFFSET_WIDTH
-#  //  )
-#   (
-#    input  logic                           clk_i,
-#    input  logic                           rst_ni,
-#
-#    input  logic                           we_i,
-#    input  logic [AXI_LITE_ADDR_WIDTH-1:0] waddr_i,
-#    input  logic [AXI_LITE_DATA_WIDTH-1:0] wdata_i,
-#
-#    input  logic                           start_i,
-#    output logic                           busy_o,
-#    input  logic    [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
-#    input  logic                           rw_type_i, //1 => write, 0=> read
-#
-#    input  logic                           out_ready_i,
-#    output logic                           out_valid_o,
-#    output logic                           hit_o,
-#    output logic                           miss_o,
-#    output logic                           prot_o,
-#    output logic                           multi_o,
-#    output logic                           cache_coherent_o,
-#    output logic    [AXI_M_ADDR_WIDTH-1:0] out_addr_o
-#    );
-#
-"""    #docstring_begin
-
-   localparam VA_RAM_DEPTH      = N_SETS * N_OFFSETS * 2;
-   localparam PA_RAM_DEPTH      = VA_RAM_DEPTH * N_PAR_VA_RAMS;
-   localparam VA_RAM_ADDR_WIDTH = log2(VA_RAM_DEPTH);
-   localparam PA_RAM_ADDR_WIDTH = log2(PA_RAM_DEPTH);
-   localparam SET_WIDTH         = log2(N_SETS);
-   localparam OFFSET_WIDTH      = log2(N_OFFSETS);
-   localparam LL_WIDTH          = log2(N_PAR_VA_RAMS);
-   localparam IGNORE_LSB        = log2(PAGE_SIZE);
-
-   localparam VA_RAM_DATA_WIDTH = AXI_S_ADDR_WIDTH - IGNORE_LSB + 4;
-   localparam PA_RAM_DATA_WIDTH = AXI_M_ADDR_WIDTH - IGNORE_LSB;
-
-   logic                               [N_PAR_VA_RAMS-1:0] hit, prot, multi_hit, cache_coherent;
-   logic                               [N_PAR_VA_RAMS-1:0] ram_we;
-   logic                                                   last_search, last_search_next;
-   logic                                                   first_search, first_search_next;
-   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] ram_waddr;
-   logic [N_PAR_VA_RAMS-1:0][SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr;
-   logic                                                   pa_ram_we;
-   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr, pa_port0_waddr; // PA RAM read, Write addr;
-   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr_reg_SN, pa_port0_raddr_reg_SP; // registered addresses, needed for WAIT_ON_WRITE;
-   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_addr; // PA RAM addr
-   logic                           [PA_RAM_DATA_WIDTH-1:0] pa_port0_data, pa_data, pa_port0_data_reg; // PA RAM data
-   logic                                                   pa_ram_store_data_SN, pa_ram_store_data_SP;
-   logic                                                   hit_top, prot_top, multi_hit_top, first_hit_top;
-   logic                                                   output_sent;
-   int                                                     hit_block_num;
-
-   logic                                                   searching, search_done;
-   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr, port0_raddr; // VA RAM port0 addr
-   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr; // VA RAM port1 addr
-   logic                                [OFFSET_WIDTH-1:0] offset_addr, offset_addr_d;
-   logic                                [OFFSET_WIDTH-1:0] offset_start_addr, offset_end_addr;
-   logic                                   [SET_WIDTH-1:0] set_num;
-
-   logic                                                   va_output_valid;
-   logic                                                   searching_q;
-
-   genvar                                                  z;
-
-   // Search FSM
-   typedef enum logic                                [1:0] {IDLE, SEARCH, DONE} search_state_t;
-   search_state_t                                          search_SP; // Present state
-   search_state_t                                          search_SN; // Next State
-
-   // Output FSM
-   typedef enum logic                                [1:0] {OUT_IDLE, SEND_OUTPUT, WAIT_ON_WRITE} out_state_t;
-   out_state_t                                             out_SP; // Present state
-   out_state_t                                             out_SN; // Next State
-
-   logic                                                   miss_next;
-   logic                                                   hit_next;
-   logic                                                   prot_next;
-   logic                                                   multi_next;
-   logic                                                   cache_coherent_next;
-
-   // Generate the VA Block rams and their surrounding logic
-   generate
-      for (z = 0; z < N_PAR_VA_RAMS; z++) begin : VA_RAMS
-         check_ram
-           #(
-             .ADDR_WIDTH     ( AXI_S_ADDR_WIDTH  ),
-             .RAM_DATA_WIDTH ( VA_RAM_DATA_WIDTH ),
-             .PAGE_SIZE      ( PAGE_SIZE         ),
-             .SET_WIDTH      ( SET_WIDTH         ),
-             .OFFSET_WIDTH   ( OFFSET_WIDTH      )
-             )
-         u_check_ram
-             (
-              .clk_i         ( clk_i                          ),
-              .rst_ni        ( rst_ni                         ),
-              .in_addr       ( in_addr_i                      ),
-              .rw_type       ( rw_type_i                      ),
-              .ram_we        ( ram_we[z]                      ),
-              .port0_addr    ( port0_addr                     ),
-              .port1_addr    ( port1_addr                     ),
-              .ram_wdata     ( wdata_i[VA_RAM_DATA_WIDTH-1:0] ),
-              .output_sent   ( output_sent                    ),
-              .output_valid  ( va_output_valid                ),
-              .offset_addr_d ( offset_addr_d                  ),
-              .hit_addr      ( hit_addr[z]                    ),
-              .master        ( cache_coherent[z]              ),
-              .hit           ( hit[z]                         ),
-              .multi_hit     ( multi_hit[z]                   ),
-              .prot          ( prot[z]                        )
-              );
-      end // for (z = 0; z < N_PORTS; z++)
-   endgenerate
-
-   ////////////////// ---------------- Control and Address --------------- ////////////////////////
-   // FSM
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         search_SP <= IDLE;
-      end else begin
-         search_SP <= search_SN;
-      end
-   end
-
-   always_comb begin : SEARCH_FSM
-      search_SN         = search_SP;
-      busy_o            = 1'b0;
-      searching         = 1'b0;
-      search_done       = 1'b0;
-      last_search_next  = 1'b0;
-      first_search_next = first_search;
-
-      unique case (search_SP)
-        IDLE : begin
-          if (start_i) begin
-            search_SN         = SEARCH;
-            first_search_next = 1'b1;
-          end
-        end
-
-        SEARCH : begin
-          busy_o = 1'b1;
-
-          // detect last search cycle
-          if ( (first_search == 1'b0) && (offset_addr == offset_end_addr) )
-             last_search_next  = 1'b1;
-
-          // pause search during VA RAM reconfigration
-          if (|ram_we) begin
-             searching         = 1'b0;
-          end else begin
-             searching         = 1'b1;
-             first_search_next = 1'b0;
-          end
-
-          if (va_output_valid) begin
-            // stop search
-`ifdef MULTI_HIT_FULL_SET
-            if (last_search | prot_top | multi_hit_top) begin
-`else
-            if (last_search | prot_top | multi_hit_top | hit_top ) begin
-`endif
-              search_SN      = DONE;
-              search_done    = 1'b1;
-            end
-          end
-        end
-
-        DONE : begin
-          busy_o = 1'b1;
-          if (out_valid_o & out_ready_i)
-            search_SN = IDLE;
-        end
-
-        default : begin
-          search_SN = IDLE;
-        end
-      endcase // case (prot_SP)
-   end // always_comb begin
-
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         last_search  <= 1'b0;
-         first_search <= 1'b0;
-      end else begin
-         last_search  <= last_search_next;
-         first_search <= first_search_next;
-      end
-   end
-
-   /*
-    * VA RAM address generation
-    *
-    * The input address and set number, and thus the offset start address, are available in the
-    * cycle after the start signal. The buffered offset_addr becomes available one cycle later.
-    * During the first search cycle, we therefore directly use offset_addr_start for the lookup.
-    */
-   assign set_num = in_addr_i[SET_WIDTH+IGNORE_LSB -1 : IGNORE_LSB];
-
-   assign port0_raddr[OFFSET_WIDTH] = 1'b0;
-   assign port1_addr [OFFSET_WIDTH] = 1'b1;
-
-   assign port0_raddr[OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
-   assign port1_addr [OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
-
-   assign port0_raddr[SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
-   assign port1_addr [SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
-
-   assign port0_addr = ram_we ? ram_waddr : port0_raddr;
-
-   // The outputs of the BRAMs are only valid if in the previous cycle:
-   // 1. the inputs were valid, and
-   // 2. the BRAMs were not written to.
-   // Otherwise, the outputs must be ignored.
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         searching_q <= 1'b0;
-      end else begin
-         searching_q <= searching;
-      end
-   end
-   assign va_output_valid = searching_q;
-
-   // Address offset for looking up the VA RAMs
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         offset_addr   <= 0;
-      end else if (first_search) begin
-         offset_addr <= offset_start_addr + 1'b1;
-      end else if (searching) begin
-         offset_addr <= offset_addr + 1'b1;
-      end
-   end
-
-   // Delayed address offest for looking up the PA RAM upon a hit in the VA RAMs
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         offset_addr_d <= 0;
-      end else if (first_search) begin
-         offset_addr_d <= offset_start_addr;
-      end else if (searching) begin
-         offset_addr_d <= offset_addr_d + 1'b1;
-      end
-   end
-
-   // Store the offset addr for hit to reduce latency for next search.
-   generate
-      if (HIT_OFFSET_STORE_WIDTH > 0) begin : OFFSET_STORE
-`ifndef MULTI_HIT_FULL_SET
-         logic [N_SETS-1:0][HIT_OFFSET_STORE_WIDTH-1:0] hit_offset_addr; // Contains offset addr for previous hit for every SET.
-         logic [SET_WIDTH+OFFSET_WIDTH+1-1:0]           hit_addr_reg;
-
-         assign offset_start_addr = { hit_offset_addr[set_num] , {{OFFSET_WIDTH-HIT_OFFSET_STORE_WIDTH}{1'b0}} };
-         assign offset_end_addr   =   hit_offset_addr[set_num]-1'b1;
-
-         // Register the hit addr
-         always_ff @(posedge clk_i) begin
-            if (rst_ni == 0) begin
-               hit_addr_reg <= 0;
-            end else if (hit_top) begin
-               hit_addr_reg <= hit_addr[hit_block_num];
-            end
-         end
-
-         // Store hit addr for each set. The next search in the same set will start from the saved addr.
-         always_ff @(posedge clk_i) begin
-            if (rst_ni == 0) begin
-               hit_offset_addr <= 0;
-            end else if (hit_o) begin
-               hit_offset_addr[set_num][HIT_OFFSET_STORE_WIDTH-1:0] <= hit_addr_reg[OFFSET_WIDTH-1 : (OFFSET_WIDTH - HIT_OFFSET_STORE_WIDTH)];
-            end
-         end
-`else // No need to store offset if full multi hit detection is enabled because the entire SET is searched.
-         assign offset_start_addr = 0;
-         assign offset_end_addr   = {OFFSET_WIDTH{1'b1}};
-`endif
-      end else begin // if (HIT_OFFSET_STORE_WIDTH > 0)
-         assign offset_start_addr = 0;
-         assign offset_end_addr   = {OFFSET_WIDTH{1'b1}};
-      end
-   endgenerate
-
-   assign prot_top = |prot;
-
-   //////////////////////////////////////////////////////////////////////////////////////
-   // check for hit, multi hit
-   // In case of a multi hit, the hit_block_num indicates the lowest VA RAM with a hit.
-   // In case of a multi hit in the same VA RAM, Port 0 is given priority.
-   always_comb begin : HIT_CHECK
-      hit_top       = |hit;
-      hit_block_num = 0;
-      first_hit_top = 1'b0;
-      multi_hit_top = 1'b0;
-      for (int i=N_PAR_VA_RAMS-1; i>=0; i--) begin
-        if (hit[i] == 1'b1) begin
-`ifdef MULTI_HIT_CUR_CYCLE
-          if (multi_hit[i] | first_hit_top ) begin
-            multi_hit_top = 1'b1;
-          end
-`endif
-          first_hit_top = 1'b1;
-          hit_block_num = i;
-        end
-      end // for (int i=0; i<N_PAR_VA_RAMS; i++)
-   end // always_comb begin
-
-   ///////////////////// ------------- Outputs ------------ //////////////////////////////////
-   //// FSM
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         out_SP                     <= OUT_IDLE;
-         pa_ram_store_data_SP       <= 1'b0;
-         pa_port0_raddr_reg_SP      <=  'b0;
-      end else begin
-         out_SP                     <= out_SN;
-         pa_ram_store_data_SP       <= pa_ram_store_data_SN;
-         pa_port0_raddr_reg_SP      <= pa_port0_raddr_reg_SN;
-      end
-   end
-
-   always_comb begin : OUTPUT_FSM
-      out_SN                   = out_SP;
-
-      miss_next                = miss_o;
-      prot_next                = prot_o;
-      multi_next               = multi_o;
-      hit_next                 = hit_o;
-      cache_coherent_next      = cache_coherent_o;
-      pa_port0_raddr_reg_SN    = pa_port0_raddr_reg_SP;
-
-      pa_port0_raddr           =  'b0;
-      pa_ram_store_data_SN     = 1'b0;
-
-      out_valid_o              = 1'b0;
-      output_sent              = 1'b0;
-
-      unique case (out_SP)
-        OUT_IDLE : begin
-           hit_next            = 1'b0;
-           miss_next           = 1'b0;
-           prot_next           = 1'b0;
-           multi_next          = 1'b0;
-           cache_coherent_next = 1'b0;
-
-          // abort transaction
-          if         ((search_done & ~hit_top) | prot_top | multi_hit_top) begin
-             out_SN = SEND_OUTPUT;
-
-             if (search_done & ~hit_top) begin
-                miss_next  = 1'b1;
-             end
-             if (prot_top) begin
-                prot_next  = 1'b1;
-                hit_next   = 1'b1;
-             end
-             if (multi_hit_top) begin
-                multi_next = 1'b1;
-                hit_next   = 1'b1;
-             end
-
-          // read PA RAM
-          end else if (search_done & hit_top) begin
-             hit_next              = 1'b1;
-             cache_coherent_next   = cache_coherent[hit_block_num];
-             pa_port0_raddr        = (N_PAR_VA_RAMS * hit_addr[hit_block_num]) + hit_block_num;
-             pa_port0_raddr_reg_SN = pa_port0_raddr;
-
-             // read PA RAM now
-             if (~pa_ram_we) begin
-                out_SN               = SEND_OUTPUT;
-                pa_ram_store_data_SN = 1'b1;
-
-             // read PA RAM after PA RAM reconfiguration
-             end else begin // pa_ram_we
-                out_SN               = WAIT_ON_WRITE;
-
-             end
-          end
-        end
-
-        WAIT_ON_WRITE : begin
-          if ( ~pa_ram_we ) begin
-             out_SN               = SEND_OUTPUT;
-             pa_port0_raddr       = pa_port0_raddr_reg_SP;
-             pa_ram_store_data_SN = 1'b1;
-          end
-        end
-
-        SEND_OUTPUT : begin
-           out_valid_o  = 1'b1;
-           if (out_ready_i) begin
-              out_SN      = OUT_IDLE;
-              output_sent = 1'b1;
-           end
-        end
-
-        default : begin
-           out_SN = OUT_IDLE;
-        end
-
-      endcase // case (out_SP)
-   end // always_comb begin
-
-   //// Output signals
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         miss_o           <= 1'b0;
-         prot_o           <= 1'b0;
-         multi_o          <= 1'b0;
-         hit_o            <= 1'b0;
-         cache_coherent_o <= 1'b0;
-      end else begin
-         miss_o           <= miss_next;
-         prot_o           <= prot_next;
-         multi_o          <= multi_next;
-         hit_o            <= hit_next;
-         cache_coherent_o <= cache_coherent_next;
-      end
-   end
-
-   ///////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-  ///////////////////// --------------- Physical Address -------------- ////////////////////////////
-
-  /// PA Block RAM
-  ram_tp_no_change #(
-        .ADDR_WIDTH( PA_RAM_ADDR_WIDTH ),
-        .DATA_WIDTH( PA_RAM_DATA_WIDTH )
-        )
-  pa_ram
-    (
-      .clk   ( clk_i                          ),
-      .we    ( pa_ram_we                      ),
-      .addr0 ( pa_port0_addr                  ),
-      .addr1 ( '0                             ),
-      .d_i   ( wdata_i[PA_RAM_DATA_WIDTH-1:0] ),
-      .d0_o  ( pa_port0_data                  ),
-      .d1_o  (                                )
-    );
-
-   assign out_addr_o[IGNORE_LSB-1:0]                = in_addr_i[IGNORE_LSB-1:0];
-   assign out_addr_o[AXI_M_ADDR_WIDTH-1:IGNORE_LSB] = pa_data;
-
-   always_ff @(posedge clk_i) begin
-      if (rst_ni == 0) begin
-         pa_port0_data_reg <= 0;
-      end else if (pa_ram_store_data_SP) begin
-         pa_port0_data_reg <= pa_port0_data;
-      end
-   end
-
-   assign pa_data = pa_ram_store_data_SP ? pa_port0_data : pa_port0_data_reg;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-///// Write enable for all block rams
-generate if (LL_WIDTH != 0) begin
-   always_comb begin
-      var reg[LL_WIDTH:0] para;
-      var int             para_int;
-      for (para = 0; para < N_PAR_VA_RAMS; para=para+1'b1) begin
-        para_int         = int'(para);
-        ram_we[para_int] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0) && (waddr_i[LL_WIDTH-1:0] == para);
-      end
-   end
-end else begin
-   assign ram_we[0] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0);
-end
-
-endgenerate
-
-// Addresses are word, not byte addresses
-assign pa_ram_we      = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b1); //waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] will be 0 for all VA writes and 1 for all PA writes
-assign ram_waddr      = waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH-1:LL_WIDTH];
-assign pa_port0_waddr = waddr_i[PA_RAM_ADDR_WIDTH-1:0];
-assign pa_port0_addr  = pa_ram_we ? pa_port0_waddr : pa_port0_raddr;
-
-"""
-# endmodule
-#
-# // vim: ts=3 sw=3 sts=3 et nosmartindent autoindent foldmethod=marker tw=100
-#
-#
diff --git a/src/unused/iommu/axi_rab/rab_core.py b/src/unused/iommu/axi_rab/rab_core.py
deleted file mode 100644 (file)
index 7d7494a..0000000
+++ /dev/null
@@ -1,539 +0,0 @@
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-
-# this file has been generated by sv2nmigen
-
-#
-# //`include "pulp_soc_defines.sv"
-#
-# ////import CfMath::log2;
-#
-# //`define MY_ARRAY_SUM(MY_ARRAY,ARRAY_SIZE) ( (ARRAY_SIZE==1) ? MY_ARRAY[0] : (ARRAY_SIZE==2) ? MY_ARRAY[0] + MY_ARRAY[1] : (ARRAY_SIZE==3) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] : (ARRAY_SIZE==4) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] + MY_ARRAY[3] : 0 )
-#
-
-# module rab_core
-#  #(
-#    parameter N_PORTS             =  3,
-#    parameter N_L2_SETS           = 32,
-#    parameter N_L2_SET_ENTRIES    = 32,
-#    parameter AXI_DATA_WIDTH      = 64,
-#    parameter AXI_S_ADDR_WIDTH    = 32,
-#    parameter AXI_M_ADDR_WIDTH    = 40,
-#    parameter AXI_LITE_DATA_WIDTH = 64,
-#    parameter AXI_LITE_ADDR_WIDTH = 32,
-#    parameter AXI_ID_WIDTH        =  8,
-#    parameter AXI_USER_WIDTH      =  6,
-#    parameter MH_FIFO_DEPTH       = 16
-#    )
-#   (
-#    input  logic                                         Clk_CI,
-#    input  logic                                         Rst_RBI,
-#
-#    input  logic               [AXI_LITE_ADDR_WIDTH-1:0] s_axi_awaddr,
-#    input  logic                                         s_axi_awvalid,
-#    output logic                                         s_axi_awready,
-#
-#    input  logic               [AXI_LITE_DATA_WIDTH-1:0] s_axi_wdata,
-#    input  logic             [AXI_LITE_DATA_WIDTH/8-1:0] s_axi_wstrb,
-#    input  logic                                         s_axi_wvalid,
-#    output logic                                         s_axi_wready,
-#
-#    input  logic               [AXI_LITE_ADDR_WIDTH-1:0] s_axi_araddr,
-#    input  logic                                         s_axi_arvalid,
-#    output logic                                         s_axi_arready,
-#
-#    input  logic                                         s_axi_rready,
-#    output logic               [AXI_LITE_DATA_WIDTH-1:0] s_axi_rdata,
-#    output logic                                   [1:0] s_axi_rresp,
-#    output logic                                         s_axi_rvalid,
-#
-#    output logic                                   [1:0] s_axi_bresp,
-#    output logic                                         s_axi_bvalid,
-#    input  logic                                         s_axi_bready,
-#
-#    output logic [N_PORTS-1:0]                           int_miss,
-#    output logic [N_PORTS-1:0]                           int_prot,
-#    output logic [N_PORTS-1:0]                           int_multi,
-#    output logic [N_PORTS-1:0]                           int_prefetch,
-#    output logic                                         int_mhf_full,
-#
-#    output logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] int_axaddr_o,
-#    output logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] int_axid_o,
-#    output logic [N_PORTS-1:0]                     [7:0] int_axlen_o,
-#    output logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] int_axuser_o,
-#
-#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] port1_addr,
-#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] port1_id,
-#    input  logic [N_PORTS-1:0]                     [7:0] port1_len,
-#    input  logic [N_PORTS-1:0]                     [2:0] port1_size,
-#    input  logic [N_PORTS-1:0]                           port1_addr_valid,
-#    input  logic [N_PORTS-1:0]                           port1_type,
-#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] port1_user,
-#    input  logic [N_PORTS-1:0]                           port1_sent,
-#    output logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] port1_out_addr,
-#    output logic [N_PORTS-1:0]                           port1_cache_coherent,
-#    output logic [N_PORTS-1:0]                           port1_accept,
-#    output logic [N_PORTS-1:0]                           port1_drop,
-#    output logic [N_PORTS-1:0]                           port1_miss,
-#
-#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] port2_addr,
-#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] port2_id,
-#    input  logic [N_PORTS-1:0]                     [7:0] port2_len,
-#    input  logic [N_PORTS-1:0]                     [2:0] port2_size,
-#    input  logic [N_PORTS-1:0]                           port2_addr_valid,
-#    input  logic [N_PORTS-1:0]                           port2_type,
-#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] port2_user,
-#    input  logic [N_PORTS-1:0]                           port2_sent,
-#    output logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] port2_out_addr,
-#    output logic [N_PORTS-1:0]                           port2_cache_coherent,
-#    output logic [N_PORTS-1:0]                           port2_accept,
-#    output logic [N_PORTS-1:0]                           port2_drop,
-#    output logic [N_PORTS-1:0]                           port2_miss,
-#
-#    input  logic [N_PORTS-1:0]                           miss_l2_i,
-#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] miss_l2_addr_i,
-#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] miss_l2_id_i,
-#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] miss_l2_user_i,
-#
-#    output logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] wdata_l2_o,
-#    output logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] waddr_l2_o,
-#    output logic [N_PORTS-1:0]                           wren_l2_o
-#    );
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class rab_core(Elaboratable):
-
-    def __init__(self):
-        self.s_axi_awaddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
-        self.s_axi_awvalid = Signal()  # input
-        self.s_axi_awready = Signal()  # output
-        self.s_axi_wdata = Signal(AXI_LITE_DATA_WIDTH)  # input
-        self.s_axi_wstrb = Signal(FIXME)  # input
-        self.s_axi_wvalid = Signal()  # input
-        self.s_axi_wready = Signal()  # output
-        self.s_axi_araddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
-        self.s_axi_arvalid = Signal()  # input
-        self.s_axi_arready = Signal()  # output
-        self.s_axi_rready = Signal()  # input
-        self.s_axi_rdata = Signal(AXI_LITE_DATA_WIDTH)  # output
-        self.s_axi_rresp = Signal(2)  # output
-        self.s_axi_rvalid = Signal()  # output
-        self.s_axi_bresp = Signal(2)  # output
-        self.s_axi_bvalid = Signal()  # output
-        self.s_axi_bready = Signal()  # input
-        self.int_miss = Signal(N_PORTS)  # output
-        self.int_prot = Signal(N_PORTS)  # output
-        self.int_multi = Signal(N_PORTS)  # output
-        self.int_prefetch = Signal(N_PORTS)  # output
-        self.int_mhf_full = Signal()  # output
-        self.int_axaddr_o = Signal()  # output
-        self.int_axid_o = Signal()  # output
-        self.int_axlen_o = Signal()  # output
-        self.int_axuser_o = Signal()  # output
-        self.port1_addr = Signal()  # input
-        self.port1_id = Signal()  # input
-        self.port1_len = Signal()  # input
-        self.port1_size = Signal()  # input
-        self.port1_addr_valid = Signal(N_PORTS)  # input
-        self.port1_type = Signal(N_PORTS)  # input
-        self.port1_user = Signal()  # input
-        self.port1_sent = Signal(N_PORTS)  # input
-        self.port1_out_addr = Signal()  # output
-        self.port1_cache_coherent = Signal(N_PORTS)  # output
-        self.port1_accept = Signal(N_PORTS)  # output
-        self.port1_drop = Signal(N_PORTS)  # output
-        self.port1_miss = Signal(N_PORTS)  # output
-        self.port2_addr = Signal()  # input
-        self.port2_id = Signal()  # input
-        self.port2_len = Signal()  # input
-        self.port2_size = Signal()  # input
-        self.port2_addr_valid = Signal(N_PORTS)  # input
-        self.port2_type = Signal(N_PORTS)  # input
-        self.port2_user = Signal()  # input
-        self.port2_sent = Signal(N_PORTS)  # input
-        self.port2_out_addr = Signal()  # output
-        self.port2_cache_coherent = Signal(N_PORTS)  # output
-        self.port2_accept = Signal(N_PORTS)  # output
-        self.port2_drop = Signal(N_PORTS)  # output
-        self.port2_miss = Signal(N_PORTS)  # output
-        self.miss_l2_i = Signal(N_PORTS)  # input
-        self.miss_l2_addr_i = Signal()  # input
-        self.miss_l2_id_i = Signal()  # input
-        self.miss_l2_user_i = Signal()  # input
-        self.wdata_l2_o = Signal()  # output
-        self.waddr_l2_o = Signal()  # output
-        self.wren_l2_o = Signal(N_PORTS)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        return m
-
-
-""" 
-
-
-    // ███████╗██╗ ██████╗ ███╗   ██╗ █████╗ ██╗     ███████╗
-    // ██╔════╝██║██╔════╝ ████╗  ██║██╔══██╗██║     ██╔════╝
-    // ███████╗██║██║  ███╗██╔██╗ ██║███████║██║     ███████╗
-    // ╚════██║██║██║   ██║██║╚██╗██║██╔══██║██║     ╚════██║
-    // ███████║██║╚██████╔╝██║ ╚████║██║  ██║███████╗███████║
-    // ╚══════╝╚═╝ ╚═════╝ ╚═╝  ╚═══╝╚═╝  ╚═╝╚══════╝╚══════╝
-    // signals
-
-  localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
-
-  localparam integer N_SLICES[N_PORTS-1:0]     = `N_SLICES_ARRAY;
-  localparam         N_SLICES_TOT              = `MY_ARRAY_SUM(N_SLICES,N_PORTS);
-  localparam         N_SLICES_MAX              = `N_SLICES_MAX;
-
-  localparam N_REGS                            = 4*N_SLICES_TOT + 4;
-  localparam AXI_SIZE_WIDTH                    = log2(AXI_DATA_WIDTH/8);
-
-  localparam PORT_ID_WIDTH                     = (N_PORTS < 2) ? 1 : log2(N_PORTS);
-  localparam MISS_META_WIDTH                   = PORT_ID_WIDTH + AXI_USER_WIDTH + AXI_ID_WIDTH;
-
-  logic [N_PORTS-1:0]                      [15:0] p1_burst_size;
-  logic [N_PORTS-1:0]                      [15:0] p2_burst_size;
-
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p1_align_addr;
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p2_align_addr;
-
-  logic [N_PORTS-1:0]        [AXI_SIZE_WIDTH-1:0] p1_mask;
-  logic [N_PORTS-1:0]        [AXI_SIZE_WIDTH-1:0] p2_mask;
-
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p1_max_addr;
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p2_max_addr;
-
-  logic [N_PORTS-1:0]                             p1_prefetch;
-  logic [N_PORTS-1:0]                             p2_prefetch;
-
-  logic [N_PORTS-1:0]                             int_rw;
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] int_addr_min;
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] int_addr_max;
-  logic [N_PORTS-1:0]          [AXI_ID_WIDTH-1:0] int_id;
-  logic [N_PORTS-1:0]                       [7:0] int_len;
-  logic [N_PORTS-1:0]        [AXI_USER_WIDTH-1:0] int_user;
-
-  logic [N_PORTS-1:0]                             hit;
-  logic [N_PORTS-1:0]                             prot;
-  logic [N_PORTS-1:0]                             prefetch;
-
-  logic [N_PORTS-1:0]                             no_hit;
-  logic [N_PORTS-1:0]                             no_prot;
-
-  logic [N_PORTS-1:0]          [N_SLICES_MAX-1:0] hit_slices;
-  logic [N_PORTS-1:0]          [N_SLICES_MAX-1:0] prot_slices;
-
-  logic [N_PORTS-1:0]      [AXI_M_ADDR_WIDTH-1:0] out_addr;
-  logic [N_PORTS-1:0]      [AXI_M_ADDR_WIDTH-1:0] out_addr_reg;
-
-  logic [N_PORTS-1:0]                             cache_coherent;
-  logic [N_PORTS-1:0]                             cache_coherent_reg;
-
-  logic [N_PORTS-1:0]                             select;
-  reg   [N_PORTS-1:0]                             curr_priority;
-
-  reg   [N_PORTS-1:0]                             multi_hit;
-
-  logic [N_PORTS-1:0]                             miss_valid_mhf;
-  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] miss_addr_mhf;
-  logic [N_PORTS-1:0]       [MISS_META_WIDTH-1:0] miss_meta_mhf;
-
-  logic [N_REGS-1:0]                       [63:0] int_cfg_regs;
-  logic [N_PORTS-1:0] [4*N_SLICES_MAX-1:0] [63:0] int_cfg_regs_slices;
-
-  logic                                           L1AllowMultiHit_S;
-
-  genvar z;
-
-  //  █████╗ ███████╗███████╗██╗ ██████╗ ███╗   ██╗███╗   ███╗███████╗███╗   ██╗████████╗███████╗
-  // ██╔══██╗██╔════╝██╔════╝██║██╔════╝ ████╗  ██║████╗ ████║██╔════╝████╗  ██║╚══██╔══╝██╔════╝
-  // ███████║███████╗███████╗██║██║  ███╗██╔██╗ ██║██╔████╔██║█████╗  ██╔██╗ ██║   ██║   ███████╗
-  // ██╔══██║╚════██║╚════██║██║██║   ██║██║╚██╗██║██║╚██╔╝██║██╔══╝  ██║╚██╗██║   ██║   ╚════██║
-  // ██║  ██║███████║███████║██║╚██████╔╝██║ ╚████║██║ ╚═╝ ██║███████╗██║ ╚████║   ██║   ███████║
-  // ╚═╝  ╚═╝╚══════╝╚══════╝╚═╝ ╚═════╝ ╚═╝  ╚═══╝╚═╝     ╚═╝╚══════╝╚═╝  ╚═══╝   ╚═╝   ╚══════╝
-  // assignments
-
-  always_comb
-    begin : PORT_SELECT
-      var integer idx;
-
-      for (idx=0; idx<N_PORTS; idx++) begin
-
-        // select = 1 -> port1 active
-        // select = 0 -> port2 active
-        select[idx] = (curr_priority[idx] & port1_addr_valid[idx]) | ~port2_addr_valid[idx];
-
-        p1_burst_size[idx] = (port1_len[idx] + 1) << port1_size[idx];
-        p2_burst_size[idx] = (port2_len[idx] + 1) << port2_size[idx];
-
-        // align min addr for max addr computation to allow for smart AXI bursts around the 4k boundary
-        if      (port1_size[idx] == 3'b001)
-          p1_mask[idx] = 3'b110;
-        else if (port1_size[idx] == 3'b010)
-          p1_mask[idx] = 3'b100;
-        else if (port1_size[idx] == 3'b011)
-          p1_mask[idx] = 3'b000;
-        else
-          p1_mask[idx] = 3'b111;
-
-        p1_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port1_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
-        p1_align_addr[idx][AXI_SIZE_WIDTH-1:0]                = port1_addr[idx][AXI_SIZE_WIDTH-1:0] & p1_mask[idx];
-
-        if      (port2_size[idx] == 3'b001)
-          p2_mask[idx] = 3'b110;
-        else if (port2_size[idx] == 3'b010)
-          p2_mask[idx] = 3'b100;
-        else if (port2_size[idx] == 3'b011)
-          p2_mask[idx] = 3'b000;
-        else
-          p2_mask[idx] = 3'b111;
-
-        if (port1_user[idx] == {AXI_USER_WIDTH{1'b1}})
-          p1_prefetch[idx] = 1'b1;
-        else
-          p1_prefetch[idx] = 1'b0;
-
-        if (port2_user[idx] == {AXI_USER_WIDTH{1'b1}})
-          p2_prefetch[idx] = 1'b1;
-        else
-          p2_prefetch[idx] = 1'b0;
-
-        p2_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port2_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
-        p2_align_addr[idx][AXI_SIZE_WIDTH-1:0]                = port2_addr[idx][AXI_SIZE_WIDTH-1:0] & p2_mask[idx];
-
-        p1_max_addr[idx]  = p1_align_addr[idx] + p1_burst_size[idx] - 1;
-        p2_max_addr[idx]  = p2_align_addr[idx] + p2_burst_size[idx] - 1;
-
-        int_addr_min[idx] = select[idx] ? port1_addr[idx]  : port2_addr[idx];
-        int_addr_max[idx] = select[idx] ? p1_max_addr[idx] : p2_max_addr[idx];
-        int_rw[idx]       = select[idx] ? port1_type[idx]  : port2_type[idx];
-        int_id[idx]       = select[idx] ? port1_id[idx]    : port2_id[idx];
-        int_len[idx]      = select[idx] ? port1_len[idx]   : port2_len[idx];
-        int_user[idx]     = select[idx] ? port1_user[idx]  : port2_user[idx];
-        prefetch[idx]     = select[idx] ? p1_prefetch[idx] : p2_prefetch[idx];
-
-        hit [idx]    = | hit_slices [idx];
-        prot[idx]    = | prot_slices[idx];
-
-        no_hit [idx] = ~hit [idx];
-        no_prot[idx] = ~prot[idx];
-
-        port1_out_addr[idx] = out_addr_reg[idx];
-        port2_out_addr[idx] = out_addr_reg[idx];
-
-        port1_cache_coherent[idx] = cache_coherent_reg[idx];
-        port2_cache_coherent[idx] = cache_coherent_reg[idx];
-      end
-    end
-
-  always_comb
-    begin
-      var integer idx_port, idx_slice;
-      var integer reg_num;
-      reg_num=0;
-      for ( idx_port = 0; idx_port < N_PORTS; idx_port++ ) begin
-        for ( idx_slice = 0; idx_slice < 4*N_SLICES[idx_port]; idx_slice++ ) begin
-          int_cfg_regs_slices[idx_port][idx_slice] = int_cfg_regs[4+reg_num];
-          reg_num++;
-        end
-        // int_cfg_regs_slices[idx_port][N_SLICES_MAX:N_SLICES[idx_port]] will be dangling
-        // Fix to zero. Synthesis will remove these signals.
-        // int_cfg_regs_slices[idx_port][4*N_SLICES_MAX-1:4*N_SLICES[idx_port]] = 0;
-      end
-  end
-
-  always @(posedge Clk_CI or negedge Rst_RBI)
-    begin : PORT_PRIORITY
-      var integer idx;
-      if (Rst_RBI == 1'b0)
-        curr_priority = 'h0;
-      else begin
-        for (idx=0; idx<N_PORTS; idx++) begin
-          if (port1_accept[idx] || port1_drop[idx])
-            curr_priority[idx] = 1'b1;
-          else if (port2_accept[idx] || port2_drop[idx])
-            curr_priority[idx] = 1'b0;
-        end
-      end
-    end
-
-  // find port that misses
-  logic [PORT_ID_WIDTH-1:0] PortIdx_D; // index of the first missing port
-  var integer               idx_miss;
-  always_comb begin : MHF_PORT_SELECT
-    PortIdx_D = 'b0;
-    for (idx_miss = 0; idx_miss < N_PORTS; idx_miss++) begin
-      if (miss_valid_mhf[idx_miss] == 1'b1) begin
-        PortIdx_D = idx_miss;
-        break;
-      end
-    end
-  end // always_comb begin
-
-  //  █████╗ ██╗  ██╗██╗    ██████╗  █████╗ ██████╗      ██████╗███████╗ ██████╗
-  // ██╔══██╗╚██╗██╔╝██║    ██╔══██╗██╔══██╗██╔══██╗    ██╔════╝██╔════╝██╔════╝
-  // ███████║ ╚███╔╝ ██║    ██████╔╝███████║██████╔╝    ██║     █████╗  ██║  ███╗
-  // ██╔══██║ ██╔██╗ ██║    ██╔══██╗██╔══██║██╔══██╗    ██║     ██╔══╝  ██║   ██║
-  // ██║  ██║██╔╝ ██╗██║    ██║  ██║██║  ██║██████╔╝    ╚██████╗██║     ╚██████╔╝
-  // ╚═╝  ╚═╝╚═╝  ╚═╝╚═╝    ╚═╝  ╚═╝╚═╝  ╚═╝╚═════╝      ╚═════╝╚═╝      ╚═════╝
-  axi_rab_cfg
-    #(
-      .N_PORTS         ( N_PORTS             ),
-      .N_REGS          ( N_REGS              ),
-      .N_L2_SETS       ( N_L2_SETS           ),
-      .N_L2_SET_ENTRIES( N_L2_SET_ENTRIES    ),
-      .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH    ),
-      .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH    ),
-      .N_FLAGS         ( 4                   ),
-      .AXI_DATA_WIDTH  ( AXI_LITE_DATA_WIDTH ),
-      .AXI_ADDR_WIDTH  ( AXI_LITE_ADDR_WIDTH ),
-      .MISS_META_WIDTH ( MISS_META_WIDTH     ),
-      .MH_FIFO_DEPTH   ( MH_FIFO_DEPTH       )
-    )
-    u_axi_rab_cfg
-    (
-      .Clk_CI             ( Clk_CI                    ),
-      .Rst_RBI            ( Rst_RBI                   ),
-      .s_axi_awaddr       ( s_axi_awaddr              ),
-      .s_axi_awvalid      ( s_axi_awvalid             ),
-      .s_axi_wdata        ( s_axi_wdata               ),
-      .s_axi_wstrb        ( s_axi_wstrb               ),
-      .s_axi_wvalid       ( s_axi_wvalid              ),
-      .s_axi_bready       ( s_axi_bready              ),
-      .s_axi_araddr       ( s_axi_araddr              ),
-      .s_axi_arvalid      ( s_axi_arvalid             ),
-      .s_axi_rready       ( s_axi_rready              ),
-      .s_axi_arready      ( s_axi_arready             ),
-      .s_axi_rdata        ( s_axi_rdata               ),
-      .s_axi_rresp        ( s_axi_rresp               ),
-      .s_axi_rvalid       ( s_axi_rvalid              ),
-      .s_axi_wready       ( s_axi_wready              ),
-      .s_axi_bresp        ( s_axi_bresp               ),
-      .s_axi_bvalid       ( s_axi_bvalid              ),
-      .s_axi_awready      ( s_axi_awready             ),
-      .L1Cfg_DO           ( int_cfg_regs              ),
-      .L1AllowMultiHit_SO ( L1AllowMultiHit_S         ),
-      .MissAddr_DI        ( miss_addr_mhf[PortIdx_D]  ),
-      .MissMeta_DI        ( miss_meta_mhf[PortIdx_D]  ),
-      .Miss_SI            ( miss_valid_mhf[PortIdx_D] ),
-      .MhFifoFull_SO      ( int_mhf_full              ),
-      .wdata_l2           ( wdata_l2_o                ),
-      .waddr_l2           ( waddr_l2_o                ),
-      .wren_l2            ( wren_l2_o                 )
-    );
-
-  generate for (z = 0; z < N_PORTS; z++) begin : MHF_TLB_SELECT
-    if (ENABLE_L2TLB[z] == 1) begin // L2 TLB is enabled
-      assign miss_valid_mhf[z] = miss_l2_i[z];
-      assign miss_addr_mhf[z]  = miss_l2_addr_i[z];
-      assign miss_meta_mhf[z]  = {miss_l2_user_i[z], PortIdx_D, miss_l2_id_i[z]};
-    end else begin// L2 TLB is disabled
-      assign miss_valid_mhf[z] = int_miss[z];
-      assign miss_addr_mhf[z]  = int_addr_min[z];
-      assign miss_meta_mhf[z]  = {int_user[z], PortIdx_D, int_id[z]};
-    end
-  end
-  endgenerate
-
-  // ███████╗██╗     ██╗ ██████╗███████╗    ████████╗ ██████╗ ██████╗
-  // ██╔════╝██║     ██║██╔════╝██╔════╝    ╚══██╔══╝██╔═══██╗██╔══██╗
-  // ███████╗██║     ██║██║     █████╗         ██║   ██║   ██║██████╔╝
-  // ╚════██║██║     ██║██║     ██╔══╝         ██║   ██║   ██║██╔═══╝
-  // ███████║███████╗██║╚██████╗███████╗       ██║   ╚██████╔╝██║
-  // ╚══════╝╚══════╝╚═╝ ╚═════╝╚══════╝       ╚═╝    ╚═════╝ ╚═╝
-  generate for (z = 0; z < N_PORTS; z++) begin : SLICE_TOP_GEN
-    slice_top
-      #(
-        .N_SLICES        ( N_SLICES[z]      ),
-        .N_REGS          ( 4*N_SLICES[z]    ),
-        .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH ),
-        .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH )
-      )
-      u_slice_top
-      (
-        .int_cfg_regs    ( int_cfg_regs_slices[z][4*N_SLICES[z]-1:0] ),
-        .int_rw          ( int_rw[z]                                 ),
-        .int_addr_min    ( int_addr_min[z]                           ),
-        .int_addr_max    ( int_addr_max[z]                           ),
-        .multi_hit_allow ( L1AllowMultiHit_S                         ),
-        .multi_hit       ( multi_hit[z]                              ),
-        .prot            ( prot_slices[z][N_SLICES[z]-1:0]           ),
-        .hit             ( hit_slices [z][N_SLICES[z]-1:0]           ),
-        .cache_coherent  ( cache_coherent[z]                         ),
-        .out_addr        ( out_addr[z]                               )
-      );
-    // hit_slices [N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
-    // prot_slices[N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
-    // Fix to zero. Synthesis will remove these signals.
-    if ( N_SLICES[z] < N_SLICES_MAX ) begin
-      assign hit_slices [z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
-      assign prot_slices[z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
-    end
-  end // for (z = 0; z < N_PORTS; z++)
-  endgenerate
-
-  // ███████╗███████╗███╗   ███╗
-  // ██╔════╝██╔════╝████╗ ████║
-  // █████╗  ███████╗██╔████╔██║
-  // ██╔══╝  ╚════██║██║╚██╔╝██║
-  // ██║     ███████║██║ ╚═╝ ██║
-  // ╚═╝     ╚══════╝╚═╝     ╚═╝
-  //
-  generate for (z = 0; z < N_PORTS; z++) begin : FSM_GEN
-    fsm
-      #(
-        .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
-        .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
-        .AXI_ID_WIDTH     ( AXI_ID_WIDTH     ),
-        .AXI_USER_WIDTH   ( AXI_USER_WIDTH   )
-      )
-      u_fsm
-      (
-        .Clk_CI             ( Clk_CI                ),
-        .Rst_RBI            ( Rst_RBI               ),
-        .port1_addr_valid_i ( port1_addr_valid[z]   ),
-        .port2_addr_valid_i ( port2_addr_valid[z]   ),
-        .port1_sent_i       ( port1_sent[z]         ),
-        .port2_sent_i       ( port2_sent[z]         ),
-        .select_i           ( select[z]             ),
-        .no_hit_i           ( no_hit[z]             ),
-        .multi_hit_i        ( multi_hit[z]          ),
-        .no_prot_i          ( no_prot[z]            ),
-        .prefetch_i         ( prefetch[z]           ),
-        .out_addr_i         ( out_addr[z]           ),
-        .cache_coherent_i   ( cache_coherent[z]     ),
-        .port1_accept_o     ( port1_accept[z]       ),
-        .port1_drop_o       ( port1_drop[z]         ),
-        .port1_miss_o       ( port1_miss[z]         ),
-        .port2_accept_o     ( port2_accept[z]       ),
-        .port2_drop_o       ( port2_drop[z]         ),
-        .port2_miss_o       ( port2_miss[z]         ),
-        .out_addr_o         ( out_addr_reg[z]       ),
-        .cache_coherent_o   ( cache_coherent_reg[z] ),
-        .miss_o             ( int_miss[z]           ),
-        .multi_o            ( int_multi[z]          ),
-        .prot_o             ( int_prot[z]           ),
-        .prefetch_o         ( int_prefetch[z]       ),
-        .in_addr_i          ( int_addr_min[z]       ),
-        .in_id_i            ( int_id[z]             ),
-        .in_len_i           ( int_len[z]            ),
-        .in_user_i          ( int_user[z]           ),
-        .in_addr_o          ( int_axaddr_o[z]       ),
-        .in_id_o            ( int_axid_o[z]         ),
-        .in_len_o           ( int_axlen_o[z]        ),
-        .in_user_o          ( int_axuser_o[z]       )
-      );
-  end
-  endgenerate
-  
-"""
diff --git a/src/unused/iommu/axi_rab/rab_slice.py b/src/unused/iommu/axi_rab/rab_slice.py
deleted file mode 100644 (file)
index 59f84e3..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# module rab_slice
-# #(
-#    parameter ADDR_WIDTH_PHYS = 40,
-#    parameter ADDR_WIDTH_VIRT = 32
-#    )
-#   (
-#    input  logic [ADDR_WIDTH_VIRT-1:0] cfg_min,
-#    input  logic [ADDR_WIDTH_VIRT-1:0] cfg_max,
-#    input  logic [ADDR_WIDTH_PHYS-1:0] cfg_offset,
-#    input  logic                       cfg_wen,
-#    input  logic                       cfg_ren,
-#    input  logic                       cfg_en,
-#    input  logic                       in_trans_type,
-#    input  logic [ADDR_WIDTH_VIRT-1:0] in_addr_min,
-#    input  logic [ADDR_WIDTH_VIRT-1:0] in_addr_max,
-#    output logic                       out_hit,
-#    output logic                       out_prot,
-#    output logic [ADDR_WIDTH_PHYS-1:0] out_addr
-#  );
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-
-
-class rab_slice(Elaboratable):
-
-    def __init__(self, params):  # pass config object
-        # TODO parameters
-        self.params = params
-        self.cfg_min = Signal(params.ADDR_WIDTH_VIRT)  # input
-        self.cfg_max = Signal(params.ADDR_WIDTH_VIRT)  # input
-        self.cfg_offset = Signal(params.ADDR_WIDTH_PHYS)  # input
-        self.cfg_wen = Signal()  # input
-        self.cfg_ren = Signal()  # input
-        self.cfg_en = Signal()  # input
-        self.in_trans_type = Signal()  # input
-        self.in_addr_min = Signal(params.ADDR_WIDTH_VIRT)  # input
-        self.in_addr_max = Signal(params.ADDR_WIDTH_VIRT)  # input
-        self.out_hit = Signal()  # output
-        self.out_prot = Signal()  # output
-        self.out_addr = Signal(params.ADDR_WIDTH_PHYS)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-        min_above_min = Signal()
-        min_below_max = Signal()
-        max_below_max = Signal()
-
-        #  assign min_above_min = (in_addr_min >= cfg_min) ? 1'b1 : 1'b0;
-        #  assign min_below_max = (in_addr_min <= cfg_max) ? 1'b1 : 1'b0;
-        #  assign max_below_max = (in_addr_max <= cfg_max) ? 1'b1 : 1'b0;
-        #  assign out_hit  = cfg_en & min_above_min & min_below_max & max_below_max;
-        #  assign out_prot = out_hit & ((in_trans_type & ~cfg_wen) | (~in_trans_type & ~cfg_ren));
-        #  assign out_addr = in_addr_min - cfg_min + cfg_offset;
-        m.d.comb += [
-            min_above_min.eq(self.in_addr_min >= self.cfg_min),
-            min_below_max.eq(self.in_addr_min <= self.cfg_max),
-            max_below_max.eq(self.in_addr_max <= self.cfg_max),
-            self.out_hit.eq(self.cfg_en & min_above_min &
-                            min_below_max & max_below_max),
-            self.out_prot.eq(self.out_hit & (
-                (self.in_trans_type & ~self.cfg_wen) | (~self.in_trans_type & ~self.cfg_ren))),
-            self.out_addr.eq(self.in_addr_min - self.cfg_min + self.cfg_offset)
-        ]
-
-        return m
diff --git a/src/unused/iommu/axi_rab/ram_tp_no_change.py b/src/unused/iommu/axi_rab/ram_tp_no_change.py
deleted file mode 100644 (file)
index d010473..0000000
+++ /dev/null
@@ -1,97 +0,0 @@
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# /*
-# * ram_tp_no_change
-# *
-# * This code implements a parameterizable two-port memory. Port 0 can read and
-# * write while Port 1 can read only. The Xilinx tools will infer a BRAM with
-# * Port 0 in "no change" mode, i.e., during a write, it retains the last read
-# * value on the output. Port 1 (read-only) is in "write first" mode. Still, it
-# * outputs the old data during the write cycle. Note: Port 1 outputs invalid
-# * data in the cycle after the write when reading the same address.
-# *
-# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
-# */
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-from nmigen import Memory
-
-import math
-
-#
-# module ram_tp_no_change
-#  #(
-ADDR_WIDTH = 10
-DATA_WIDTH = 36
-#  )
-#  (
-#    input                   clk,
-#    input                   we,
-#    input  [ADDR_WIDTH-1:0] addr0,
-#    input  [ADDR_WIDTH-1:0] addr1,
-#    input  [DATA_WIDTH-1:0] d_i,
-#    output [DATA_WIDTH-1:0] d0_o,
-#    output [DATA_WIDTH-1:0] d1_o
-#  );
-
-
-class ram_tp_no_change(Elaboratable):
-
-    def __init__(self):
-        self.we = Signal()               # input
-        self.addr0 = Signal(ADDR_WIDTH)  # input
-        self.addr1 = Signal(ADDR_WIDTH)  # input
-        self.d_i = Signal(DATA_WIDTH)    # input
-        self.d0_o = Signal(DATA_WIDTH)   # output
-        self.d1_o = Signal(DATA_WIDTH)   # output
-
-        DEPTH = int(math.pow(2, ADDR_WIDTH))
-        self.ram = Memory(width=DATA_WIDTH, depth=DEPTH)
-    #
-    #  localparam DEPTH = 2**ADDR_WIDTH;
-    #
-    #  (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
-    #                            reg [DATA_WIDTH-1:0] d0;
-    #                            reg [DATA_WIDTH-1:0] d1;
-    #
-    #  always_ff @(posedge clk) begin
-    #    if(we == 1'b1) begin
-    #      ram[addr0] <= d_i;
-    #    end else begin
-    # only change data if we==false
-    #      d0 <= ram[addr0];
-    #    end
-    #    d1   <= ram[addr1];
-    #  end
-    #
-    #  assign d0_o = d0;
-    #  assign d1_o = d1;
-    #
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.submodules.read_ram0 = read_ram0 = self.ram.read_port()
-        m.submodules.read_ram1 = read_ram1 = self.ram.read_port()
-        m.submodules.write_ram = write_ram = self.ram.write_port()
-
-        # write port
-        m.d.comb += write_ram.en.eq(self.we)
-        m.d.comb += write_ram.addr.eq(self.addr0)
-        m.d.comb += write_ram.data.eq(self.d_i)
-
-        # read ports
-        m.d.comb += read_ram0.addr.eq(self.addr0)
-        m.d.comb += read_ram1.addr.eq(self.addr1)
-        with m.If(self.we == 0):
-            m.d.sync += self.d0_o.eq(read_ram0.data)
-        m.d.sync += self.d1_o.eq(read_ram1.data)
-
-        return m
diff --git a/src/unused/iommu/axi_rab/ram_tp_write_first.py b/src/unused/iommu/axi_rab/ram_tp_write_first.py
deleted file mode 100644 (file)
index 8fd2abb..0000000
+++ /dev/null
@@ -1,93 +0,0 @@
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-#
-# /*
-# * ram_tp_write_first
-# *
-# * This code implements a parameterizable two-port memory. Port 0 can read and
-# * write while Port 1 can read only. Xilinx Vivado will infer a BRAM in
-# * "write first" mode, i.e., upon a read and write to the same address, the
-# * new value is read. Note: Port 1 outputs invalid data in the cycle after
-# * the write when reading the same address.
-# *
-# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
-# */
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-from nmigen import Memory
-
-import math
-#
-# module ram_tp_write_first
-#  #(
-ADDR_WIDTH = 10
-DATA_WIDTH = 36
-#  )
-#  (
-#    input                   clk,
-#    input                   we,
-#    input  [ADDR_WIDTH-1:0] addr0,
-#    input  [ADDR_WIDTH-1:0] addr1,
-#    input  [DATA_WIDTH-1:0] d_i,
-#    output [DATA_WIDTH-1:0] d0_o,
-#    output [DATA_WIDTH-1:0] d1_o
-#  );
-
-
-class ram_tp_write_first(Elaboratable):
-
-    def __init__(self):
-        self.we = Signal()               # input
-        self.addr0 = Signal(ADDR_WIDTH)  # input
-        self.addr1 = Signal(ADDR_WIDTH)  # input
-        self.d_i = Signal(DATA_WIDTH)    # input
-        self.d0_o = Signal(DATA_WIDTH)   # output
-        self.d1_o = Signal(DATA_WIDTH)   # output
-
-        DEPTH = int(math.pow(2, ADDR_WIDTH))
-        self.ram = Memory(width=DATA_WIDTH, depth=DEPTH)
-
-    #
-    #  localparam DEPTH = 2**ADDR_WIDTH;
-    #
-    #  (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
-    #                            reg [ADDR_WIDTH-1:0] raddr0;
-    #                            reg [ADDR_WIDTH-1:0] raddr1;
-    #
-    #  always_ff @(posedge clk) begin
-    #    if(we == 1'b1) begin
-    #      ram[addr0] <= d_i;
-    #    end
-    #    raddr0 <= addr0;
-    #    raddr1 <= addr1;
-    #  end
-    #
-    #  assign d0_o = ram[raddr0];
-    #  assign d1_o = ram[raddr1];
-    #
-
-    def elaborate(self, platform=None):
-        m = Module()
-        m.submodules.read_ram0 = read_ram0 = self.ram.read_port()
-        m.submodules.read_ram1 = read_ram1 = self.ram.read_port()
-        m.submodules.write_ram = write_ram = self.ram.write_port()
-
-        # write port
-        m.d.comb += write_ram.en.eq(self.we)
-        m.d.comb += write_ram.addr.eq(self.addr0)
-        m.d.comb += write_ram.data.eq(self.d_i)
-
-        # read ports
-        m.d.comb += read_ram0.addr.eq(self.addr0)
-        m.d.comb += read_ram1.addr.eq(self.addr1)
-        m.d.sync += self.d0_o.eq(read_ram0.data)
-        m.d.sync += self.d1_o.eq(read_ram1.data)
-
-        return m
diff --git a/src/unused/iommu/axi_rab/slice_top.py b/src/unused/iommu/axi_rab/slice_top.py
deleted file mode 100644 (file)
index 6eedb1c..0000000
+++ /dev/null
@@ -1,141 +0,0 @@
-# // Copyright 2018 ETH Zurich and University of Bologna.
-# // Copyright and related rights are licensed under the Solderpad Hardware
-# // License, Version 0.51 (the "License"); you may not use this file except in
-# // compliance with the License.  You may obtain a copy of the License at
-# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# // or agreed to in writing, software, hardware and materials distributed under
-# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# // specific language governing permissions and limitations under the License.
-
-# this file has been generated by sv2nmigen
-
-from nmigen import Signal, Module, Const, Cat, Elaboratable
-import rab_slice
-import coreconfig
-
-#
-# module slice_top
-# //#(
-#  //  parameter N_SLICES        = 16,
-#  //  parameter N_REGS          = 4*N_SLICES,
-#   // parameter ADDR_WIDTH_PHYS = 40,
-#   // parameter ADDR_WIDTH_VIRT = 32
-#  //  )
-#   (
-#    input   logic   [N_REGS-1:0] [63:0] int_cfg_regs,
-#    input   logic                       int_rw,
-#    input   logic [ADDR_WIDTH_VIRT-1:0] int_addr_min,
-#    input   logic [ADDR_WIDTH_VIRT-1:0] int_addr_max,
-#    input   logic                       multi_hit_allow,
-#    output  logic                       multi_hit,
-#    output  logic        [N_SLICES-1:0] prot,
-#    output  logic        [N_SLICES-1:0] hit,
-#    output  logic                       cache_coherent,
-#    output  logic [ADDR_WIDTH_PHYS-1:0] out_addr
-#  );
-#
-
-
-class slice_top(Elaboratable):
-
-    def __init__(self):
-        # FIXME self.int_cfg_regs = Signal()  # input
-        self.params = coreconfig.CoreConfig() # rename ?
-        self.int_rw = Signal()  # input
-        self.int_addr_min = Signal(self.params.ADDR_WIDTH_VIRT)  # input
-        self.int_addr_max = Signal(self.params.ADDR_WIDTH_VIRT)  # input
-        self.multi_hit_allow = Signal()  # input
-        self.multi_hit = Signal()  # output
-        self.prot = Signal(self.params.N_SLICES)  # output
-        self.hit = Signal(self.params.N_SLICES)  # output
-        self.cache_coherent = Signal()  # output
-        self.out_addr = Signal(self.params.ADDR_WIDTH_PHYS)  # output
-
-    def elaborate(self, platform=None):
-        m = Module()
-
-        first_hit = Signal()
-
-        for i in range(self.params.N_SLICES):
-            # TODO pass params / core config here
-            u_slice = rab_slice.rab_slice(self.params)
-            setattr(m.submodules, "u_slice%d" % i, u_slice)
-            # TODO set param and connect ports
-
-        # In case of a multi hit, the lowest slice with a hit is selected.
-        # TODO always_comb begin : HIT_CHECK
-        m.d.comb += [
-            first_hit.eq(0),
-            self.multi_hit.eq(0),
-            self.out_addr.eq(0),
-            self.cache_coherent.eq(0)]
-
-        for j in range(self.params.N_SLICES):
-            with m.If(self.hit[j] == 1):
-                with m.If(first_hit == 1):
-                    with m.If(self.multi_hit_allow == 0):
-                        m.d.comb += [self.multi_hit.eq(1)]
-                with m.Elif(first_hit == 1):
-                    m.d.comb += [first_hit.eq(1)
-                                 # only output first slice that was hit
-                                 # SV self.out_addr.eq(slice_out_addr[ADDR_WIDTH_PHYS*j + : ADDR_WIDTH_PHYS]),
-                                 # SV self.cache_coherent.eq(int_cfg_regs[4*j+3][3]),
-                                 ]
-        return m
-
-  # TODO translate generate statement
-
-
-"""
-  logic [ADDR_WIDTH_PHYS*N_SLICES-1:0]  slice_out_addr;
-
-  generate
-    for ( i=0; i<N_SLICES; i++ )
-      begin
-        rab_slice
-          #(
-            .ADDR_WIDTH_PHYS ( ADDR_WIDTH_PHYS ),
-            .ADDR_WIDTH_VIRT ( ADDR_WIDTH_VIRT )
-            )
-          u_slice
-          (
-            .cfg_min       ( int_cfg_regs[4*i]  [ADDR_WIDTH_VIRT-1:0]                              ),
-            .cfg_max       ( int_cfg_regs[4*i+1][ADDR_WIDTH_VIRT-1:0]                              ),
-            .cfg_offset    ( int_cfg_regs[4*i+2][ADDR_WIDTH_PHYS-1:0]                              ),
-            .cfg_wen       ( int_cfg_regs[4*i+3][2]                                                ),
-            .cfg_ren       ( int_cfg_regs[4*i+3][1]                                                ),
-            .cfg_en        ( int_cfg_regs[4*i+3][0]                                                ),
-            .in_trans_type ( int_rw                                                                ),
-            .in_addr_min   ( int_addr_min                                                          ),
-            .in_addr_max   ( int_addr_max                                                          ),
-            .out_addr      ( slice_out_addr[ADDR_WIDTH_PHYS*i+ADDR_WIDTH_PHYS-1:ADDR_WIDTH_PHYS*i] ),
-            .out_prot      ( prot[i]                                                               ),
-            .out_hit       ( hit[i]                                                                )
-          );
-     end
-  endgenerate
-
-  // In case of a multi hit, the lowest slice with a hit is selected.
-  always_comb begin : HIT_CHECK
-    first_hit      =  0;
-    multi_hit      =  0;
-    out_addr       = '0;
-    cache_coherent =  0;
-    for (j = 0; j < N_SLICES; j++) begin
-      if (hit[j] == 1'b1) begin
-        if (first_hit == 1'b1) begin
-          if (multi_hit_allow == 1'b0) begin
-            multi_hit = 1'b1;
-          end
-        end else begin
-          first_hit       = 1'b1;
-          out_addr        = slice_out_addr[ADDR_WIDTH_PHYS*j +: ADDR_WIDTH_PHYS];
-          cache_coherent  = int_cfg_regs[4*j+3][3];
-        end
-      end
-    end
-  end
-"""
-
-# sv 2 migen: TODO add translate code for generate statements and for loops inside always_comb
diff --git a/src/unused/iommu/axi_rab/test/__init__.py b/src/unused/iommu/axi_rab/test/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/src/unused/iommu/axi_rab/test/test_ram_tp_no_change.py b/src/unused/iommu/axi_rab/test/test_ram_tp_no_change.py
deleted file mode 100644 (file)
index 8d23ef0..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-from ram_tp_write_first import ram_tp_write_first
-from nmigen.compat.sim import run_simulation
-import sys
-sys.path.append("../")
-
-
-def tbench(dut):
-    yield dut.we.eq(1)
-    for i in range(0, 255):
-        yield dut.addr0.eq(i)
-        yield dut.d_i.eq(i)
-        yield
-
-
-if __name__ == "__main__":
-    dut = ram_tp_write_first()
-    run_simulation(dut, tbench(dut), vcd_name="ram_tp_write_first.vcd")
-    print("ram_tp_write_first Unit Test Success")
diff --git a/src/unused/iommu/axi_rab/test/test_slice_top.py b/src/unused/iommu/axi_rab/test/test_slice_top.py
deleted file mode 100644 (file)
index c234b90..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-from nmigen.compat.sim import run_simulation
-import sys
-sys.path.append("../")
-# sys.path.append("../../../TestUtil")
-from slice_top import slice_top
-
-def tbench(dut):
-    yield
-
-
-if __name__ == "__main__":
-    dut = slice_top()
-    run_simulation(dut, tbench(dut), vcd_name="test_slice_top.vcd")
-    print("slice_top Unit Test Success")
diff --git a/src/unused/simulator/__init__.py b/src/unused/simulator/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/src/unused/simulator/internalop_sim.py b/src/unused/simulator/internalop_sim.py
deleted file mode 100644 (file)
index f7a4b26..0000000
+++ /dev/null
@@ -1,196 +0,0 @@
-from soc.decoder.power_enums import (Function, Form, InternalOp,
-                                     In1Sel, In2Sel, In3Sel, OutSel,
-                                     RC, LdstLen, CryIn, get_csv,
-                                     single_bit_flags,
-                                     get_signal_name, default_values)
-import math
-
-
-class MemorySim:
-    def __init__(self, bytes_per_word=8):
-        self.mem = {}
-        self.bytes_per_word = bytes_per_word
-        self.word_log2 = math.ceil(math.log2(bytes_per_word))
-
-    def _get_shifter_mask(self, width, remainder):
-        shifter = ((self.bytes_per_word - width) - remainder) * \
-            8  # bits per byte
-        mask = (1 << (width * 8)) - 1
-        return shifter, mask
-
-    # TODO: Implement ld/st of lesser width
-    def ld(self, address, width=8):
-        remainder = address & (self.bytes_per_word - 1)
-        address = address >> self.word_log2
-        assert remainder & (width - 1) == 0, "Unaligned access unsupported!"
-        if address in self.mem:
-            val = self.mem[address]
-        else:
-            val = 0
-
-        if width != self.bytes_per_word:
-            shifter, mask = self._get_shifter_mask(width, remainder)
-            val = val & (mask << shifter)
-            val >>= shifter
-        print("Read {:x} from addr {:x}".format(val, address))
-        return val
-
-    def st(self, address, value, width=8):
-        remainder = address & (self.bytes_per_word - 1)
-        address = address >> self.word_log2
-        assert remainder & (width - 1) == 0, "Unaligned access unsupported!"
-        print("Writing {:x} to addr {:x}".format(value, address))
-        if width != self.bytes_per_word:
-            if address in self.mem:
-                val = self.mem[address]
-            else:
-                val = 0
-            shifter, mask = self._get_shifter_mask(width, remainder)
-            val &= ~(mask << shifter)
-            val |= value << shifter
-            self.mem[address] = val
-        else:
-            self.mem[address] = value
-
-
-class RegFile:
-    def __init__(self):
-        self.regfile = [0] * 32
-        self.sprs = {}
-
-    def write_reg(self, regnum, value):
-        all1s = (1 << 64)-1  # 64 bits worth of 1s
-        value &= all1s
-        print("Writing {:x} to reg r{}".format(value, regnum))
-        self.regfile[regnum] = value
-
-    def read_reg(self, regnum):
-        val = self.regfile[regnum]
-        print("Read {:x} from reg r{}".format(val, regnum))
-        return val
-
-    def assert_gpr(self, gpr, val):
-        reg_val = self.read_reg(gpr)
-        msg = "reg r{} got {:x}, expecting {:x}".format(
-            gpr, reg_val, val)
-        assert reg_val == val, msg
-
-    def assert_gprs(self, gprs):
-        for k, v in list(gprs.items()):
-            self.assert_gpr(k, v)
-
-    def set_xer(self, result, operanda, operandb):
-        xer = 0
-        if result & 1 << 64:
-            xer |= XER.CA
-
-        self.xer = xer
-
-
-class InternalOpSimulator:
-    def __init__(self):
-        self.mem_sim = MemorySim()
-        self.regfile = RegFile()
-
-    def execute_alu_op(self, op1, op2, internal_op, carry=0):
-        print(internal_op)
-        if internal_op == InternalOp.OP_ADD.value:
-            return op1 + op2 + carry
-        elif internal_op == InternalOp.OP_AND.value:
-            return op1 & op2
-        elif internal_op == InternalOp.OP_OR.value:
-            return op1 | op2
-        elif internal_op == InternalOp.OP_MUL_L64.value:
-            return op1 * op2
-        else:
-            assert False, "Not implemented"
-
-    def update_cr0(self, result):
-        if result == 0:
-            self.cr0 = 0b001
-        elif result >> 63:
-            self.cr0 = 0b100
-        else:
-            self.cr0 = 0b010
-        print("update_cr0", self.cr0)
-
-    def alu_op(self, pdecode2):
-        all1s = (1 << 64)-1  # 64 bits worth of 1s
-        internal_op = yield pdecode2.dec.op.internal_op
-        operand1 = 0
-        operand2 = 0
-        result = 0
-        carry = 0
-        r1_ok = yield pdecode2.e.read_reg1.ok
-        r2_ok = yield pdecode2.e.read_reg2.ok
-        r3_ok = yield pdecode2.e.read_reg3.ok
-        imm_ok = yield pdecode2.e.imm_data.ok
-        if r1_ok:
-            r1_sel = yield pdecode2.e.read_reg1.data
-            operand1 = self.regfile.read_reg(r1_sel)
-        elif r3_ok:
-            r3_sel = yield pdecode2.e.read_reg3.data
-            operand1 = self.regfile.read_reg(r3_sel)
-        if r2_ok:
-            r2_sel = yield pdecode2.e.read_reg2.data
-            operand2 = self.regfile.read_reg(r2_sel)
-        if imm_ok:
-            operand2 = yield pdecode2.e.imm_data.data
-
-        inv_a = yield pdecode2.dec.op.inv_a
-        if inv_a:
-            operand1 = (~operand1) & all1s
-
-        cry_in = yield pdecode2.dec.op.cry_in
-        if cry_in == CryIn.ONE.value:
-            carry = 1
-        elif cry_in == CryIn.CA.value:
-            carry = self.carry_out
-
-        # TODO rc_sel = yield pdecode2.dec.op.rc_sel
-        result = self.execute_alu_op(operand1, operand2, internal_op,
-                                     carry=carry)
-
-        cry_out = yield pdecode2.dec.op.cry_out
-        rc = yield pdecode2.e.rc.data
-
-        if rc:
-            self.update_cr0(result)
-        if cry_out == 1:
-            self.carry_out = (result >> 64)
-            print("setting carry_out", self.carry_out)
-
-        ro_ok = yield pdecode2.e.write_reg.ok
-        if ro_ok:
-            ro_sel = yield pdecode2.e.write_reg.data
-            self.regfile.write_reg(ro_sel, result)
-
-    def mem_op(self, pdecode2):
-        internal_op = yield pdecode2.dec.op.internal_op
-        addr_reg = yield pdecode2.e.read_reg1.data
-        addr = self.regfile.read_reg(addr_reg)
-
-        imm_ok = yield pdecode2.e.imm_data.ok
-        r2_ok = yield pdecode2.e.read_reg2.ok
-        width = yield pdecode2.e.data_len
-        if imm_ok:
-            imm = yield pdecode2.e.imm_data.data
-            addr += imm
-        elif r2_ok:
-            r2_sel = yield pdecode2.e.read_reg2.data
-            addr += self.regfile.read_reg(r2_sel)
-        if internal_op == InternalOp.OP_STORE.value:
-            val_reg = yield pdecode2.e.read_reg3.data
-            val = self.regfile.read_reg(val_reg)
-            self.mem_sim.st(addr, val, width)
-        elif internal_op == InternalOp.OP_LOAD.value:
-            dest_reg = yield pdecode2.e.write_reg.data
-            val = self.mem_sim.ld(addr, width)
-            self.regfile.write_reg(dest_reg, val)
-
-    def execute_op(self, pdecode2):
-        function = yield pdecode2.dec.op.function_unit
-        if function == Function.ALU.value:
-            yield from self.alu_op(pdecode2)
-        elif function == Function.LDST.value:
-            yield from self.mem_op(pdecode2)
diff --git a/src/unused/simulator/test_sim.py b/src/unused/simulator/test_sim.py
deleted file mode 100644 (file)
index 0a26380..0000000
+++ /dev/null
@@ -1,140 +0,0 @@
-from nmigen import Module, Signal
-from nmigen.back.pysim import Simulator, Delay
-from nmigen.test.utils import FHDLTestCase
-import unittest
-from soc.simulator.internalop_sim import InternalOpSimulator
-from soc.decoder.power_decoder import (create_pdecode)
-from soc.decoder.power_enums import (Function, InternalOp,
-                                     In1Sel, In2Sel, In3Sel,
-                                     OutSel, RC, LdstLen, CryIn,
-                                     single_bit_flags, Form, SPR,
-                                     get_signal_name, get_csv)
-from soc.decoder.power_decoder2 import (PowerDecode2)
-from soc.simulator.program import Program
-from soc.simulator.qemu import run_program
-
-
-class Register:
-    def __init__(self, num):
-        self.num = num
-
-
-class DecoderTestCase(FHDLTestCase):
-
-    def run_tst(self, generator, simulator):
-        m = Module()
-        comb = m.d.comb
-        instruction = Signal(32)
-
-        pdecode = create_pdecode()
-
-        m.submodules.pdecode2 = pdecode2 = PowerDecode2(pdecode)
-        comb += pdecode2.dec.raw_opcode_in.eq(instruction)
-        sim = Simulator(m)
-        gen = generator.generate_instructions()
-
-        def process():
-            for ins in gen:
-
-                print("0x{:X}".format(ins & 0xffffffff))
-
-                # ask the decoder to decode this binary data (endian'd)
-                yield pdecode2.dec.bigendian.eq(0)  # little / big?
-                yield instruction.eq(ins)          # raw binary instr.
-                yield Delay(1e-6)
-                yield from simulator.execute_op(pdecode2)
-
-        sim.add_process(process)
-        with sim.write_vcd("simulator.vcd", "simulator.gtkw",
-                           traces=pdecode2.ports()):
-            sim.run()
-
-    def test_example(self):
-        lst = ["addi 1, 0, 0x1234",
-               "addi 2, 0, 0x5678",
-               "add  3, 1, 2",
-               "and  4, 1, 2"]
-        with Program(lst) as program:
-            self.run_tst_program(program, [1, 2, 3, 4])
-
-    def test_ldst(self):
-        lst = ["addi 1, 0, 0x1234",
-               "addi 2, 0, 0x5678",
-               "stw  1, 0(2)",
-               "lwz  3, 0(2)"]
-        with Program(lst) as program:
-            self.run_tst_program(program, [1, 2, 3])
-
-    def test_ldst_extended(self):
-        lst = ["addi 1, 0, 0x1234",
-               "addi 2, 0, 0x5678",
-               "addi 4, 0, 0x40",
-               "stw  1, 0x40(2)",
-               "lwzx  3, 4, 2"]
-        with Program(lst) as program:
-            self.run_tst_program(program, [1, 2, 3])
-
-    def test_ldst_widths(self):
-        lst = [" lis 1, 0xdead",
-               "ori 1, 1, 0xbeef",
-               "addi 2, 0, 0x1000",
-               "std 1, 0(2)",
-               "lbz 1, 5(2)",
-               "lhz 3, 4(2)",
-               "lwz 4, 4(2)",
-               "addi 5, 0, 0x12",
-               "stb 5, 5(2)",
-               "ld  5, 0(2)"]
-        with Program(lst) as program:
-            self.run_tst_program(program, [1, 2, 3, 4, 5])
-
-    def test_sub(self):
-        lst = ["addi 1, 0, 0x1234",
-               "addi 2, 0, 0x5678",
-               "subf 3, 1, 2",
-               "subfic 4, 1, 0x1337",
-               "neg 5, 1"]
-        with Program(lst) as program:
-            self.run_tst_program(program, [1, 2, 3, 4, 5])
-
-    def test_add_with_carry(self):
-        lst = ["addi 1, 0, 5",
-               "neg 1, 1",
-               "addi 2, 0, 7",
-               "neg 2, 2",
-               "addc 3, 2, 1",
-               "addi 3, 3, 1"
-               ]
-        with Program(lst) as program:
-            self.run_tst_program(program, [1, 2, 3])
-
-    def test_addis(self):
-        lst = ["addi 1, 0, 0x0FFF",
-               "addis 1, 1, 0x0F"
-               ]
-        with Program(lst) as program:
-            self.run_tst_program(program, [1])
-
-    def test_mulli(self):
-        lst = ["addi 1, 0, 3",
-               "mulli 1, 1, 2"
-               ]
-        with Program(lst) as program:
-            self.run_tst_program(program, [1])
-
-    def run_tst_program(self, prog, reglist):
-        simulator = InternalOpSimulator()
-        self.run_tst(prog, simulator)
-        prog.reset()
-        with run_program(prog) as q:
-            qemu_register_compare(simulator, q, reglist)
-
-
-def qemu_register_compare(simulator, qemu, regs):
-    for reg in regs:
-        qemu_val = qemu.get_register(reg)
-        simulator.regfile.assert_gpr(reg, qemu_val)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/unused_please_ignore_completely/TLB/.gitignore b/unused_please_ignore_completely/TLB/.gitignore
new file mode 100644 (file)
index 0000000..3324664
--- /dev/null
@@ -0,0 +1,2 @@
+*.wpr
+__pycache__
diff --git a/unused_please_ignore_completely/TLB/AddressEncoder.py b/unused_please_ignore_completely/TLB/AddressEncoder.py
new file mode 100644 (file)
index 0000000..49da819
--- /dev/null
@@ -0,0 +1,77 @@
+from nmigen import Module, Signal, Elaboratable
+from nmigen.lib.coding import Encoder, PriorityEncoder
+
+
+class AddressEncoder(Elaboratable):
+    """Address Encoder
+
+       The purpose of this module is to take in a vector and
+       encode the bits that are one hot into an address. This module
+       combines both nmigen's Encoder and PriorityEncoder and will state
+       whether the input line has a single bit hot, multiple bits hot,
+       or no bits hot. The output line will always have the lowest value
+       address output.
+
+       Usage:
+       The output is valid when either single or multiple match is high.
+       Otherwise output is 0.
+    """
+
+    def __init__(self, width):
+        """ Arguments:
+            * width: The desired length of the input vector
+        """
+        # Internal
+        self.encoder = Encoder(width)
+        self.p_encoder = PriorityEncoder(width)
+
+        # Input
+        self.i = Signal(width)
+
+        # Output
+        self.single_match = Signal(1)
+        self.multiple_match = Signal(1)
+        self.o = Signal(range(width))
+
+    def elaborate(self, platform=None):
+        m = Module()
+
+        # Add internal submodules
+        m.submodules.encoder = self.encoder
+        m.submodules.p_encoder = self.p_encoder
+
+        m.d.comb += [
+            self.encoder.i.eq(self.i),
+            self.p_encoder.i.eq(self.i)
+        ]
+
+        # Steps:
+        # 1. check if the input vector is non-zero
+        # 2. if non-zero, check if single match or multiple match
+        # 3. set output line to be lowest value address output
+
+        # If the priority encoder recieves an input of 0
+        # If n is 1 then the output is not valid
+        with m.If(self.p_encoder.n):
+            m.d.comb += [
+                self.single_match.eq(0),
+                self.multiple_match.eq(0),
+                self.o.eq(0)
+            ]
+        # If the priority encoder recieves an input > 0
+        with m.Else():
+            # Multiple Match if encoder n is invalid
+            with m.If(self.encoder.n):
+                m.d.comb += [
+                    self.single_match.eq(0),
+                    self.multiple_match.eq(1)
+                ]
+            # Single Match if encoder n is valid
+            with m.Else():
+                m.d.comb += [
+                    self.single_match.eq(1),
+                    self.multiple_match.eq(0)
+                ]
+            # Always set output based on priority encoder output
+            m.d.comb += self.o.eq(self.p_encoder.o)
+        return m
diff --git a/unused_please_ignore_completely/TLB/Cam.py b/unused_please_ignore_completely/TLB/Cam.py
new file mode 100644 (file)
index 0000000..c5fd069
--- /dev/null
@@ -0,0 +1,126 @@
+from nmigen import Array, Cat, Module, Signal, Elaboratable
+from nmigen.lib.coding import Decoder
+from nmigen.cli import main  # , verilog
+
+from .CamEntry import CamEntry
+from .AddressEncoder import AddressEncoder
+
+
+class Cam(Elaboratable):
+    """ Content Addressable Memory (CAM)
+
+        The purpose of this module is to quickly look up whether an
+        entry exists given a data key.
+        This module will search for the given data in all internal entries
+        and output whether a  single or multiple match was found.
+        If an single entry is found the address be returned and single_match
+        is set HIGH. If multiple entries are found the lowest address is
+        returned and multiple_match is set HIGH. If neither single_match or
+        multiple_match are HIGH this implies no match was found. To write
+        to the CAM set the address bus to the desired entry and set write_enable
+        HIGH. Entry managment should be performed one level above this block
+        as lookup is performed within.
+
+        Notes:
+        The read and write operations take one clock cycle to complete.
+        Currently the read_warning line is present for interfacing but
+        is not necessary for this design. This module is capable of writing
+        in the first cycle, reading on the second, and output the correct
+        address on the third.
+    """
+
+    def __init__(self, data_size, cam_size):
+        """ Arguments:
+            * data_size: (bits) The bit size of the data
+            * cam_size: (number) The number of entries in the CAM
+        """
+
+        # Internal
+        self.cam_size = cam_size
+        self.encoder = AddressEncoder(cam_size)
+        self.decoder = Decoder(cam_size)
+        self.entry_array = Array(CamEntry(data_size) for x in range(cam_size))
+
+        # Input
+        self.enable = Signal(1)
+        self.write_enable = Signal(1)
+        self.data_in = Signal(data_size)  # The data to be written
+        self.data_mask = Signal(data_size)  # mask for ternary writes
+        # address of CAM Entry to write
+        self.address_in = Signal(range(cam_size))
+
+        # Output
+        self.read_warning = Signal(1)  # High when a read interrupts a write
+        self.single_match = Signal(1)  # High when there is only one match
+        self.multiple_match = Signal(1)  # High when there at least two matches
+        # The lowest address matched
+        self.match_address = Signal(range(cam_size))
+
+    def elaborate(self, platform=None):
+        m = Module()
+        # AddressEncoder for match types and output address
+        m.submodules.AddressEncoder = self.encoder
+        # Decoder is used to select which entry will be written to
+        m.submodules.Decoder = self.decoder
+        # CamEntry Array Submodules
+        # Note these area added anonymously
+        entry_array = self.entry_array
+        m.submodules += entry_array
+
+        # Decoder logic
+        m.d.comb += [
+            self.decoder.i.eq(self.address_in),
+            self.decoder.n.eq(0)
+        ]
+
+        encoder_vector = []
+        with m.If(self.enable):
+            # Set the key value for every CamEntry
+            for index in range(self.cam_size):
+
+                # Write Operation
+                with m.If(self.write_enable):
+                    with m.If(self.decoder.o[index]):
+                        m.d.comb += entry_array[index].command.eq(2)
+                    with m.Else():
+                        m.d.comb += entry_array[index].command.eq(0)
+
+                # Read Operation
+                with m.Else():
+                    m.d.comb += entry_array[index].command.eq(1)
+
+                # Send data input to all entries
+                m.d.comb += entry_array[index].data_in.eq(self.data_in)
+                # Send all entry matches to encoder
+                ematch = entry_array[index].match
+                encoder_vector.append(ematch)
+
+            # Give input to and accept output from encoder module
+            m.d.comb += [
+                self.encoder.i.eq(Cat(*encoder_vector)),
+                self.single_match.eq(self.encoder.single_match),
+                self.multiple_match.eq(self.encoder.multiple_match),
+                self.match_address.eq(self.encoder.o)
+            ]
+
+        # If the CAM is not enabled set all outputs to 0
+        with m.Else():
+            m.d.comb += [
+                self.read_warning.eq(0),
+                self.single_match.eq(0),
+                self.multiple_match.eq(0),
+                self.match_address.eq(0)
+            ]
+
+        return m
+
+    def ports(self):
+        return [self.enable, self.write_enable,
+                self.data_in, self.data_mask,
+                self.read_warning, self.single_match,
+                self.multiple_match, self.match_address]
+
+
+if __name__ == '__main__':
+    cam = Cam(4, 4)
+    main(cam, ports=cam.ports())
diff --git a/unused_please_ignore_completely/TLB/CamEntry.py b/unused_please_ignore_completely/TLB/CamEntry.py
new file mode 100644 (file)
index 0000000..b1d9308
--- /dev/null
@@ -0,0 +1,46 @@
+from nmigen import Module, Signal, Elaboratable
+
+
+class CamEntry(Elaboratable):
+    """ Content Addressable Memory (CAM) Entry
+
+        The purpose of this module is to represent an entry within a CAM.
+        This module when given a read command will compare  the given data
+        and output whether a match was found or not. When given a write
+        command it will write the given data into internal registers.
+    """
+
+    def __init__(self, data_size):
+        """ Arguments:
+            * data_size: (bit count) The size of the data
+        """
+        # Input
+        self.command = Signal(2) # 00 => NA 01 => Read 10 => Write 11 => Reset
+        self.data_in = Signal(data_size) # Data input when writing
+
+        # Output
+        self.match = Signal(1) # Result of the internal/input key comparison
+        self.data = Signal(data_size)
+
+    def elaborate(self, platform=None):
+        m = Module()
+        with m.Switch(self.command):
+            with m.Case("00"):
+                m.d.sync += self.match.eq(0)
+            with m.Case("01"):
+                with m.If(self.data == self.data_in):
+                    m.d.sync += self.match.eq(1)
+                with m.Else():
+                    m.d.sync += self.match.eq(0)
+            with m.Case("10"):
+                m.d.sync += [
+                    self.data.eq(self.data_in),
+                    self.match.eq(0)
+                ]
+            with m.Case():
+                m.d.sync += [
+                    self.match.eq(0),
+                    self.data.eq(0)
+                ]
+
+        return m
diff --git a/unused_please_ignore_completely/TLB/LFSR.py b/unused_please_ignore_completely/TLB/LFSR.py
new file mode 100644 (file)
index 0000000..d8b606e
--- /dev/null
@@ -0,0 +1,109 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# See Notices.txt for copyright information
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+from nmigen.cli import verilog, rtlil
+
+
+class LFSRPolynomial(set):
+    """ implements a polynomial for use in LFSR
+    """
+    def __init__(self, exponents=()):
+        for e in exponents:
+            assert isinstance(e, int), TypeError("%s must be an int" % repr(e))
+            assert (e >= 0), ValueError("%d must not be negative" % e)
+        set.__init__(self, set(exponents).union({0})) # must contain zero
+
+    @property
+    def max_exponent(self):
+        return max(self) # derived from set, so this returns the max exponent
+
+    @property
+    def exponents(self):
+        exponents = list(self) # get elements of set as a list
+        exponents.sort(reverse=True)
+        return exponents
+
+    def __str__(self):
+        expd = {0: "1", 1: 'x', 2: "x^{}"} # case 2 isn't 2, it's min(i,2)
+        retval = map(lambda i: expd[min(i,2)].format(i), self.exponents)
+        return " + ".join(retval)
+
+    def __repr__(self):
+        return "LFSRPolynomial(%s)" % self.exponents
+
+
+# list of selected polynomials from https://web.archive.org/web/20190418121923/https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Some_polynomials_for_maximal_LFSRs  # noqa
+LFSR_POLY_2 = LFSRPolynomial([2, 1, 0])
+LFSR_POLY_3 = LFSRPolynomial([3, 2, 0])
+LFSR_POLY_4 = LFSRPolynomial([4, 3, 0])
+LFSR_POLY_5 = LFSRPolynomial([5, 3, 0])
+LFSR_POLY_6 = LFSRPolynomial([6, 5, 0])
+LFSR_POLY_7 = LFSRPolynomial([7, 6, 0])
+LFSR_POLY_8 = LFSRPolynomial([8, 6, 5, 4, 0])
+LFSR_POLY_9 = LFSRPolynomial([9, 5, 0])
+LFSR_POLY_10 = LFSRPolynomial([10, 7, 0])
+LFSR_POLY_11 = LFSRPolynomial([11, 9, 0])
+LFSR_POLY_12 = LFSRPolynomial([12, 11, 10, 4, 0])
+LFSR_POLY_13 = LFSRPolynomial([13, 12, 11, 8, 0])
+LFSR_POLY_14 = LFSRPolynomial([14, 13, 12, 2, 0])
+LFSR_POLY_15 = LFSRPolynomial([15, 14, 0])
+LFSR_POLY_16 = LFSRPolynomial([16, 15, 13, 4, 0])
+LFSR_POLY_17 = LFSRPolynomial([17, 14, 0])
+LFSR_POLY_18 = LFSRPolynomial([18, 11, 0])
+LFSR_POLY_19 = LFSRPolynomial([19, 18, 17, 14, 0])
+LFSR_POLY_20 = LFSRPolynomial([20, 17, 0])
+LFSR_POLY_21 = LFSRPolynomial([21, 19, 0])
+LFSR_POLY_22 = LFSRPolynomial([22, 21, 0])
+LFSR_POLY_23 = LFSRPolynomial([23, 18, 0])
+LFSR_POLY_24 = LFSRPolynomial([24, 23, 22, 17, 0])
+
+
+class LFSR(LFSRPolynomial, Elaboratable):
+    """ implements a Linear Feedback Shift Register
+    """
+    def __init__(self, polynomial):
+        """ Inputs:
+            ------
+            :polynomial: the polynomial to feedback on.  may be a LFSRPolynomial
+                         instance or an iterable of ints (list/tuple/generator)
+            :enable:     enable (set LO to disable.  NOTE: defaults to HI)
+
+            Outputs:
+            -------
+            :state: the LFSR state.  bitwidth is taken from the polynomial
+                    maximum exponent.
+
+            Note: if an LFSRPolynomial is passed in as the input, because
+            LFSRPolynomial is derived from set() it's ok:
+            LFSRPolynomial(LFSRPolynomial(p)) == LFSRPolynomial(p)
+        """
+        LFSRPolynomial.__init__(self, polynomial)
+        self.state = Signal(self.max_exponent, reset=1)
+        self.enable = Signal(reset=1)
+
+    def elaborate(self, platform):
+        m = Module()
+        # do absolutely nothing if the polynomial is empty (always has a zero)
+        if self.max_exponent <= 1:
+            return m
+
+        # create XOR-bunch, select bits from state based on exponent
+        feedback = Const(0) # doesn't do any harm starting from 0b0 (xor chain)
+        for exponent in self:
+            if exponent > 0: # don't have to skip, saves CPU cycles though
+                feedback ^= self.state[exponent - 1]
+
+        # if enabled, shift-and-feedback
+        with m.If(self.enable):
+            # shift up lower bits by Cat'ing in a new bit zero (feedback)
+            newstate = Cat(feedback, self.state[:-1])
+            m.d.sync += self.state.eq(newstate)
+
+        return m
+
+
+# example: Poly24
+if __name__ == '__main__':
+    p24 = rtlil.convert(LFSR(LFSR_POLY_24))
+    with open("lfsr2_p24.il", "w") as f:
+        f.write(p24)
diff --git a/unused_please_ignore_completely/TLB/LFSR.pyi b/unused_please_ignore_completely/TLB/LFSR.pyi
new file mode 100644 (file)
index 0000000..64eb911
--- /dev/null
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# See Notices.txt for copyright information
+from nmigen import Module
+from typing import Iterable, Optional, Iterator, Any, Union
+from typing_extensions import final
+
+
+@final
+class LFSRPolynomial(set):
+    def __init__(self, exponents: Iterable[int] = ()):
+        def elements() -> Iterable[int]: ...
+    @property
+    def exponents(self) -> list[int]: ...
+    def __str__(self) -> str: ...
+    def __repr__(self) -> str: ...
+
+
+@final
+class LFSR:
+    def __init__(self, polynomial: Union[Iterable[int], LFSRPolynomial]): ...
+    @property
+    def width(self) -> int: ...
+    def elaborate(self, platform: Any) -> Module: ...
diff --git a/unused_please_ignore_completely/TLB/Makefile b/unused_please_ignore_completely/TLB/Makefile
new file mode 100644 (file)
index 0000000..1eb67ac
--- /dev/null
@@ -0,0 +1,2 @@
+verilog:
+       python3 Cam.py generate -t v > Cam.v
diff --git a/unused_please_ignore_completely/TLB/MemorySet.py b/unused_please_ignore_completely/TLB/MemorySet.py
new file mode 100644 (file)
index 0000000..11890ed
--- /dev/null
@@ -0,0 +1,66 @@
+from nmigen import Cat, Memory, Module, Signal, Elaboratable
+from nmigen.cli import main
+from nmigen.cli import verilog, rtlil
+
+
+class MemorySet(Elaboratable):
+    def __init__(self, data_size, tag_size, set_count, active):
+        self.active = active
+        input_size = tag_size + data_size  # Size of the input data
+        memory_width = input_size + 1  # The width of the cache memory
+        self.active = active
+        self.data_size = data_size
+        self.tag_size = tag_size
+
+        # XXX TODO, use rd-enable and wr-enable?
+        self.mem = Memory(width=memory_width, depth=set_count)
+        self.r = self.mem.read_port()
+        self.w = self.mem.write_port()
+
+        # inputs (address)
+        self.cset = Signal(range(set_count))  # The set to be checked
+        self.tag = Signal(tag_size)        # The tag to find
+        self.data_i = Signal(data_size)    # Incoming data
+
+        # outputs
+        self.valid = Signal()
+        self.data_o = Signal(data_size)    # Outgoing data (excludes tag)
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.mem = self.mem
+        m.submodules.r = self.r
+        m.submodules.w = self.w
+
+        # temporaries
+        active_bit = Signal()
+        tag_valid = Signal()
+        data_start = self.active + 1
+        data_end = data_start + self.data_size
+        tag_start = data_end
+        tag_end = tag_start + self.tag_size
+
+        # connect the read port address to the set/entry
+        read_port = self.r
+        m.d.comb += read_port.addr.eq(self.cset)
+        # Pull out active bit from data
+        data = read_port.data
+        m.d.comb += active_bit.eq(data[self.active])
+        # Validate given tag vs stored tag
+        tag = data[tag_start:tag_end]
+        m.d.comb += tag_valid.eq(self.tag == tag)
+        # An entry is only valid if the tags match AND
+        # is marked as a valid entry
+        m.d.comb += self.valid.eq(tag_valid & active_bit)
+
+        # output data: TODO, check rd-enable?
+        m.d.comb += self.data_o.eq(data[data_start:data_end])
+
+        # connect the write port addr to the set/entry (only if write enabled)
+        # (which is only done on a match, see SAC.write_entry below)
+        write_port = self.w
+        with m.If(write_port.en):
+            m.d.comb += write_port.addr.eq(self.cset)
+            m.d.comb += write_port.data.eq(Cat(1, self.data_i, self.tag))
+
+        return m
diff --git a/unused_please_ignore_completely/TLB/PermissionValidator.py b/unused_please_ignore_completely/TLB/PermissionValidator.py
new file mode 100644 (file)
index 0000000..5bc90b2
--- /dev/null
@@ -0,0 +1,68 @@
+from nmigen import Module, Signal, Elaboratable
+from nmigen.cli import main
+
+from soc.TLB.PteEntry import PteEntry
+
+
+class PermissionValidator(Elaboratable):
+    """ The purpose of this Module is to check the Permissions of a given PTE
+        against the requested access permissions.
+
+        This module will either validate (by setting the valid bit HIGH)
+        the request or find a permission fault and invalidate (by setting
+        the valid bit LOW) the request
+    """
+
+    def __init__(self, asid_size, pte_size):
+        """ Arguments:
+            * asid_size: (bit count) The size of the asid to be processed
+            * pte_size: (bit count) The size of the pte to be processed
+
+            Return:
+            * valid HIGH when permissions are correct
+        """
+        # Internal
+        self.pte_entry = PteEntry(asid_size, pte_size)
+
+        # Input
+        self.data = Signal(asid_size + pte_size)
+        self.xwr = Signal(3)  # Execute, Write, Read
+        self.super_mode = Signal(1)  # Supervisor Mode
+        self.super_access = Signal(1)  # Supervisor Access
+        self.asid = Signal(15)  # Address Space IDentifier (ASID)
+
+        # Output
+        self.valid = Signal(1)  # Denotes if the permissions are correct
+
+    def elaborate(self, platform=None):
+        m = Module()
+
+        m.submodules.pte_entry = self.pte_entry
+
+        m.d.comb += self.pte_entry.i.eq(self.data)
+
+        # Check if the entry is valid
+        with m.If(self.pte_entry.v):
+            # ASID match or Global Permission
+            # Note that the MSB bound is exclusive
+            with m.If((self.pte_entry.asid == self.asid) | self.pte_entry.g):
+                # Check Execute, Write, Read (XWR) Permissions
+                with m.If(self.pte_entry.xwr == self.xwr):
+                    # Supervisor Logic
+                    with m.If(self.super_mode):
+                        # Valid if entry is not in user mode or supervisor
+                        # has Supervisor User Memory (SUM) access via the
+                        # SUM bit in the sstatus register
+                        m.d.comb += self.valid.eq((~self.pte_entry.u)
+                                                  | self.super_access)
+                    # User logic
+                    with m.Else():
+                        # Valid if the entry is in user mode only
+                        m.d.comb += self.valid.eq(self.pte_entry.u)
+                with m.Else():
+                    m.d.comb += self.valid.eq(0)
+            with m.Else():
+                m.d.comb += self.valid.eq(0)
+        with m.Else():
+            m.d.comb += self.valid.eq(0)
+        return m
diff --git a/unused_please_ignore_completely/TLB/PteEntry.py b/unused_please_ignore_completely/TLB/PteEntry.py
new file mode 100644 (file)
index 0000000..73ea922
--- /dev/null
@@ -0,0 +1,67 @@
+from nmigen import Module, Signal, Elaboratable
+from nmigen.cli import main
+
+
+class PteEntry(Elaboratable):
+    """ The purpose of this Module is to  centralize the parsing of Page
+        Table Entries (PTE) into one module to prevent common mistakes
+        and duplication of code. The control bits are parsed out for
+        ease of use.
+
+        This module parses according to the standard PTE given by the
+        Volume II: RISC-V Privileged Architectures V1.10 Pg 60.
+        The Address Space IDentifier (ASID) is appended to the MSB of the input
+        and is parsed out as such.
+
+        An valid input Signal would be:
+              ASID   PTE
+        Bits:[78-64][63-0]
+
+        The output PTE value will include the control bits.
+    """
+    def __init__(self, asid_size, pte_size):
+        """ Arguments:
+            * asid_size: (bit count) The size of the asid to be processed
+            * pte_size: (bit count) The size of the pte to be processed
+
+            Return:
+            * d The Dirty bit from the PTE portion of i
+            * a The Accessed bit from the PTE portion of i
+            * g The Global bit from the PTE portion of i
+            * u The User Mode bit from the PTE portion of i
+            * xwr The Execute/Write/Read bit from the PTE portion of i
+            * v The Valid bit from the PTE portion of i
+            * asid The asid portion of i
+            * pte The pte portion of i
+        """
+        # Internal
+        self.asid_start = pte_size
+        self.asid_end = pte_size + asid_size
+
+        # Input
+        self.i = Signal(asid_size + pte_size)
+
+        # Output
+        self.d = Signal(1) # Dirty bit (From pte)
+        self.a = Signal(1) # Accessed bit (From pte)
+        self.g = Signal(1) # Global Access (From pte)
+        self.u = Signal(1) # User Mode (From pte)
+        self.xwr = Signal(3) # Execute Read Write (From pte)
+        self.v = Signal(1) # Valid (From pte)
+        self.asid = Signal(asid_size) # Associated Address Space IDentifier
+        self.pte = Signal(pte_size) # Full Page Table Entry
+
+    def elaborate(self, platform=None):
+        m = Module()
+        # Pull out all control bites from PTE
+        m.d.comb += [
+            self.d.eq(self.i[7]),
+            self.a.eq(self.i[6]),
+            self.g.eq(self.i[5]),
+            self.u.eq(self.i[4]),
+            self.xwr.eq(self.i[1:4]),
+            self.v.eq(self.i[0])
+        ]
+        m.d.comb += self.asid.eq(self.i[self.asid_start:self.asid_end])
+        m.d.comb += self.pte.eq(self.i[0:self.asid_start])
+        return m
diff --git a/unused_please_ignore_completely/TLB/SetAssociativeCache.py b/unused_please_ignore_completely/TLB/SetAssociativeCache.py
new file mode 100644 (file)
index 0000000..30ad809
--- /dev/null
@@ -0,0 +1,274 @@
+"""
+
+Online simulator of 4-way set-associative cache:
+http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/sa4.html
+
+Python simulator of a N-way set-associative cache:
+https://github.com/vaskevich/CacheSim/blob/master/cachesim.py
+"""
+
+from nmigen import Array, Cat, Memory, Module, Signal, Mux, Elaboratable
+from nmigen.compat.genlib import fsm
+from nmigen.cli import main
+from nmigen.cli import verilog, rtlil
+
+from .AddressEncoder import AddressEncoder
+from .MemorySet import MemorySet
+
+# TODO: use a LFSR that advances continuously and picking the bottom
+# few bits from it to select which cache line to replace, instead of PLRU
+# http://bugs.libre-riscv.org/show_bug.cgi?id=71
+from .ariane.plru import PLRU
+from .LFSR import LFSR, LFSR_POLY_24
+
+SA_NA = "00"  # no action (none)
+SA_RD = "01"  # read
+SA_WR = "10"  # write
+
+
+class SetAssociativeCache(Elaboratable):
+    """ Set Associative Cache Memory
+
+        The purpose of this module is to generate a memory cache given the
+        constraints passed in. This will create a n-way set associative cache.
+        It is expected for the SV TLB that the VMA will provide the set number
+        while the ASID provides the tag (still to be decided).
+
+    """
+
+    def __init__(self, tag_size, data_size, set_count, way_count, lfsr=False):
+        """ Arguments
+            * tag_size (bits): The bit count of the tag
+            * data_size (bits): The bit count of the data to be stored
+            * set_count (number): The number of sets/entries in the cache
+            * way_count (number): The number of slots a data can be stored
+                                  in one set
+            * lfsr: if set, use an LFSR for (pseudo-randomly) selecting
+                    set/entry to write to.  otherwise, use a PLRU
+        """
+        # Internals
+        self.lfsr_mode = lfsr
+        self.way_count = way_count  # The number of slots in one set
+        self.tag_size = tag_size    # The bit count of the tag
+        self.data_size = data_size  # The bit count of the data to be stored
+
+        # set up Memory array
+        self.mem_array = Array()  # memory array
+        for i in range(way_count):
+            ms = MemorySet(data_size, tag_size, set_count, active=0)
+            self.mem_array.append(ms)
+
+        # Finds valid entries
+        self.encoder = AddressEncoder(way_count)
+
+        # setup PLRU or LFSR
+        if lfsr:
+            # LFSR mode
+            self.lfsr = LFSR(LFSR_POLY_24)
+        else:
+            # PLRU mode
+            # One block to handle plru calculations
+            self.plru = PLRU(way_count)
+            self.plru_array = Array()  # PLRU data on each set
+            for i in range(set_count):
+                name = "plru%d" % i
+                self.plru_array.append(Signal(self.plru.TLBSZ, name=name))
+
+        # Input
+        self.enable = Signal(1)   # Whether the cache is enabled
+        self.command = Signal(2)  # 00=None, 01=Read, 10=Write (see SA_XX)
+        self.cset = Signal(range(set_count))  # The set to be checked
+        self.tag = Signal(tag_size)        # The tag to find
+        self.data_i = Signal(data_size)    # The input data
+
+        # Output
+        self.ready = Signal(1)  # 0 => Processing 1 => Ready for commands
+        self.hit = Signal(1)            # Tag matched one way in the given set
+        # Tag matched many ways in the given set
+        self.multiple_hit = Signal(1)
+        self.data_o = Signal(data_size)  # The data linked to the matched tag
+
+    def check_tags(self, m):
+        """ Validate the tags in the selected set. If one and only one
+            tag matches set its state to zero and increment all others
+            by one. We only advance to next state if a single hit is found.
+        """
+        # Vector to store way valid results
+        # A zero denotes a way is invalid
+        valid_vector = []
+        # Loop through memory to prep read/write ports and set valid_vector
+        for i in range(self.way_count):
+            valid_vector.append(self.mem_array[i].valid)
+
+        # Pass encoder the valid vector
+        m.d.comb += self.encoder.i.eq(Cat(*valid_vector))
+
+        # Only one entry should be marked
+        # This is due to already verifying the tags
+        # matched and the valid bit is high
+        with m.If(self.hit):
+            m.next = "FINISHED_READ"
+            # Pull out data from the read port
+            data = self.mem_array[self.encoder.o].data_o
+            m.d.comb += self.data_o.eq(data)
+            if not self.lfsr_mode:
+                self.access_plru(m)
+
+        # Oh no! Seal the gates! Multiple tags matched?!? kasd;ljkafdsj;k
+        with m.Elif(self.multiple_hit):
+            # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
+            m.d.comb += self.data_o.eq(0)
+
+        # No tag matches means no data
+        with m.Else():
+            # XXX TODO, m.next = "FINISHED_READ" ? otherwise stuck
+            m.d.comb += self.data_o.eq(0)
+
+    def access_plru(self, m):
+        """ An entry was accessed and the plru tree must now be updated
+        """
+        # Pull out the set's entry being edited
+        plru_entry = self.plru_array[self.cset]
+        m.d.comb += [
+            # Set the plru data to the current state
+            self.plru.plru_tree.eq(plru_entry),
+            # Set that the cache was accessed
+            self.plru.lu_access_i.eq(1)
+        ]
+
+    def read(self, m):
+        """ Go through the read process of the cache.
+            This takes two cycles to complete. First it checks for a valid tag
+            and secondly it updates the LRU values.
+        """
+        with m.FSM() as fsm_read:
+            with m.State("READY"):
+                m.d.comb += self.ready.eq(0)
+                # check_tags will set the state if the conditions are met
+                self.check_tags(m)
+            with m.State("FINISHED_READ"):
+                m.next = "READY"
+                m.d.comb += self.ready.eq(1)
+                if not self.lfsr_mode:
+                    plru_tree_o = self.plru.plru_tree_o
+                    m.d.sync += self.plru_array[self.cset].eq(plru_tree_o)
+
+    def write_entry(self, m):
+        if not self.lfsr_mode:
+            m.d.comb += [  # set cset (mem address) into PLRU
+                self.plru.plru_tree.eq(self.plru_array[self.cset]),
+                # and connect plru to encoder for write
+                self.encoder.i.eq(self.plru.replace_en_o)
+            ]
+            write_port = self.mem_array[self.encoder.o].w
+        else:
+            # use the LFSR to generate a random(ish) one of the mem array
+            lfsr_output = Signal(range(self.way_count))
+            lfsr_random = Signal(range(self.way_count))
+            m.d.comb += lfsr_output.eq(self.lfsr.state)  # lose some bits
+            # address too big, limit to range of array
+            m.d.comb += lfsr_random.eq(Mux(lfsr_output > self.way_count,
+                                           lfsr_output - self.way_count,
+                                           lfsr_output))
+            write_port = self.mem_array[lfsr_random].w
+
+        # then if there is a match from the encoder, enable the selected write
+        with m.If(self.encoder.single_match):
+            m.d.comb += write_port.en.eq(1)
+
+    def write(self, m):
+        """ Go through the write process of the cache.
+            This takes two cycles to complete. First it writes the entry,
+            and secondly it updates the PLRU (in plru mode)
+        """
+        with m.FSM() as fsm_write:
+            with m.State("READY"):
+                m.d.comb += self.ready.eq(0)
+                self.write_entry(m)
+                m.next = "FINISHED_WRITE"
+            with m.State("FINISHED_WRITE"):
+                m.d.comb += self.ready.eq(1)
+                if not self.lfsr_mode:
+                    plru_entry = self.plru_array[self.cset]
+                    m.d.sync += plru_entry.eq(self.plru.plru_tree_o)
+                m.next = "READY"
+
+    def elaborate(self, platform=None):
+        m = Module()
+
+        # ----
+        # set up Modules: AddressEncoder, LFSR/PLRU, Mem Array
+        # ----
+
+        m.submodules.AddressEncoder = self.encoder
+        if self.lfsr_mode:
+            m.submodules.LFSR = self.lfsr
+        else:
+            m.submodules.PLRU = self.plru
+
+        for i, mem in enumerate(self.mem_array):
+            setattr(m.submodules, "mem%d" % i, mem)
+
+        # ----
+        # select mode: PLRU connect to encoder, LFSR do... something
+        # ----
+
+        if not self.lfsr_mode:
+            # Set what entry was hit
+            m.d.comb += self.plru.lu_hit.eq(self.encoder.o)
+        else:
+            # enable LFSR
+            m.d.comb += self.lfsr.enable.eq(self.enable)
+
+        # ----
+        # connect hit/multiple hit to encoder output
+        # ----
+
+        m.d.comb += [
+            self.hit.eq(self.encoder.single_match),
+            self.multiple_hit.eq(self.encoder.multiple_match),
+        ]
+
+        # ----
+        # connect incoming data/tag/cset(addr) to mem_array
+        # ----
+
+        for mem in self.mem_array:
+            write_port = mem.w
+            m.d.comb += [mem.cset.eq(self.cset),
+                         mem.tag.eq(self.tag),
+                         mem.data_i.eq(self.data_i),
+                         write_port.en.eq(0),  # default: disable write
+                         ]
+        # ----
+        # Commands: READ/WRITE/TODO
+        # ----
+
+        with m.If(self.enable):
+            with m.Switch(self.command):
+                # Search all sets at a particular tag
+                with m.Case(SA_RD):
+                    self.read(m)
+                with m.Case(SA_WR):
+                    self.write(m)
+                    # Maybe catch multiple tags write here?
+                    # TODO
+                # TODO: invalidate/flush, flush-all?
+
+        return m
+
+    def ports(self):
+        return [self.enable, self.command, self.cset, self.tag, self.data_i,
+                self.ready, self.hit, self.multiple_hit, self.data_o]
+
+
+if __name__ == '__main__':
+    sac = SetAssociativeCache(4, 8, 4, 6)
+    vl = rtlil.convert(sac, ports=sac.ports())
+    with open("SetAssociativeCache.il", "w") as f:
+        f.write(vl)
+
+    sac_lfsr = SetAssociativeCache(4, 8, 4, 6, True)
+    vl = rtlil.convert(sac_lfsr, ports=sac_lfsr.ports())
+    with open("SetAssociativeCacheLFSR.il", "w") as f:
+        f.write(vl)
diff --git a/unused_please_ignore_completely/TLB/TLB.py b/unused_please_ignore_completely/TLB/TLB.py
new file mode 100644 (file)
index 0000000..a3c0224
--- /dev/null
@@ -0,0 +1,177 @@
+""" TLB Module
+
+    The expected form of the data is:
+    * Item (Bits)
+    * Tag (N - 79) / ASID (78 - 64) / PTE (63 - 0)
+"""
+
+from nmigen import Memory, Module, Signal, Cat, Elaboratable
+from nmigen.cli import main
+
+from .PermissionValidator import PermissionValidator
+from .Cam import Cam
+
+
+class TLB(Elaboratable):
+    def __init__(self, asid_size, vma_size, pte_size, L1_size):
+        """ Arguments
+            * asid_size: Address Space IDentifier (ASID) typically 15 bits
+            * vma_size: Virtual Memory Address (VMA) typically 36 bits
+            * pte_size: Page Table Entry (PTE) typically 64 bits
+
+            Notes:
+            These arguments should represent the largest possible size
+            defined by the MODE settings. See
+            Volume II: RISC-V Privileged Architectures V1.10 Page 57
+        """
+
+        # Internal
+        self.state = 0
+        # L1 Cache Modules
+        self.cam_L1 = Cam(vma_size, L1_size)
+        self.mem_L1 = Memory(width=asid_size + pte_size, depth=L1_size)
+
+        # Permission Validator
+        self.perm_validator = PermissionValidator(asid_size, pte_size)
+
+        # Inputs
+        self.supermode = Signal(1)  # Supervisor Mode
+        self.super_access = Signal(1)  # Supervisor Access
+        # 00=None, 01=Search, 10=Write L1, 11=Write L2
+        self.command = Signal(2)
+        self.xwr = Signal(3)  # Execute, Write, Read
+        self.mode = Signal(4)  # 4 bits for access to Sv48 on Rv64
+        self.address_L1 = Signal(range(L1_size))
+        self.asid = Signal(asid_size)  # Address Space IDentifier (ASID)
+        self.vma = Signal(vma_size)  # Virtual Memory Address (VMA)
+        self.pte_in = Signal(pte_size)  # To be saved Page Table Entry (PTE)
+
+        # Outputs
+        self.hit = Signal(1)  # Denotes if the VMA had a mapped PTE
+        self.perm_valid = Signal(1)  # Denotes if the permissions are correct
+        self.pte_out = Signal(pte_size)  # PTE that was mapped to by the VMA
+
+    def search(self, m, read_L1, write_L1):
+        """ searches the TLB
+        """
+        m.d.comb += [
+            write_L1.en.eq(0),
+            self.cam_L1.write_enable.eq(0),
+            self.cam_L1.data_in.eq(self.vma)
+        ]
+        # Match found in L1 CAM
+        match_found = Signal(reset_less=True)
+        m.d.comb += match_found.eq(self.cam_L1.single_match
+                                   | self.cam_L1.multiple_match)
+        with m.If(match_found):
+            # Memory shortcut variables
+            mem_address = self.cam_L1.match_address
+            # Memory Logic
+            m.d.comb += read_L1.addr.eq(mem_address)
+            # Permission Validator Logic
+            m.d.comb += [
+                self.hit.eq(1),
+                # Set permission validator data to the correct
+                # register file data according to CAM match
+                # address
+                self.perm_validator.data.eq(read_L1.data),
+                # Execute, Read, Write
+                self.perm_validator.xwr.eq(self.xwr),
+                # Supervisor Mode
+                self.perm_validator.super_mode.eq(self.supermode),
+                # Supverisor Access
+                self.perm_validator.super_access.eq(self.super_access),
+                # Address Space IDentifier (ASID)
+                self.perm_validator.asid.eq(self.asid),
+                # Output result of permission validation
+                self.perm_valid.eq(self.perm_validator.valid)
+            ]
+            # Only output PTE if permissions are valid
+            with m.If(self.perm_validator.valid):
+                # XXX TODO - dummy for now
+                reg_data = Signal.like(self.pte_out)
+                m.d.comb += [
+                    self.pte_out.eq(reg_data)
+                ]
+            with m.Else():
+                m.d.comb += [
+                    self.pte_out.eq(0)
+                ]
+        # Miss Logic
+        with m.Else():
+            m.d.comb += [
+                self.hit.eq(0),
+                self.perm_valid.eq(0),
+                self.pte_out.eq(0)
+            ]
+
+    def write_l1(self, m, read_L1, write_L1):
+        """ writes to the L1 cache
+        """
+        # Memory_L1 Logic
+        m.d.comb += [
+            write_L1.en.eq(1),
+            write_L1.addr.eq(self.address_L1),
+            # The Cat places arguments from LSB -> MSB
+            write_L1.data.eq(Cat(self.pte_in, self.asid))
+        ]
+        # CAM_L1 Logic
+        m.d.comb += [
+            self.cam_L1.write_enable.eq(1),
+            self.cam_L1.data_in.eq(self.vma),  # data_in is sent to all entries
+            # self.cam_L1.address_in.eq(todo) # a CAM entry needs to be selected
+
+        ]
+
+    def elaborate(self, platform):
+        m = Module()
+        # Add submodules
+        # Submodules for L1 Cache
+        m.submodules.cam_L1 = self.cam_L1
+        m.submodules.read_L1 = read_L1 = self.mem_L1.read_port()
+        m.submodules.write_L1 = write_L1 = self.mem_L1.write_port()
+
+        # Permission Validator Submodule
+        m.submodules.perm_valididator = self.perm_validator
+
+        # When MODE specifies translation
+        # TODO add in different bit length handling ie prefix 0s
+        tlb_enable = Signal(reset_less=True)
+        m.d.comb += tlb_enable.eq(self.mode != 0)
+
+        with m.If(tlb_enable):
+            m.d.comb += [
+                self.cam_L1.enable.eq(1)
+            ]
+            with m.Switch(self.command):
+                # Search
+                with m.Case("01"):
+                    self.search(m, read_L1, write_L1)
+
+                # Write L1
+                # Expected that the miss will be handled in software
+                with m.Case("10"):
+                    self.write_l1(m, read_L1, write_L1)
+
+                # TODO
+                # with m.Case("11"):
+
+        # When disabled
+        with m.Else():
+            m.d.comb += [
+                self.cam_L1.enable.eq(0),
+                # XXX TODO - self.reg_file.enable.eq(0),
+                self.hit.eq(0),
+                self.perm_valid.eq(0),  # XXX TODO, check this
+                self.pte_out.eq(0)
+            ]
+        return m
+
+
+if __name__ == '__main__':
+    tlb = TLB(15, 36, 64, 4)
+    main(tlb, ports=[tlb.supermode, tlb.super_access, tlb.command,
+                     tlb.xwr, tlb.mode, tlb.address_L1, tlb.asid,
+                     tlb.vma, tlb.pte_in,
+                     tlb.hit, tlb.perm_valid, tlb.pte_out,
+                     ] + tlb.cam_L1.ports())
diff --git a/unused_please_ignore_completely/TLB/__init__.py b/unused_please_ignore_completely/TLB/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/unused_please_ignore_completely/TLB/ariane/TreePLRU.cpp b/unused_please_ignore_completely/TLB/ariane/TreePLRU.cpp
new file mode 100644 (file)
index 0000000..2f6aeea
--- /dev/null
@@ -0,0 +1,211 @@
+#include <cstdint>
+#include <iostream>
+#include <cmath>
+
+
+#define NWAY 4
+#define NLINE 256
+#define HIT 0
+#define MISS 1
+#define MS 1000
+/*
+Detailed TreePLRU inference see here: https://docs.google.com/spreadsheets/d/14zQpPYPwDAbCCjBT_a3KLaE5FEk-RNhI8Z7Qm_biW8g/edit?usp=sharing
+Ref: https://people.cs.clemson.edu/~mark/464/p_lru.txt
+four-way set associative - three bits
+   each bit represents one branch point in a binary decision tree; let 1
+   represent that the left side has been referenced more recently than the
+   right side, and 0 vice-versa
+              are all 4 lines valid?
+                   /       \
+                 yes        no, use an invalid line
+                  |
+                  |
+                  |
+             bit_0 == 0?            state | replace      ref to | next state
+              /       \             ------+--------      -------+-----------
+             y         n             00x  |  line_0      line_0 |    11_
+            /           \            01x  |  line_1      line_1 |    10_
+     bit_1 == 0?    bit_2 == 0?      1x0  |  line_2      line_2 |    0_1
+       /    \          /    \        1x1  |  line_3      line_3 |    0_0
+      y      n        y      n
+     /        \      /        \        ('x' means       ('_' means unchanged)
+   line_0  line_1  line_2  line_3      don't care)
+ 8-way set associative - 7  = 1+2+4 bits
+16-way set associative - 15 = 1+2+4+8 bits
+32-way set associative - 31 = 1+2+4+8+16 bits
+64-way set associative - 63 = 1+2+4+8+16+32 bits
+*/
+using namespace std;
+struct AddressField {
+    uint64_t wd_idx : 2;//Unused
+    uint64_t offset : 4;//Unused
+    uint64_t index  : 8;//NLINE = 256 = 2^8
+    uint64_t tag    : 50;
+};
+
+union Address {
+    uint32_t* p;
+    AddressField fields;
+};
+
+struct Cell {
+    bool v;
+    uint64_t tag;
+
+    Cell() : v(false), tag(0) {}
+
+    bool isHit(uint64_t tag) {
+        return v && (tag == this->tag);
+    }
+
+    void fetch(uint32_t* address) {
+        Address addr;
+        addr.p = address;
+        addr.fields.offset = 0;
+        addr.fields.wd_idx = 0;
+        tag = addr.fields.tag;
+        v = true;
+    }
+};
+
+ostream& operator<<(ostream & out, const Cell& cell) {
+    out << " v:" << cell.v << " tag:" << hex << cell.tag;
+    return out;
+}
+
+struct Block {
+    Cell cell[NWAY];
+    uint32_t state;
+    uint64_t *mask;//Mask the state to get accurate value for specified 1 bit.
+    uint64_t *value;
+    uint64_t *next_value;
+
+    Block() : state(0) {
+        switch (NWAY) {
+            case 4:
+                mask = new uint64_t[4]{0b110, 0b110, 0b101, 0b101};
+                value = new uint64_t[4]{0b000, 0b010, 0b100, 0b101};
+                next_value = new uint64_t[4]{0b110, 0b100, 0b001, 0b000};
+                break;
+            case 8:
+                mask = new uint64_t[8]{0b1101000, 0b1101000, 0b1100100, 0b1100100, 0b1010010, 0b1010010, 0b1010001,
+                                       0b1010001};
+                value = new uint64_t[8]{0b0000000, 0b0001000, 0b0100000, 0b0100100, 0b1000000, 0b1000010, 0b1010000,
+                                        0b1010001};
+                next_value = new uint64_t[8]{0b1101000, 0b1100000, 0b1000100, 0b1000000, 0b0010010, 0b0010000,
+                                             0b0000001, 0b0000000};
+                break;
+                //TODO - more NWAY goes here.
+            default:
+                std::cout << "Error definition NWAY = " << NWAY << std::endl;
+        }
+    }
+
+    uint32_t *getByTag(uint64_t tag, uint32_t *pway) {
+        for (int i = 0; i < NWAY; ++i) {
+            if (cell[i].isHit(tag)) {
+                *pway = i;
+                return pway;
+            }
+        }
+        return NULL;
+    }
+
+    void setLRU(uint32_t *address) {
+        int way = 0;
+        uint32_t st = state;
+        for (int i = 0; i < NWAY; ++i) {
+            if ((state & mask[i]) == value[i]) {
+                state ^= mask[i];
+                way = i;
+                break;
+            }
+        }
+        cell[way].fetch(address);
+        cout << "MISS: way:" << way << " address:" << address << " state:" << st << "->" << state << endl;
+    }
+
+    uint32_t *get(uint32_t *address, uint32_t *pway) {
+        Address addr;
+        addr.p = address;
+        uint32_t *d = getByTag(addr.fields.tag, pway);
+        if (d != NULL) {
+            return &d[addr.fields.offset];
+        }
+        return d;
+    }
+
+    int set(uint32_t *address) {
+        uint32_t way = 0;
+        uint32_t *p = get(address, &way);
+        if (p != NULL) {
+            printf("HIT: address:%p ref_to way:%d state %X --> ", address, way, state);
+            state &= ~mask[way];
+            printf("%X --> ", state);
+            state |= next_value[way];
+            printf("%X\n", state);
+            // *p = *address; //skip since address is fake.
+            return HIT;
+        } else {
+            setLRU(address);
+            return MISS;
+        }
+    }
+};
+
+ostream& operator<<(ostream & out, const Block& block) {
+    out << "state:" << block.state << " ";
+    for (int i = 0; i<NWAY; i++) {
+        out << block.cell[i];
+    }
+    return out;
+}
+
+struct Cache {
+    Block block[NLINE];
+    uint32_t count[2];
+    Cache() { count[HIT] = 0; count[MISS] = 0; }
+
+    void access(uint32_t* address) {
+        Address addr;
+        addr.p = address;
+        Block& b = block[addr.fields.index];
+        ++count[b.set(address)];
+    }
+
+};
+ostream& operator<<(ostream & out, const Cache& cache) {
+    out << "\n==Summary==\n\tHit: " << cache.count[HIT] <<  " Miss: " << cache.count[MISS] << std::endl;
+    for (int i = 0; i < NLINE; i++) {
+        out << cache.block[i] << endl;
+    }
+    return out;
+}
+
+Cache cache;
+void multiply(uint32_t* m1, uint32_t* m2, uint32_t* res)
+{
+    int x, i, j;
+    for (i = 0; i < MS; i++) {
+        for (j = 0; j < MS; j++) {
+            cache.access(res + i*MS +j);
+            for (x = 0; x < MS; x++) {
+                cache.access(m1 + i*MS + x);
+                cache.access(m2 + x*MS + j);
+                cache.access(res + i*MS +j);
+                // res[i][j] += m1[i][x] * m2[x][j];
+                cache.access(res + i*MS +j);
+            }
+        }
+    }
+}
+
+int main()
+{
+    uint32_t* m1 = (uint32_t*) 0xFACE00A000000000LL;  // fake virtual address; don’t access it
+    uint32_t* m2 = (uint32_t*) 0xFACE00B000000000LL;  // fake virtual address; don’t access it
+    uint32_t* res =  (uint32_t*) 0xFACE00C000000000LL; // fake virtual address; don’t access it
+    multiply(m1, m2, res);
+    cout << cache << endl;
+    return 0;
+}
diff --git a/unused_please_ignore_completely/TLB/ariane/__init__.py b/unused_please_ignore_completely/TLB/ariane/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/unused_please_ignore_completely/TLB/ariane/exceptcause.py b/unused_please_ignore_completely/TLB/ariane/exceptcause.py
new file mode 100644 (file)
index 0000000..4c5cb2d
--- /dev/null
@@ -0,0 +1,16 @@
+from nmigen import Const
+
+INSTR_ADDR_MISALIGNED = Const(0, 64)
+INSTR_ACCESS_FAULT    = Const(1, 64)
+ILLEGAL_INSTR         = Const(2, 64)
+BREAKPOINT            = Const(3, 64)
+LD_ADDR_MISALIGNED    = Const(4, 64)
+LD_ACCESS_FAULT       = Const(5, 64)
+ST_ADDR_MISALIGNED    = Const(6, 64)
+ST_ACCESS_FAULT       = Const(7, 64)
+ENV_CALL_UMODE        = Const(8, 64)  # environment call from user mode
+ENV_CALL_SMODE        = Const(9, 64)  # environment call from supervisor mode
+ENV_CALL_MMODE        = Const(11, 64) # environment call from machine mode
+INSTR_PAGE_FAULT      = Const(12, 64) # Instruction page fault
+LOAD_PAGE_FAULT       = Const(13, 64) # Load page fault
+STORE_PAGE_FAULT      = Const(15, 64) # Store page fault
diff --git a/unused_please_ignore_completely/TLB/ariane/miss_handler.py b/unused_please_ignore_completely/TLB/ariane/miss_handler.py
new file mode 100644 (file)
index 0000000..5ddc725
--- /dev/null
@@ -0,0 +1,786 @@
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: Florian Zaruba, ETH Zurich
+# Date: 12.11.2017
+# Description: Handles cache misses.
+from nmigen.lib.coding import Encoder, PriorityEncoder
+
+
+# --------------
+# MISS Handler
+# --------------
+import ariane_pkg::*;
+import std_cache_pkg::*;
+
+unsigned NR_PORTS         = 3
+
+class MissReq(RecordObject):
+    def __init__(self, name=None):
+        Record.__init__(self, name)
+        self.valid = Signal()
+        self.addr = Signal(64)
+        self.be = Signal(8)
+        self.size = Signal(2)
+        self.we = Signal()
+        self.wdata = Signal(64)
+        bypass = Signal()
+
+class CacheLine:
+    def __init__(self):
+        self.tag = Signal(DCACHE_TAG_WIDTH) # tag array
+        self.data = Signal(DCACHE_LINE_WIDTH) # data array
+        self.valid = Signal() # state array
+        self.dirty = Signal() # state array
+
+# cache line byte enable
+class CLBE:
+    def __init__(self):
+        self.tag = Signal(DCACHE_TAG_WIDTH+7)//8) # byte enable into tag array
+        self.data = Signal(DCACHE_LINE_WIDTH+7)//8) # byte enable data array
+        # bit enable into state array (valid for a pair of dirty/valid bits)
+        self.vldrty = Signal(DCACHE_SET_ASSOC)
+    } cl_be_t;
+
+
+
+    # FSM states
+"""
+    enum logic [3:0] {
+        IDLE,               # 0
+        FLUSHING,           # 1
+        FLUSH,              # 2
+        WB_CACHELINE_FLUSH, # 3
+        FLUSH_REQ_STATUS,   # 4
+        WB_CACHELINE_MISS,  # 5
+        WAIT_GNT_SRAM,      # 6
+        MISS,               # 7
+        REQ_CACHELINE,      # 8
+        MISS_REPL,          # 9
+        SAVE_CACHELINE,     # A
+        INIT,               # B
+        AMO_LOAD,           # C
+        AMO_SAVE_LOAD,      # D
+        AMO_STORE           # E
+    } state_d, state_q;
+"""
+
+class MissHandler(Elaboratable):
+    def __init__(self, NR_PORTS):
+        self.NR_PORTS = NR_PORTS
+        self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
+        self.flush_i = Signal()      # flush request
+        self.flush_ack_o = Signal()  # acknowledge successful flush
+        self.miss_o = Signal()
+        self.busy_i = Signal()       # dcache is busy with something
+
+        # Bypass or miss
+        self.miss_req_i = Array(MissReq(name="missreq") for i in range(NR_PORTS))
+        # Bypass handling
+        self.bypass_gnt_o = Signal(NR_PORTS)
+        self.bypass_valid_o = Signal(NR_PORTS)
+        self.bypass_data_o = Array(Signal(name="bdata_o", 64) \
+                                    for i in range(NR_PORTS))
+
+        # AXI port
+        output ariane_axi::req_t                            axi_bypass_o,
+        input  ariane_axi::resp_t                           axi_bypass_i,
+
+        # Miss handling (~> cacheline refill)
+        self.miss_gnt_o = Signal(NR_PORTS)
+        self.active_serving_o = Signal(NR_PORTS)
+
+        self.critical_word_o = Signal(64)
+        self.critical_word_valid_o = Signal()
+        output ariane_axi::req_t                            axi_data_o,
+        input  ariane_axi::resp_t                           axi_data_i,
+
+        self.mshr_addr_i = Array(Signal(name="bdata_o", 56) \
+                                    for i in range(NR_PORTS))
+        self.mshr_addr_matches_o = Signal(NR_PORTS)
+        self.mshr_index_matches_o = Signal(NR_PORTS)
+
+        # AMO
+        self.amo_req_i = AMOReq()
+        self.amo_resp_o = AMOResp()
+        # Port to SRAMs, for refill and eviction
+        self.req_o = Signal(DCACHE_SET_ASSOC)
+        self.addr_o = Signal(DCACHE_INDEX_WIDTH) # address into cache array
+        self.data_o = CacheLine()
+        self.be_o = CLBE()
+        self.data_i = Array(CacheLine() \
+                                    for i in range(DCACHE_SET_ASSOC))
+        self.we_o = Signal()
+
+    def elaborate(self, platform):
+        # Registers
+        mshr_t                                  mshr_d, mshr_q;
+        logic [DCACHE_INDEX_WIDTH-1:0]          cnt_d, cnt_q;
+        logic [DCACHE_SET_ASSOC-1:0]            evict_way_d, evict_way_q;
+        # cache line to evict
+        cache_line_t                            evict_cl_d, evict_cl_q;
+
+        logic serve_amo_d, serve_amo_q;
+        # Request from one FSM
+        miss_req_valid = Signal(self.NR_PORTS)
+        miss_req_bypass = Signal(self.NR_PORTS)
+        miss_req_addr = Array(Signal(name="miss_req_addr", 64) \
+                                    for i in range(NR_PORTS))
+        miss_req_wdata = Array(Signal(name="miss_req_wdata", 64) \
+                                    for i in range(NR_PORTS))
+        miss_req_we = Signal(self.NR_PORTS)
+        miss_req_be = Array(Signal(name="miss_req_be", 8) \
+                                    for i in range(NR_PORTS))
+        miss_req_size = Array(Signal(name="miss_req_size", 2) \
+                                    for i in range(NR_PORTS))
+
+        # Cache Line Refill <-> AXI
+        req_fsm_miss_valid = Signal()
+        req_fsm_miss_addr = Signal(64)
+        req_fsm_miss_wdata = Signal(DCACHE_LINE_WIDTH)
+        req_fsm_miss_we = Signal()
+        req_fsm_miss_be = Signal(DCACHE_LINE_WIDTH//8)
+        ariane_axi::ad_req_t                     req_fsm_miss_req;
+        req_fsm_miss_size = Signal(2)
+
+        gnt_miss_fsm = Signal()
+        valid_miss_fsm = Signal()
+        nmiss = DCACHE_LINE_WIDTH//64
+        data_miss_fsm = Array(Signal(name="data_miss_fsm", 64) \
+                                    for i in range(nmiss))
+
+        # Cache Management <-> LFSR
+        lfsr_enable = Signal()
+        lfsr_oh = Signal(DCACHE_SET_ASSOC)
+        lfsr_bin = Signal($clog2(DCACHE_SET_ASSOC-1))
+        # AMOs
+        ariane_pkg::amo_t amo_op;
+        amo_operand_a = Signal(64)
+        amo_operand_b = Signal(64)
+        amo_result_o = Signal(64)
+
+        struct packed {
+            logic [63:3] address;
+            logic        valid;
+        } reservation_d, reservation_q;
+
+        # ------------------------------
+        # Cache Management
+        # ------------------------------
+        evict_way = Signal(DCACHE_SET_ASSOC)
+        valid_way = Signal(DCACHE_SET_ASSOC)
+
+        for (i in range(DCACHE_SET_ASSOC):
+            comb += evict_way[i].eq(data_i[i].valid & data_i[i].dirty)
+            comb += valid_way[i].eq(data_i[i].valid)
+
+        # ----------------------
+        # Default Assignments
+        # ----------------------
+        # to AXI refill
+        req_fsm_miss_req    = ariane_axi::CACHE_LINE_REQ;
+        req_fsm_miss_size   = Const(0b11, 2)
+        # core
+        serve_amo_d         = serve_amo_q;
+        # --------------------------------
+        # Flush and Miss operation
+        # --------------------------------
+        state_d      = state_q;
+        cnt_d        = cnt_q;
+        evict_way_d  = evict_way_q;
+        evict_cl_d   = evict_cl_q;
+        mshr_d       = mshr_q;
+        # communicate to the requester which unit we are currently serving
+        active_serving_o[mshr_q.id] = mshr_q.valid;
+        # AMOs
+        # silence the unit when not used
+        amo_op = amo_req_i.amo_op;
+
+        reservation_d = reservation_q;
+        with m.FSM() as state_q:
+
+            with m.Case("IDLE"):
+                # lowest priority are AMOs, wait until everything else
+                # is served before going for the AMOs
+                with m.If (amo_req_i.req & ~busy_i):
+                    # 1. Flush the cache
+                    with m.If(~serve_amo_q):
+                        m.next = "FLUSH_REQ_STATUS"
+                        serve_amo_d.eq(0b1
+                        cnt_d.eq(0
+                    # 2. Do the AMO
+                    with m.Else():
+                        m.next = "AMO_LOAD"
+                        serve_amo_d.eq(0b0
+
+                # check if we want to flush and can flush
+                # e.g.: we are not busy anymore
+                # TODO: Check that the busy flag is indeed needed
+                with m.If (flush_i & ~busy_i):
+                    m.next = "FLUSH_REQ_STATUS"
+                    cnt_d = 0
+
+                # check if one of the state machines missed
+                for i in range(NR_PORTS):
+                    # here comes the refill portion of code
+                    with m.If (miss_req_valid[i] & ~miss_req_bypass[i]):
+                        m.next = "MISS"
+                        # we are taking another request so don't
+                        # take the AMO
+                        serve_amo_d  = 0b0;
+                        # save to MSHR
+                        wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
+                        comb += [ mshr_d.valid.eq(0b1),
+                                  mshr_d.we.eq(miss_req_we[i]),
+                                  mshr_d.id.eq(i),
+                                  mshr_d.addr.eq(miss_req_addr[i][0:wid]),
+                                  mshr_d.wdata.eq(miss_req_wdata[i]),
+                                  mshr_d.be.eq(miss_req_be[i]),
+                                ]
+                        break
+
+            #  ~> we missed on the cache
+            with m.Case("MISS"):
+                # 1. Check if there is an empty cache-line
+                # 2. If not -> evict one
+                comb += req_o.eq(1)
+                sync += addr_o.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH]
+                m.next = "MISS_REPL"
+                comb += miss_o.eq(1)
+
+            # ~> second miss cycle
+            with m.Case("MISS_REPL"):
+                # if all are valid we need to evict one, 
+                # pseudo random from LFSR
+                with m.If(~(~valid_way).bool()):
+                    comb += lfsr_enable.eq(0b1)
+                    comb += evict_way_d.eq(lfsr_oh)
+                    # do we need to write back the cache line?
+                    with m.If(data_i[lfsr_bin].dirty):
+                        state_d = WB_CACHELINE_MISS;
+                        comb += evict_cl_d.tag.eq(data_i[lfsr_bin].tag)
+                        comb += evict_cl_d.data.eq(data_i[lfsr_bin].data)
+                        comb += cnt_d.eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
+                    # no - we can request a cache line now
+                    with m.Else():
+                        m.next = "REQ_CACHELINE"
+                # we have at least one free way
+                with m.Else():
+                    # get victim cache-line by looking for the
+                    # first non-valid bit
+                    comb += evict_way_d.eq(get_victim_cl(~valid_way)
+                    m.next = "REQ_CACHELINE"
+
+            # ~> we can just load the cache-line,
+            # the way is store in evict_way_q
+            with m.Case("REQ_CACHELINE"):
+                comb += req_fsm_miss_valid .eq(1)
+                sync += req_fsm_miss_addr  .eq(mshr_q.addr)
+
+                with m.If (gnt_miss_fsm):
+                    m.next = "SAVE_CACHELINE"
+                    comb += miss_gnt_o[mshr_q.id].eq(1)
+
+            # ~> replace the cacheline
+            with m.Case("SAVE_CACHELINE"):
+                # calculate cacheline offset
+                automatic logic [$clog2(DCACHE_LINE_WIDTH)-1:0] cl_offset;
+                sync += cl_offset.eq(mshr_q.addr[3:DCACHE_BYTE_OFFSET] << 6)
+                # we've got a valid response from refill unit
+                with m.If (valid_miss_fsm):
+                    wid = DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH
+                    sync += addr_o      .eq(mshr_q.addr[:DCACHE_INDEX_WIDTH])
+                    sync += req_o       .eq(evict_way_q)
+                    comb += we_o        .eq(1)
+                    comb += be_o        .eq(1)
+                    sync += be_o.vldrty .eq(evict_way_q)
+                    sync += data_o.tag  .eq(mshr_q.addr[DCACHE_INDEX_WIDTH:wid]
+                    comb += data_o.data .eq(data_miss_fsm)
+                    comb += data_o.valid.eq(1)
+                    comb += data_o.dirty.eq(0)
+
+                    # is this a write?
+                    with m.If (mshr_q.we):
+                        # Yes, so safe the updated data now
+                        for i in range(8):
+                            # check if we really want to write
+                            # the corresponding byte
+                            with m.If (mshr_q.be[i]):
+                                sync += data_o.data[(cl_offset + i*8) +: 8].eq(mshr_q.wdata[i];
+                        # it's immediately dirty if we write
+                        comb += data_o.dirty.eq(1)
+
+                    # reset MSHR
+                    comb += mshr_d.valid.eq(0)
+                    # go back to idle
+                    m.next = 'IDLE'
+
+            # ------------------------------
+            # Write Back Operation
+            # ------------------------------
+            # ~> evict a cache line from way saved in evict_way_q
+            with m.Case("WB_CACHELINE_FLUSH"):
+            with m.Case("WB_CACHELINE_MISS"):
+
+                comb += req_fsm_miss_valid .eq(0b1)
+                sync += req_fsm_miss_addr  .eq({evict_cl_q.tag, cnt_q[DCACHE_INDEX_WIDTH-1:DCACHE_BYTE_OFFSET], {{DCACHE_BYTE_OFFSET}{0b0}}};
+                comb += req_fsm_miss_be    .eq(1)
+                comb += req_fsm_miss_we    .eq(0b1)
+                sync += req_fsm_miss_wdata .eq(evict_cl_q.data;
+
+                # we've got a grant --> this is timing critical, think about it
+                if (gnt_miss_fsm) begin
+                    # write status array
+                    sync += addr_o    .eq(cnt_q)
+                    comb += req_o     .eq(0b1)
+                    comb += we_o      .eq(0b1)
+                    comb += data_o.valid.eq(INVALIDATE_ON_FLUSH ? 0b0 : 0b1)
+                    # invalidate
+                    sync += be_o.vldrty.eq(evict_way_q)
+                    # go back to handling the miss or flushing,
+                    # depending on where we came from
+                    with m.If(state_q == WB_CACHELINE_MISS):
+                        m.next = "MISS"
+                    with m.Else():
+                        m.next = "FLUSH_REQ_STATUS"
+
+            # ------------------------------
+            # Flushing & Initialization
+            # ------------------------------
+            # ~> make another request to check the same
+            # cache-line if there are still some valid entries
+            with m.Case("FLUSH_REQ_STATUS"):
+                comb += req_o  .eq(1)
+                sync += addr_o .eq(cnt_q)
+                m.next = "FLUSHING"
+
+            with m.Case("FLUSHING"):
+                # this has priority
+                # at least one of the cache lines is dirty
+                with m.If(~evict_way):
+                    # evict cache line, look for the first
+                    # cache-line which is dirty
+                    comb += evict_way_d.eq(get_victim_cl(evict_way))
+                    comb += evict_cl_d .eq(data_i[one_hot_to_bin(evict_way)])
+                    state_d     = WB_CACHELINE_FLUSH;
+                # not dirty ~> increment and continue
+                with m.Else():
+                    # increment and re-request
+                    sync += cnt_d.eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
+                    m.next = "FLUSH_REQ_STATUS"
+                    sync += addr_o     .eq(cnt_q)
+                    comb += req_o      .eq(1)
+                    comb += be_o.vldrty.eq(INVALIDATE_ON_FLUSH ? 1 : 0)
+                    comb += we_o       .eq(1)
+                    # finished with flushing operation, go back to idle
+                    with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
+                               == DCACHE_NUM_WORDS-1):
+                        # only acknowledge if the flush wasn't
+                        # triggered by an atomic
+                        sync += flush_ack_o.eq(~serve_amo_q)
+                        m.next = "IDLE"
+
+            # ~> only called after reset
+            with m.Case("INIT"):
+                # initialize status array
+                sync += addr_o.eq(cnt_q)
+                comb += req_o .eq(1)
+                comb += we_o  .eq(1)
+                # only write the dirty array
+                comb += be_o.vldrty.eq(1)
+                sync += cnt_d      .eq(cnt_q + (1 << DCACHE_BYTE_OFFSET))
+                # finished initialization
+                with m.If (cnt_q[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] \
+                            == DCACHE_NUM_WORDS-1)
+                    m.next = "IDLE"
+
+            # ----------------------
+            # AMOs
+            # ----------------------
+            # TODO(zarubaf) Move this closer to memory
+            # ~> we are here because we need to do the AMO,
+            # the cache is clean at this point
+            # start by executing the load
+            with m.Case("AMO_LOAD"):
+                comb += req_fsm_miss_valid.eq(1)
+                # address is in operand a
+                comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
+                comb += req_fsm_miss_req.eq(ariane_axi::SINGLE_REQ)
+                comb += req_fsm_miss_size.eq(amo_req_i.size)
+                # the request has been granted
+                with m.If(gnt_miss_fsm):
+                    m.next = "AMO_SAVE_LOAD"
+            # save the load value
+            with m.Case("AMO_SAVE_LOAD"):
+                with m.If (valid_miss_fsm):
+                    # we are only concerned about the lower 64-bit
+                    comb += mshr_d.wdata.eq(data_miss_fsm[0])
+                    m.next = "AMO_STORE"
+            # and do the store
+            with m.Case("AMO_STORE"):
+                load_data = Signal(64)
+                # re-align load data
+                comb += load_data.eq(data_align(amo_req_i.operand_a[:3],
+                                                mshr_q.wdata))
+                # Sign-extend for word operation
+                with m.If (amo_req_i.size == 0b10):
+                    comb += amo_operand_a.eq(sext32(load_data[:32]))
+                    comb += amo_operand_b.eq(sext32(amo_req_i.operand_b[:32]))
+                with m.Else():
+                    comb += amo_operand_a.eq(load_data)
+                    comb += amo_operand_b.eq(amo_req_i.operand_b)
+
+                #  we do not need a store request for load reserved
+                # or a failing store conditional
+                #  we can bail-out without making any further requests
+                with m.If ((amo_req_i.amo_op == AMO_LR) | \
+                           ((amo_req_i.amo_op == AMO_SC) & \
+                           ((reservation_q.valid & \
+                            (reservation_q.address != \
+                             amo_req_i.operand_a[3:64])) | \
+                             ~reservation_q.valid))):
+                    comb += req_fsm_miss_valid.eq(0)
+                    m.next = "IDLE"
+                    comb += amo_resp_o.ack.eq(1)
+                    # write-back the result
+                    comb += amo_resp_o.result.eq(amo_operand_a)
+                    # we know that the SC failed
+                    with m.If (amo_req_i.amo_op == AMO_SC):
+                        comb += amo_resp_o.result.eq(1)
+                        # also clear the reservation
+                        comb += reservation_d.valid.eq(0)
+                with m.Else():
+                    comb += req_fsm_miss_valid.eq(1)
+
+                comb += req_fsm_miss_we  .eq(1)
+                comb += req_fsm_miss_req .eq(ariane_axi::SINGLE_REQ)
+                comb += req_fsm_miss_size.eq(amo_req_i.size)
+                comb += req_fsm_miss_addr.eq(amo_req_i.operand_a)
+
+                comb += req_fsm_miss_wdata.eq(
+                    data_align(amo_req_i.operand_a[0:3], amo_result_o))
+                comb += req_fsm_miss_be.eq(
+                    be_gen(amo_req_i.operand_a[0:3], amo_req_i.size))
+
+                # place a reservation on the memory
+                with m.If (amo_req_i.amo_op == AMO_LR):
+                    comb += reservation_d.address.eq(amo_req_i.operand_a[3:64])
+                    comb += reservation_d.valid.eq(1)
+
+                # the request is valid or we didn't need to go for another store
+                with m.If (valid_miss_fsm):
+                    m.next = "IDLE"
+                    comb += amo_resp_o.ack.eq(1)
+                    # write-back the result
+                    comb += amo_resp_o.result.eq(amo_operand_a;
+
+                    if (amo_req_i.amo_op == AMO_SC) begin
+                        comb += amo_resp_o.result.eq(0)
+                        # An SC must fail if there is another SC
+                        # (to any address) between the LR and the SC in
+                        # program order (even to the same address).
+                        # in any case destroy the reservation
+                        comb += reservation_d.valid.eq(0)
+
+        # check MSHR for aliasing
+
+        comb += mshr_addr_matches_o .eq(0)
+        comb += mshr_index_matches_o.eq()
+
+        for i in range(NR_PORTS):
+            # check mshr for potential matching of other units,
+            # exclude the unit currently being served
+            with m.If (mshr_q.valid & \
+                    (mshr_addr_i[i][DCACHE_BYTE_OFFSET:56] == \
+                     mshr_q.addr[DCACHE_BYTE_OFFSET:56])):
+                comb += mshr_addr_matches_o[i].eq(1)
+
+            # same as previous, but checking only the index
+            with m.If (mshr_q.valid & \
+                    (mshr_addr_i[i][DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH] == \
+                     mshr_q.addr[DCACHE_BYTE_OFFSET:DCACHE_INDEX_WIDTH])):
+                mshr_index_matches_o[i].eq(1)
+
+        # --------------------
+        # Sequential Process
+        # --------------------
+
+        """
+        #pragma translate_off
+        `ifndef VERILATOR
+        # assert that cache only hits on one way
+        assert property (
+          @(posedge clk_i) $onehot0(evict_way_q)) else $warning("Evict-way should be one-hot encoded");
+        `endif
+        #pragma translate_on
+        """
+
+        # ----------------------
+        # Bypass Arbiter
+        # ----------------------
+        # Connection Arbiter <-> AXI
+        req_fsm_bypass_valid = Signal()
+        req_fsm_bypass_addr = Signal(64)
+        req_fsm_bypass_wdata = Signal(64)
+        req_fsm_bypass_we = Signal()
+        req_fsm_bypass_be = Signal(8)
+        req_fsm_bypass_size = Signal(2)
+        gnt_bypass_fsm = Signal()
+        valid_bypass_fsm = Signal()
+        data_bypass_fsm = Signal(64)
+        logic [$clog2(NR_PORTS)-1:0] id_fsm_bypass;
+        logic [3:0]                  id_bypass_fsm;
+        logic [3:0]                  gnt_id_bypass_fsm;
+
+        i_bypass_arbiter = ib = AXIArbiter( NR_PORTS, 64)
+        comb += [
+            # Master Side
+            ib.data_req_i     .eq( miss_req_valid & miss_req_bypass         ),
+            ib.address_i      .eq( miss_req_addr                            ),
+            ib.data_wdata_i   .eq( miss_req_wdata                           ),
+            ib.data_we_i      .eq( miss_req_we                              ),
+            ib.data_be_i      .eq( miss_req_be                              ),
+            ib.data_size_i    .eq( miss_req_size                            ),
+            ib.data_gnt_o     .eq( bypass_gnt_o                             ),
+            ib.data_rvalid_o  .eq( bypass_valid_o                           ),
+            ib.data_rdata_o   .eq( bypass_data_o                            ),
+            # Slave Sid
+            ib.id_i           .eq( id_bypass_fsm[$clog2(NR_PORTS)-1:0]      ),
+            ib.id_o           .eq( id_fsm_bypass                            ),
+            ib.gnt_id_i       .eq( gnt_id_bypass_fsm[$clog2(NR_PORTS)-1:0]  ),
+            ib.address_o      .eq( req_fsm_bypass_addr                      ),
+            ib.data_wdata_o   .eq( req_fsm_bypass_wdata                     ),
+            ib.data_req_o     .eq( req_fsm_bypass_valid                     ),
+            ib.data_we_o      .eq( req_fsm_bypass_we                        ),
+            ib.data_be_o      .eq( req_fsm_bypass_be                        ),
+            ib.data_size_o    .eq( req_fsm_bypass_size                      ),
+            ib.data_gnt_i     .eq( gnt_bypass_fsm                           ),
+            ib.data_rvalid_i  .eq( valid_bypass_fsm                         ),
+            ib.data_rdata_i   .eq( data_bypass_fsm                          ),
+        ]
+
+        axi_adapter #(
+            .DATA_WIDTH            ( 64                 ),
+            .AXI_ID_WIDTH          ( 4                  ),
+            .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
+        ) i_bypass_axi_adapter (
+            .clk_i,
+            .rst_ni,
+            .req_i                 ( req_fsm_bypass_valid   ),
+            .type_i                ( ariane_axi::SINGLE_REQ ),
+            .gnt_o                 ( gnt_bypass_fsm         ),
+            .addr_i                ( req_fsm_bypass_addr    ),
+            .we_i                  ( req_fsm_bypass_we      ),
+            .wdata_i               ( req_fsm_bypass_wdata   ),
+            .be_i                  ( req_fsm_bypass_be      ),
+            .size_i                ( req_fsm_bypass_size    ),
+            .id_i                  ( Cat(id_fsm_bypass, 0, 0) ),
+            .valid_o               ( valid_bypass_fsm       ),
+            .rdata_o               ( data_bypass_fsm        ),
+            .gnt_id_o              ( gnt_id_bypass_fsm      ),
+            .id_o                  ( id_bypass_fsm          ),
+            .critical_word_o       (                        ), # not used for single requests
+            .critical_word_valid_o (                        ), # not used for single requests
+            .axi_req_o             ( axi_bypass_o           ),
+            .axi_resp_i            ( axi_bypass_i           )
+        );
+
+        # ----------------------
+        # Cache Line AXI Refill
+        # ----------------------
+        axi_adapter  #(
+            .DATA_WIDTH            ( DCACHE_LINE_WIDTH  ),
+            .AXI_ID_WIDTH          ( 4                  ),
+            .CACHELINE_BYTE_OFFSET ( DCACHE_BYTE_OFFSET )
+        ) i_miss_axi_adapter (
+            .clk_i,
+            .rst_ni,
+            .req_i               ( req_fsm_miss_valid ),
+            .type_i              ( req_fsm_miss_req   ),
+            .gnt_o               ( gnt_miss_fsm       ),
+            .addr_i              ( req_fsm_miss_addr  ),
+            .we_i                ( req_fsm_miss_we    ),
+            .wdata_i             ( req_fsm_miss_wdata ),
+            .be_i                ( req_fsm_miss_be    ),
+            .size_i              ( req_fsm_miss_size  ),
+            .id_i                ( Const(0b1100, 4)   ),
+            .gnt_id_o            (                    ), # open
+            .valid_o             ( valid_miss_fsm     ),
+            .rdata_o             ( data_miss_fsm      ),
+            .id_o                (                    ),
+            .critical_word_o,
+            .critical_word_valid_o,
+            .axi_req_o           ( axi_data_o         ),
+            .axi_resp_i          ( axi_data_i         )
+        );
+
+        # -----------------
+        # Replacement LFSR
+        # -----------------
+        lfsr_8bit #(.WIDTH (DCACHE_SET_ASSOC)) i_lfsr (
+            .en_i           ( lfsr_enable ),
+            .refill_way_oh  ( lfsr_oh     ),
+            .refill_way_bin ( lfsr_bin    ),
+            .*
+        );
+
+        # -----------------
+        # AMO ALU
+        # -----------------
+        amo_alu i_amo_alu (
+            .amo_op_i        ( amo_op        ),
+            .amo_operand_a_i ( amo_operand_a ),
+            .amo_operand_b_i ( amo_operand_b ),
+            .amo_result_o    ( amo_result_o  )
+        );
+
+        # -----------------
+        # Struct Split
+        # -----------------
+
+        for i in range(NR_PORTS):
+            miss_req = MissReq()
+            comb += miss_req.eq(miss_req_i[i]);
+            comb += miss_req_valid  [i] .eq(miss_req.valid)
+            comb += miss_req_bypass [i] .eq(miss_req.bypass)
+            comb += miss_req_addr   [i] .eq(miss_req.addr)
+            comb += miss_req_wdata  [i] .eq(miss_req.wdata)
+            comb += miss_req_we     [i] .eq(miss_req.we)
+            comb += miss_req_be     [i] .eq(miss_req.be)
+            comb += miss_req_size   [i] .eq(miss_req.size)
+
+    # --------------
+    # AXI Arbiter
+    # --------------s
+    #
+    # Description: Arbitrates access to AXI refill/bypass
+    #
+class AXIArbiter:
+    def __init__(self, NR_PORTS   = 3, DATA_WIDTH = 64):
+        self.NR_PORTS = NR_PORTS
+        self.DATA_WIDTH = DATA_WIDTH
+        self.pwid = pwid = ceil(log(NR_PORTS) / log(2))
+        rst_ni = ResetSignal() # Asynchronous reset active low
+        # master ports
+        self.data_req_i = Signal(NR_PORTS)
+        self.address_i = Array(Signal(name="address_i", 64) \
+                                    for i in range(NR_PORTS))
+        self.data_wdata_i = Array(Signal(name="data_wdata_i", 64) \
+                                    for i in range(NR_PORTS))
+        self.data_we_i = Signal(NR_PORTS)
+        self.data_be_i = Array(Signal(name="data_wdata_i", DATA_WIDTH/8) \
+                                    for i in range(NR_PORTS))
+        self.data_size_i = Array(Signal(name="data_size_i", 2) \
+                                    for i in range(NR_PORTS))
+        self.data_gnt_o = Signal(NR_PORTS)
+        self.data_rvalid_o = Signal(NR_PORTS)
+        self.data_rdata_o = Array(Signal(name="data_rdata_o", 64) \
+                                    for i in range(NR_PORTS))
+
+        # slave port
+        self.id_i = Signal(pwid)
+        self.id_o = Signal(pwid)
+        self.gnt_id_i = Signal(pwid)
+        self.data_req_o = Signal()
+        self.address_o = Signal(64)
+        self.data_wdata_o = Signal(DATA_WIDTH)
+        self.data_we_o = Signal()
+        self.data_be_o = Signal(DATA_WIDTH/8)
+        self.data_size_o = Signal(2)
+        self.data_gnt_i = Signal()
+        self.data_rvalid_i = Signal()
+        self.data_rdata_i = Signal(DATA_WIDTH)
+
+    def elaborate(self, platform):
+        #enum logic [1:0] { IDLE, REQ, SERVING } state_d, state_q;
+
+        class Packet:
+            def __init__(self, pwid, DATA_WIDTH):
+                self.id = Signal(pwid)
+                self.address = Signal(64)
+                self.data = Signal(64)
+                self.size = Signal(2)
+                self.be = Signal(DATA_WIDTH/8)
+                self.we = Signal()
+
+        request_index = Signal(self.pwid)
+        req_q = Packet(self.pwid, self.DATA_WIDTH)
+        req_d = Packet(self.pwid, self.DATA_WIDTH)
+
+        # request register
+        sync += req_q.eq(req_d)
+
+        # request port
+        comb += self.address_o             .eq(req_q.address)
+        comb += self.data_wdata_o          .eq(req_q.data)
+        comb += self.data_be_o             .eq(req_q.be)
+        comb += self.data_size_o           .eq(req_q.size)
+        comb += self.data_we_o             .eq(req_q.we)
+        comb += self.id_o                  .eq(req_q.id)
+        comb += self.data_gnt_o            .eq(0)
+        # read port
+        comb += self.data_rvalid_o         .eq(0)
+        comb += self.data_rdata_o          .eq(0)
+        comb += self.data_rdata_o[req_q.id].eq(data_rdata_i)
+
+        m.submodules.pp = pp = PriorityEncoder(self.NR_PORTS)
+        comb += pp.i.eq(self.data_req_i) # select one request (priority-based)
+        comb += request_index.eq(pp.o)
+
+        with m.Switch("state") as s:
+
+            with m.Case("IDLE"):
+                # wait for incoming requests (priority encoder data_req_i)
+                with m.If(~pp.n): # one output valid from encoder
+                    comb += self.data_req_o   .eq(self.data_req_i[i])
+                    comb += self.data_gnt_o[i].eq(self.data_req_i[i])
+                    # save the request
+                    comb += req_d.address.eq(self.address_i[i])
+                    comb += req_d.id.eq(request_index)
+                    comb += req_d.data.eq(self.data_wdata_i[i])
+                    comb += req_d.size.eq(self.data_size_i[i])
+                    comb += req_d.be.eq(self.data_be_i[i])
+                    comb += req_d.we.eq(self.data_we_i[i])
+                    m.next = "SERVING"
+
+                comb += self.address_o    .eq(self.address_i[request_index])
+                comb += self.data_wdata_o .eq(self.data_wdata_i[request_index])
+                comb += self.data_be_o    .eq(self.data_be_i[request_index])
+                comb += self.data_size_o  .eq(self.data_size_i[request_index])
+                comb += self.data_we_o    .eq(self.data_we_i[request_index])
+                comb += self.id_o         .eq(request_index)
+
+            with m.Case("SERVING"):
+                comb += self.data_req_o.eq(1)
+                with m.If (self.data_rvalid_i):
+                    comb += self.data_rvalid_o[req_q.id].eq(1)
+                    m.next = "IDLE"
+
+        # ------------
+        # Assertions
+        # ------------
+
+        """
+#pragma translate_off
+`ifndef VERILATOR
+# make sure that we eventually get an rvalid after we received a grant
+assert property (@(posedge clk_i) data_gnt_i |-> ##[1:$] data_rvalid_i )
+    else begin $error("There was a grant without a rvalid"); $stop(); end
+# assert that there is no grant without a request
+assert property (@(negedge clk_i) data_gnt_i |-> data_req_o)
+    else begin $error("There was a grant without a request."); $stop(); end
+# assert that the address does not contain X when request is sent
+assert property ( @(posedge clk_i) (data_req_o) |-> (!$isunknown(address_o)) )
+  else begin $error("address contains X when request is set"); $stop(); end
+
+`endif
+#pragma translate_on
+        """
+
diff --git a/unused_please_ignore_completely/TLB/ariane/mmu.py b/unused_please_ignore_completely/TLB/ariane/mmu.py
new file mode 100644 (file)
index 0000000..a14862c
--- /dev/null
@@ -0,0 +1,474 @@
+"""
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: Florian Zaruba, ETH Zurich
+# Date: 19/04/2017
+# Description: Memory Management Unit for Ariane, contains TLB and
+#              address translation unit. SV48 as defined in
+#              Volume II: RISC-V Privileged Architectures V1.10 Page 63
+
+import ariane_pkg::*;
+"""
+
+from nmigen import Const, Signal, Cat, Module, Mux
+from nmigen.cli import verilog, rtlil
+
+from ptw import DCacheReqI, DCacheReqO, TLBUpdate, PTE, PTW
+from tlb import TLB
+from exceptcause import (INSTR_ACCESS_FAULT, INSTR_PAGE_FAULT,
+                         LOAD_PAGE_FAULT, STORE_PAGE_FAULT)
+
+PRIV_LVL_M = Const(0b11, 2)
+PRIV_LVL_S = Const(0b01, 2)
+PRIV_LVL_U = Const(0b00, 2)
+
+
+class RVException:
+    def __init__(self):
+         self.cause = Signal(64) # cause of exception
+         self.tval = Signal(64) # more info of causing exception
+                                # (e.g.: instruction causing it),
+                                #        address of LD/ST fault
+         self.valid = Signal()
+
+    def eq(self, inp):
+        res = []
+        for (o, i) in zip(self.ports(), inp.ports()):
+            res.append(o.eq(i))
+        return res
+
+    def __iter__(self):
+        yield self.cause
+        yield self.tval
+        yield self.valid
+
+    def ports(self):
+        return list(self)
+
+
+class ICacheReqI:
+    def __init__(self):
+        self.fetch_valid = Signal()   # address translation valid
+        self.fetch_paddr = Signal(64) # physical address in
+        self.fetch_exception = RVException() # exception occurred during fetch
+
+    def __iter__(self):
+        yield self.fetch_valid
+        yield self.fetch_paddr
+        yield from self.fetch_exception
+
+    def ports(self):
+        return list(self)
+
+
+class ICacheReqO:
+    def __init__(self):
+        self.fetch_req = Signal()     # address translation request
+        self.fetch_vaddr = Signal(64) # virtual address out
+
+    def __iter__(self):
+        yield self.fetch_req
+        yield self.fetch_vaddr
+
+    def ports(self):
+        return list(self)
+
+
+class MMU:
+    def __init__(self, instr_tlb_entries = 4,
+                       data_tlb_entries  = 4,
+                       asid_width        = 1):
+        self.instr_tlb_entries = instr_tlb_entries
+        self.data_tlb_entries = data_tlb_entries
+        self.asid_width = asid_width
+
+        self.flush_i = Signal()
+        self.enable_translation_i = Signal()
+        self.en_ld_st_translation_i = Signal() # enable VM translation for LD/ST
+        # IF interface
+        self.icache_areq_i = ICacheReqO()
+        self.icache_areq_o = ICacheReqI()
+        # LSU interface
+        # this is a more minimalistic interface because the actual addressing
+        # logic is handled in the LSU as we distinguish load and stores,
+        # what we do here is simple address translation
+        self.misaligned_ex_i = RVException()
+        self.lsu_req_i = Signal()   # request address translation
+        self.lsu_vaddr_i = Signal(64) # virtual address in
+        self.lsu_is_store_i = Signal() # the translation is requested by a store
+        # if we need to walk the page table we can't grant in the same cycle
+
+        # Cycle 0
+        self.lsu_dtlb_hit_o = Signal() # sent in the same cycle as the request
+                                       # if translation hits in the DTLB
+        # Cycle 1
+        self.lsu_valid_o = Signal()  # translation is valid
+        self.lsu_paddr_o = Signal(64) # translated address
+        self.lsu_exception_o = RVException() # addr translate threw exception
+
+        # General control signals
+        self.priv_lvl_i = Signal(2)
+        self.ld_st_priv_lvl_i = Signal(2)
+        self.sum_i = Signal()
+        self.mxr_i = Signal()
+        # input logic flag_mprv_i,
+        self.satp_ppn_i = Signal(44)
+        self.asid_i = Signal(self.asid_width)
+        self.flush_tlb_i = Signal()
+        # Performance counters
+        self.itlb_miss_o = Signal()
+        self.dtlb_miss_o = Signal()
+        # PTW memory interface
+        self.req_port_i = DCacheReqO()
+        self.req_port_o = DCacheReqI()
+
+    def elaborate(self, platform):
+        m = Module()
+
+        iaccess_err = Signal()   # insufficient priv to access instr page
+        daccess_err = Signal()   # insufficient priv to access data page
+        ptw_active = Signal()    # PTW is currently walking a page table
+        walking_instr = Signal() # PTW is walking because of an ITLB miss
+        ptw_error = Signal()     # PTW threw an exception
+
+        update_vaddr = Signal(48)                                # guessed
+        uaddr64 = Cat(update_vaddr, Const(0, 25)) # extend to 64bit with zeros
+        update_ptw_itlb = TLBUpdate(self.asid_width)
+        update_ptw_dtlb = TLBUpdate(self.asid_width)
+
+        itlb_lu_access = Signal()
+        itlb_content = PTE()
+        itlb_is_2M = Signal()
+        itlb_is_1G = Signal()
+        itlb_is_512G = Signal()
+        itlb_lu_hit = Signal()
+
+        dtlb_lu_access = Signal()
+        dtlb_content = PTE()
+        dtlb_is_2M = Signal()
+        dtlb_is_1G = Signal()
+        dtlb_is_512G = Signal()
+        dtlb_lu_hit = Signal()
+
+        # Assignments
+        m.d.comb += [itlb_lu_access.eq(self.icache_areq_i.fetch_req),
+                     dtlb_lu_access.eq(self.lsu_req_i)
+                    ]
+
+        # ITLB
+        m.submodules.i_tlb = i_tlb = TLB(self.instr_tlb_entries,
+                                         self.asid_width)
+        m.d.comb += [i_tlb.flush_i.eq(self.flush_tlb_i),
+                     i_tlb.update_i.eq(update_ptw_itlb),
+                     i_tlb.lu_access_i.eq(itlb_lu_access),
+                     i_tlb.lu_asid_i.eq(self.asid_i),
+                     i_tlb.lu_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
+                     itlb_content.eq(i_tlb.lu_content_o),
+                     itlb_is_2M.eq(i_tlb.lu_is_2M_o),
+                     itlb_is_1G.eq(i_tlb.lu_is_1G_o),
+                     itlb_is_512G.eq(i_tlb.lu_is_512G_o),
+                     itlb_lu_hit.eq(i_tlb.lu_hit_o),
+                    ]
+
+        # DTLB
+        m.submodules.d_tlb = d_tlb = TLB(self.data_tlb_entries,
+                                         self.asid_width)
+        m.d.comb += [d_tlb.flush_i.eq(self.flush_tlb_i),
+                     d_tlb.update_i.eq(update_ptw_dtlb),
+                     d_tlb.lu_access_i.eq(dtlb_lu_access),
+                     d_tlb.lu_asid_i.eq(self.asid_i),
+                     d_tlb.lu_vaddr_i.eq(self.lsu_vaddr_i),
+                     dtlb_content.eq(d_tlb.lu_content_o),
+                     dtlb_is_2M.eq(d_tlb.lu_is_2M_o),
+                     dtlb_is_1G.eq(d_tlb.lu_is_1G_o),
+                     dtlb_is_512G.eq(d_tlb.lu_is_512G_o),
+                     dtlb_lu_hit.eq(d_tlb.lu_hit_o),
+                    ]
+
+        # PTW
+        m.submodules.ptw = ptw = PTW(self.asid_width)
+        m.d.comb += [ptw_active.eq(ptw.ptw_active_o),
+                     walking_instr.eq(ptw.walking_instr_o),
+                     ptw_error.eq(ptw.ptw_error_o),
+                     ptw.enable_translation_i.eq(self.enable_translation_i),
+
+                     update_vaddr.eq(ptw.update_vaddr_o),
+                     update_ptw_itlb.eq(ptw.itlb_update_o),
+                     update_ptw_dtlb.eq(ptw.dtlb_update_o),
+
+                     ptw.itlb_access_i.eq(itlb_lu_access),
+                     ptw.itlb_hit_i.eq(itlb_lu_hit),
+                     ptw.itlb_vaddr_i.eq(self.icache_areq_i.fetch_vaddr),
+
+                     ptw.dtlb_access_i.eq(dtlb_lu_access),
+                     ptw.dtlb_hit_i.eq(dtlb_lu_hit),
+                     ptw.dtlb_vaddr_i.eq(self.lsu_vaddr_i),
+
+                     ptw.req_port_i.eq(self.req_port_i),
+                     self.req_port_o.eq(ptw.req_port_o),
+                    ]
+
+        # ila_1 i_ila_1 (
+        #     .clk(clk_i), # input wire clk
+        #     .probe0({req_port_o.address_tag, req_port_o.address_index}),
+        #     .probe1(req_port_o.data_req), # input wire [63:0]  probe1
+        #     .probe2(req_port_i.data_gnt), # input wire [0:0]  probe2
+        #     .probe3(req_port_i.data_rdata), # input wire [0:0]  probe3
+        #     .probe4(req_port_i.data_rvalid), # input wire [0:0]  probe4
+        #     .probe5(ptw_error), # input wire [1:0]  probe5
+        #     .probe6(update_vaddr), # input wire [0:0]  probe6
+        #     .probe7(update_ptw_itlb.valid), # input wire [0:0]  probe7
+        #     .probe8(update_ptw_dtlb.valid), # input wire [0:0]  probe8
+        #     .probe9(dtlb_lu_access), # input wire [0:0]  probe9
+        #     .probe10(lsu_vaddr_i), # input wire [0:0]  probe10
+        #     .probe11(dtlb_lu_hit), # input wire [0:0]  probe11
+        #     .probe12(itlb_lu_access), # input wire [0:0]  probe12
+        #     .probe13(icache_areq_i.fetch_vaddr), # input wire [0:0]  probe13
+        #     .probe14(itlb_lu_hit) # input wire [0:0]  probe13
+        # );
+
+        #-----------------------
+        # Instruction Interface
+        #-----------------------
+        # The instruction interface is a simple request response interface
+
+        # MMU disabled: just pass through
+        m.d.comb += [self.icache_areq_o.fetch_valid.eq(
+                                                self.icache_areq_i.fetch_req),
+                     # play through in case we disabled address translation
+                     self.icache_areq_o.fetch_paddr.eq(
+                                                self.icache_areq_i.fetch_vaddr)
+                    ]
+        # two potential exception sources:
+        # 1. HPTW threw an exception -> signal with a page fault exception
+        # 2. We got an access error because of insufficient permissions ->
+        #    throw an access exception
+        m.d.comb += self.icache_areq_o.fetch_exception.valid.eq(0)
+        # Check whether we are allowed to access this memory region
+        # from a fetch perspective
+
+        # PLATEN TODO: use PermissionValidator instead [we like modules]
+        m.d.comb += iaccess_err.eq(self.icache_areq_i.fetch_req & \
+                                   (((self.priv_lvl_i == PRIV_LVL_U) & \
+                                      ~itlb_content.u) | \
+                                   ((self.priv_lvl_i == PRIV_LVL_S) & \
+                                    itlb_content.u)))
+
+        # MMU enabled: address from TLB, request delayed until hit.
+        # Error when TLB hit and no access right or TLB hit and
+        # translated address not valid (e.g.  AXI decode error),
+        # or when PTW performs walk due to ITLB miss and raises
+        # an error.
+        with m.If (self.enable_translation_i):
+            # we work with SV48, so if VM is enabled, check that
+            # all bits [47:38] are equal
+            with m.If (self.icache_areq_i.fetch_req & \
+                ~(((~self.icache_areq_i.fetch_vaddr[47:64]) == 0) | \
+                 (self.icache_areq_i.fetch_vaddr[47:64]) == 0)):
+                fe = self.icache_areq_o.fetch_exception
+                m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
+                             fe.tval.eq(self.icache_areq_i.fetch_vaddr),
+                             fe.valid.eq(1)
+                            ]
+
+            m.d.comb += self.icache_areq_o.fetch_valid.eq(0)
+
+            # 4K page
+            paddr = Signal.like(self.icache_areq_o.fetch_paddr)
+            paddr4k = Cat(self.icache_areq_i.fetch_vaddr[0:12],
+                          itlb_content.ppn)
+            m.d.comb += paddr.eq(paddr4k)
+            # Mega page
+            with m.If(itlb_is_2M):
+                m.d.comb += paddr[12:21].eq(
+                          self.icache_areq_i.fetch_vaddr[12:21])
+            # Giga page
+            with m.If(itlb_is_1G):
+                m.d.comb += paddr[12:30].eq(
+                          self.icache_areq_i.fetch_vaddr[12:30])
+            m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
+            # Tera page
+            with m.If(itlb_is_512G):
+                m.d.comb += paddr[12:39].eq(
+                          self.icache_areq_i.fetch_vaddr[12:39])
+            m.d.comb += self.icache_areq_o.fetch_paddr.eq(paddr)
+
+            # ---------
+            # ITLB Hit
+            # --------
+            # if we hit the ITLB output the request signal immediately
+            with m.If(itlb_lu_hit):
+                m.d.comb += self.icache_areq_o.fetch_valid.eq(
+                                          self.icache_areq_i.fetch_req)
+                # we got an access error
+                with m.If (iaccess_err):
+                    # throw a page fault
+                    fe = self.icache_areq_o.fetch_exception
+                    m.d.comb += [fe.cause.eq(INSTR_ACCESS_FAULT),
+                                 fe.tval.eq(self.icache_areq_i.fetch_vaddr),
+                                 fe.valid.eq(1)
+                                ]
+            # ---------
+            # ITLB Miss
+            # ---------
+            # watch out for exceptions happening during walking the page table
+            with m.Elif(ptw_active & walking_instr):
+                m.d.comb += self.icache_areq_o.fetch_valid.eq(ptw_error)
+                fe = self.icache_areq_o.fetch_exception
+                m.d.comb += [fe.cause.eq(INSTR_PAGE_FAULT),
+                             fe.tval.eq(uaddr64),
+                             fe.valid.eq(1)
+                            ]
+
+        #-----------------------
+        # Data Interface
+        #-----------------------
+
+        lsu_vaddr = Signal(64)
+        dtlb_pte = PTE()
+        misaligned_ex = RVException()
+        lsu_req = Signal()
+        lsu_is_store = Signal()
+        dtlb_hit = Signal()
+        #dtlb_is_2M = Signal()
+        #dtlb_is_1G = Signal()
+        #dtlb_is_512 = Signal()
+
+        # check if we need to do translation or if we are always
+        # ready (e.g.: we are not translating anything)
+        m.d.comb += self.lsu_dtlb_hit_o.eq(Mux(self.en_ld_st_translation_i,
+                                          dtlb_lu_hit, 1))
+
+        # The data interface is simpler and only consists of a
+        # request/response interface
+        m.d.comb += [
+            # save request and DTLB response
+            lsu_vaddr.eq(self.lsu_vaddr_i),
+            lsu_req.eq(self.lsu_req_i),
+            misaligned_ex.eq(self.misaligned_ex_i),
+            dtlb_pte.eq(dtlb_content),
+            dtlb_hit.eq(dtlb_lu_hit),
+            lsu_is_store.eq(self.lsu_is_store_i),
+            #dtlb_is_2M.eq(dtlb_is_2M),
+            #dtlb_is_1G.eq(dtlb_is_1G),
+            ##dtlb_is_512.eq(self.dtlb_is_512G) #????
+        ]
+        m.d.sync += [
+            self.lsu_paddr_o.eq(lsu_vaddr),
+            self.lsu_valid_o.eq(lsu_req),
+            self.lsu_exception_o.eq(misaligned_ex),
+        ]
+
+        sverr = Signal()
+        usrerr = Signal()
+
+        m.d.comb += [
+            # mute misaligned exceptions if there is no request
+            # otherwise they will throw accidental exceptions
+            misaligned_ex.valid.eq(self.misaligned_ex_i.valid & self.lsu_req_i),
+
+            # SUM is not set and we are trying to access a user
+            # page in supervisor mode
+            sverr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_S & ~self.sum_i & \
+                       dtlb_pte.u),
+            # this is not a user page but we are in user mode and
+            # trying to access it
+            usrerr.eq(self.ld_st_priv_lvl_i == PRIV_LVL_U & ~dtlb_pte.u),
+
+            # Check if the User flag is set, then we may only
+            # access it in supervisor mode if SUM is enabled
+            daccess_err.eq(sverr | usrerr),
+            ]
+
+        # translation is enabled and no misaligned exception occurred
+        with m.If(self.en_ld_st_translation_i & ~misaligned_ex.valid):
+            m.d.comb += lsu_req.eq(0)
+            # 4K page
+            paddr = Signal.like(lsu_vaddr)
+            paddr4k = Cat(lsu_vaddr[0:12], itlb_content.ppn)
+            m.d.comb += paddr.eq(paddr4k)
+            # Mega page
+            with m.If(dtlb_is_2M):
+                m.d.comb += paddr[12:21].eq(lsu_vaddr[12:21])
+            # Giga page
+            with m.If(dtlb_is_1G):
+                m.d.comb += paddr[12:30].eq(lsu_vaddr[12:30])
+            m.d.sync += self.lsu_paddr_o.eq(paddr)
+            # TODO platen tera_page
+
+            # ---------
+            # DTLB Hit
+            # --------
+            with m.If(dtlb_hit & lsu_req):
+                m.d.comb += lsu_req.eq(1)
+                # this is a store
+                with m.If (lsu_is_store):
+                    # check if the page is write-able and
+                    # we are not violating privileges
+                    # also check if the dirty flag is set
+                    with m.If(~dtlb_pte.w | daccess_err | ~dtlb_pte.d):
+                        le = self.lsu_exception_o
+                        m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
+                                     le.tval.eq(lsu_vaddr),
+                                     le.valid.eq(1)
+                                    ]
+
+                # this is a load, check for sufficient access
+                # privileges - throw a page fault if necessary
+                with m.Elif(daccess_err):
+                    le = self.lsu_exception_o
+                    m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
+                                 le.tval.eq(lsu_vaddr),
+                                 le.valid.eq(1)
+                                ]
+            # ---------
+            # DTLB Miss
+            # ---------
+            # watch out for exceptions
+            with m.Elif (ptw_active & ~walking_instr):
+                # page table walker threw an exception
+                with m.If (ptw_error):
+                    # an error makes the translation valid
+                    m.d.comb += lsu_req.eq(1)
+                    # the page table walker can only throw page faults
+                    with m.If (lsu_is_store):
+                        le = self.lsu_exception_o
+                        m.d.sync += [le.cause.eq(STORE_PAGE_FAULT),
+                                     le.tval.eq(uaddr64),
+                                     le.valid.eq(1)
+                                    ]
+                    with m.Else():
+                        m.d.sync += [le.cause.eq(LOAD_PAGE_FAULT),
+                                     le.tval.eq(uaddr64),
+                                     le.valid.eq(1)
+                                    ]
+
+        return m
+
+    def ports(self):
+        return [self.flush_i, self.enable_translation_i,
+                self.en_ld_st_translation_i,
+                self.lsu_req_i,
+                self.lsu_vaddr_i, self.lsu_is_store_i, self.lsu_dtlb_hit_o,
+                self.lsu_valid_o, self.lsu_paddr_o,
+                self.priv_lvl_i, self.ld_st_priv_lvl_i, self.sum_i, self.mxr_i,
+                self.satp_ppn_i, self.asid_i, self.flush_tlb_i,
+                self.itlb_miss_o, self.dtlb_miss_o] + \
+                self.icache_areq_i.ports() + self.icache_areq_o.ports() + \
+                self.req_port_i.ports() + self.req_port_o.ports() + \
+                self.misaligned_ex_i.ports() + self.lsu_exception_o.ports()
+
+if __name__ == '__main__':
+    mmu = MMU()
+    vl = rtlil.convert(mmu, ports=mmu.ports())
+    with open("test_mmu.il", "w") as f:
+        f.write(vl)
+
diff --git a/unused_please_ignore_completely/TLB/ariane/p_lru.txt b/unused_please_ignore_completely/TLB/ariane/p_lru.txt
new file mode 100644 (file)
index 0000000..4bac768
--- /dev/null
@@ -0,0 +1,51 @@
+pseudo-LRU
+
+two-way set associative - one bit
+
+   indicates which line of the two has been reference more recently
+
+
+four-way set associative - three bits
+
+   each bit represents one branch point in a binary decision tree; let 1
+   represent that the left side has been referenced more recently than the
+   right side, and 0 vice-versa
+
+              are all 4 lines valid?
+                   /       \
+                 yes        no, use an invalid line
+                  |
+                  |
+                  |
+             bit_0 == 0?            state | replace      ref to | next state
+              /       \             ------+--------      -------+-----------
+             y         n             00x  |  line_0      line_0 |    11_
+            /           \            01x  |  line_1      line_1 |    10_
+     bit_1 == 0?    bit_2 == 0?      1x0  |  line_2      line_2 |    0_1
+       /    \          /    \        1x1  |  line_3      line_3 |    0_0
+      y      n        y      n
+     /        \      /        \        ('x' means       ('_' means unchanged)
+   line_0  line_1  line_2  line_3      don't care)
+
+   (see Figure 3-7, p. 3-18, in Intel Embedded Pentium Processor Family Dev.
+    Manual, 1998, http://www.intel.com/design/intarch/manuals/273204.htm)
+
+
+note that there is a 6-bit encoding for true LRU for four-way set associative
+
+  bit 0: bank[1] more recently used than bank[0]
+  bit 1: bank[2] more recently used than bank[0]
+  bit 2: bank[2] more recently used than bank[1]
+  bit 3: bank[3] more recently used than bank[0]
+  bit 4: bank[3] more recently used than bank[1]
+  bit 5: bank[3] more recently used than bank[2]
+
+  this results in 24 valid bit patterns within the 64 possible bit patterns
+  (4! possible valid traces for bank references)
+
+  e.g., a trace of 0 1 2 3, where 0 is LRU and 3 is MRU, is encoded as 111111
+
+  you can implement a state machine with a 256x6 ROM (6-bit state encoding
+  appended with a 2-bit bank reference input will yield a new 6-bit state),
+  and you can implement an LRU bank indicator with a 64x2 ROM
+
diff --git a/unused_please_ignore_completely/TLB/ariane/plru.py b/unused_please_ignore_completely/TLB/ariane/plru.py
new file mode 100644 (file)
index 0000000..fb4f5c0
--- /dev/null
@@ -0,0 +1,2 @@
+# moved to nmutil https://git.libre-soc.org/?p=nmutil.git;a=tree
+from nmutil.plru import PLRU
diff --git a/unused_please_ignore_completely/TLB/ariane/ptw.py b/unused_please_ignore_completely/TLB/ariane/ptw.py
new file mode 100644 (file)
index 0000000..4046c71
--- /dev/null
@@ -0,0 +1,556 @@
+"""
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: David Schaffenrath, TU Graz
+# Author: Florian Zaruba, ETH Zurich
+# Date: 24.4.2017
+# Description: Hardware-PTW
+
+/* verilator lint_off WIDTH */
+import ariane_pkg::*;
+
+see linux kernel source:
+
+* "arch/riscv/include/asm/page.h"
+* "arch/riscv/include/asm/mmu_context.h"
+* "arch/riscv/Kconfig" (CONFIG_PAGE_OFFSET)
+
+"""
+
+from nmigen import Const, Signal, Cat, Module, Elaboratable
+from nmigen.hdl.ast import ArrayProxy
+from nmigen.cli import verilog, rtlil
+from math import log2
+
+
+DCACHE_SET_ASSOC = 8
+CONFIG_L1D_SIZE =  32*1024
+DCACHE_INDEX_WIDTH = int(log2(CONFIG_L1D_SIZE / DCACHE_SET_ASSOC))
+DCACHE_TAG_WIDTH = 56 - DCACHE_INDEX_WIDTH
+
+ASID_WIDTH = 8
+
+
+class DCacheReqI:
+    def __init__(self):
+        self.address_index = Signal(DCACHE_INDEX_WIDTH)
+        self.address_tag = Signal(DCACHE_TAG_WIDTH)
+        self.data_wdata = Signal(64)
+        self.data_req = Signal()
+        self.data_we = Signal()
+        self.data_be = Signal(8)
+        self.data_size = Signal(2)
+        self.kill_req = Signal()
+        self.tag_valid = Signal()
+
+    def eq(self, inp):
+        res = []
+        for (o, i) in zip(self.ports(), inp.ports()):
+            res.append(o.eq(i))
+        return res
+
+    def ports(self):
+        return [self.address_index, self.address_tag,
+                self.data_wdata, self.data_req,
+                self.data_we, self.data_be, self.data_size,
+                self.kill_req, self.tag_valid,
+            ]
+
+class DCacheReqO:
+    def __init__(self):
+        self.data_gnt = Signal()
+        self.data_rvalid = Signal()
+        self.data_rdata = Signal(64) # actually in PTE object format
+
+    def eq(self, inp):
+        res = []
+        for (o, i) in zip(self.ports(), inp.ports()):
+            res.append(o.eq(i))
+        return res
+
+    def ports(self):
+        return [self.data_gnt, self.data_rvalid, self.data_rdata]
+
+
+class PTE: #(RecordObject):
+    def __init__(self):
+        self.v = Signal()
+        self.r = Signal()
+        self.w = Signal()
+        self.x = Signal()
+        self.u = Signal()
+        self.g = Signal()
+        self.a = Signal()
+        self.d = Signal()
+        self.rsw = Signal(2)
+        self.ppn = Signal(44)
+        self.reserved = Signal(10)
+
+    def flatten(self):
+        return Cat(*self.ports())
+
+    def eq(self, x):
+        if isinstance(x, ArrayProxy):
+            res = []
+            for o in self.ports():
+                i = getattr(x, o.name)
+                res.append(i)
+            x = Cat(*res)
+        else:
+            x = x.flatten()
+        return self.flatten().eq(x)
+
+    def __iter__(self):
+        """ order is critical so that flatten creates LSB to MSB
+        """
+        yield self.v
+        yield self.r
+        yield self.w
+        yield self.x
+        yield self.u
+        yield self.g
+        yield self.a
+        yield self.d
+        yield self.rsw
+        yield self.ppn
+        yield self.reserved
+
+    def ports(self):
+        return list(self)
+
+
+class TLBUpdate:
+    def __init__(self, asid_width):
+        self.valid = Signal()      # valid flag
+        self.is_2M = Signal()
+        self.is_1G = Signal()
+        self.is_512G = Signal()
+        self.vpn = Signal(36)
+        self.asid = Signal(asid_width)
+        self.content = PTE()
+
+    def flatten(self):
+        return Cat(*self.ports())
+
+    def eq(self, x):
+        return self.flatten().eq(x.flatten())
+
+    def ports(self):
+        return [self.valid, self.is_2M, self.is_1G, self.vpn, self.asid] + \
+                self.content.ports()
+
+
+# SV48 defines four levels of page tables
+LVL1 = Const(0, 2) # defined to 0 so that ptw_lvl default-resets to LVL1
+LVL2 = Const(1, 2)
+LVL3 = Const(2, 2)
+LVL4 = Const(3, 2)
+
+
+class PTW(Elaboratable):
+    def __init__(self, asid_width=8):
+        self.asid_width = asid_width
+
+        self.flush_i = Signal() # flush everything, we need to do this because
+        # actually everything we do is speculative at this stage
+        # e.g.: there could be a CSR instruction that changes everything
+        self.ptw_active_o = Signal(reset=1)    # active if not IDLE
+        self.walking_instr_o = Signal()        # set when walking for TLB
+        self.ptw_error_o = Signal()            # set when an error occurred
+        self.enable_translation_i = Signal()   # CSRs indicate to enable SV48
+        self.en_ld_st_translation_i = Signal() # enable VM translation for ld/st
+
+        self.lsu_is_store_i = Signal()       # translation triggered by store
+        # PTW memory interface
+        self.req_port_i = DCacheReqO()
+        self.req_port_o = DCacheReqI()
+
+        # to TLBs, update logic
+        self.itlb_update_o = TLBUpdate(asid_width)
+        self.dtlb_update_o = TLBUpdate(asid_width)
+
+        self.update_vaddr_o = Signal(48)
+
+        self.asid_i = Signal(self.asid_width)
+        # from TLBs
+        # did we miss?
+        self.itlb_access_i = Signal()
+        self.itlb_hit_i = Signal()
+        self.itlb_vaddr_i = Signal(64)
+
+        self.dtlb_access_i = Signal()
+        self.dtlb_hit_i = Signal()
+        self.dtlb_vaddr_i = Signal(64)
+        # from CSR file
+        self.satp_ppn_i = Signal(44) # ppn from satp
+        self.mxr_i = Signal()
+        # Performance counters
+        self.itlb_miss_o = Signal()
+        self.dtlb_miss_o = Signal()
+
+    def ports(self):
+        return [self.ptw_active_o, self.walking_instr_o, self.ptw_error_o,
+                ]
+        return [
+                self.enable_translation_i, self.en_ld_st_translation_i,
+                self.lsu_is_store_i, self.req_port_i, self.req_port_o,
+                self.update_vaddr_o,
+                self.asid_i,
+                self.itlb_access_i, self.itlb_hit_i, self.itlb_vaddr_i,
+                self.dtlb_access_i, self.dtlb_hit_i, self.dtlb_vaddr_i,
+                self.satp_ppn_i, self.mxr_i,
+                self.itlb_miss_o, self.dtlb_miss_o
+            ] + self.itlb_update_o.ports() + self.dtlb_update_o.ports()
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # input registers
+        data_rvalid = Signal()
+        data_rdata = Signal(64)
+
+        # NOTE: pte decodes the incoming bit-field (data_rdata). data_rdata
+        # is spec'd in 64-bit binary-format: better to spec as Record?
+        pte = PTE()
+        m.d.comb += pte.flatten().eq(data_rdata)
+
+        # SV48 defines four levels of page tables
+        ptw_lvl = Signal(2) # default=0=LVL1 on reset (see above)
+        ptw_lvl1 = Signal()
+        ptw_lvl2 = Signal()
+        ptw_lvl3 = Signal()
+        ptw_lvl4 = Signal()
+        m.d.comb += [ptw_lvl1.eq(ptw_lvl == LVL1),
+                     ptw_lvl2.eq(ptw_lvl == LVL2),
+                     ptw_lvl3.eq(ptw_lvl == LVL3),
+                     ptw_lvl4.eq(ptw_lvl == LVL4)
+                     ]
+
+        # is this an instruction page table walk?
+        is_instr_ptw = Signal()
+        global_mapping = Signal()
+        # latched tag signal
+        tag_valid = Signal()
+        # register the ASID
+        tlb_update_asid = Signal(self.asid_width)
+        # register VPN we need to walk, SV48 defines a 48 bit virtual addr
+        vaddr = Signal(64)
+        # 4 byte aligned physical pointer
+        ptw_pptr = Signal(56)
+
+        end = DCACHE_INDEX_WIDTH + DCACHE_TAG_WIDTH
+        m.d.sync += [
+            # Assignments
+            self.update_vaddr_o.eq(vaddr),
+
+            self.walking_instr_o.eq(is_instr_ptw),
+            # directly output the correct physical address
+            self.req_port_o.address_index.eq(ptw_pptr[0:DCACHE_INDEX_WIDTH]),
+            self.req_port_o.address_tag.eq(ptw_pptr[DCACHE_INDEX_WIDTH:end]),
+            # we are never going to kill this request
+            self.req_port_o.kill_req.eq(0),              # XXX assign comb?
+            # we are never going to write with the HPTW
+            self.req_port_o.data_wdata.eq(Const(0, 64)), # XXX assign comb?
+            # -----------
+            # TLB Update
+            # -----------
+            self.itlb_update_o.vpn.eq(vaddr[12:48]),
+            self.dtlb_update_o.vpn.eq(vaddr[12:48]),
+            # update the correct page table level
+            self.itlb_update_o.is_2M.eq(ptw_lvl3),
+            self.itlb_update_o.is_1G.eq(ptw_lvl2),
+            self.itlb_update_o.is_512G.eq(ptw_lvl1),
+            self.dtlb_update_o.is_2M.eq(ptw_lvl3),
+            self.dtlb_update_o.is_1G.eq(ptw_lvl2),
+            self.dtlb_update_o.is_512G.eq(ptw_lvl1),
+            
+            # output the correct ASID
+            self.itlb_update_o.asid.eq(tlb_update_asid),
+            self.dtlb_update_o.asid.eq(tlb_update_asid),
+            # set the global mapping bit
+            self.itlb_update_o.content.eq(pte),
+            self.itlb_update_o.content.g.eq(global_mapping),
+            self.dtlb_update_o.content.eq(pte),
+            self.dtlb_update_o.content.g.eq(global_mapping),
+
+            self.req_port_o.tag_valid.eq(tag_valid),
+        ]
+
+        #-------------------
+        # Page table walker   #needs update
+        #-------------------
+        # A virtual address va is translated into a physical address pa as
+        # follows:
+        # 1. Let a be sptbr.ppn × PAGESIZE, and let i = LEVELS-1. (For Sv48,
+        #    PAGESIZE=2^12 and LEVELS=4.)
+        # 2. Let pte be the value of the PTE at address a+va.vpn[i]×PTESIZE.
+        #    (For Sv32, PTESIZE=4.)
+        # 3. If pte.v = 0, or if pte.r = 0 and pte.w = 1, stop and raise an
+        #    access exception.
+        # 4. Otherwise, the PTE is valid. If pte.r = 1 or pte.x = 1, go to
+        #    step 5.  Otherwise, this PTE is a pointer to the next level of
+        #    the page table.
+        #    Let i=i-1. If i < 0, stop and raise an access exception.
+        #    Otherwise, let a = pte.ppn × PAGESIZE and go to step 2.
+        # 5. A leaf PTE has been found. Determine if the requested memory
+        #    access is allowed by the pte.r, pte.w, and pte.x bits. If not,
+        #    stop and raise an access exception. Otherwise, the translation is
+        #    successful.  Set pte.a to 1, and, if the memory access is a
+        #    store, set pte.d to 1.
+        #    The translated physical address is given as follows:
+        #      - pa.pgoff = va.pgoff.
+        #      - If i > 0, then this is a superpage translation and
+        #        pa.ppn[i-1:0] = va.vpn[i-1:0].
+        #      - pa.ppn[LEVELS-1:i] = pte.ppn[LEVELS-1:i].
+        # 6. If i > 0 and pa.ppn[i − 1 : 0] != 0, this is a misaligned
+        #    superpage stop and raise a page-fault exception.
+
+        m.d.sync += tag_valid.eq(0)
+
+        # default assignments
+        m.d.comb += [
+            # PTW memory interface
+            self.req_port_o.data_req.eq(0),
+            self.req_port_o.data_be.eq(Const(0xFF, 8)),
+            self.req_port_o.data_size.eq(Const(0b11, 2)),
+            self.req_port_o.data_we.eq(0),
+            self.ptw_error_o.eq(0),
+            self.itlb_update_o.valid.eq(0),
+            self.dtlb_update_o.valid.eq(0),
+
+            self.itlb_miss_o.eq(0),
+            self.dtlb_miss_o.eq(0),
+        ]
+
+        # ------------
+        # State Machine
+        # ------------
+
+        with m.FSM() as fsm:
+
+            with m.State("IDLE"):
+                self.idle(m, is_instr_ptw, ptw_lvl, global_mapping,
+                          ptw_pptr, vaddr, tlb_update_asid)
+
+            with m.State("WAIT_GRANT"):
+                self.grant(m, tag_valid, data_rvalid)
+
+            with m.State("PTE_LOOKUP"):
+                # we wait for the valid signal
+                with m.If(data_rvalid):
+                    self.lookup(m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4,
+                                data_rvalid, global_mapping,
+                                is_instr_ptw, ptw_pptr)
+
+            # Propagate error to MMU/LSU
+            with m.State("PROPAGATE_ERROR"):
+                m.next = "IDLE"
+                m.d.comb += self.ptw_error_o.eq(1)
+
+            # wait for the rvalid before going back to IDLE
+            with m.State("WAIT_RVALID"):
+                with m.If(data_rvalid):
+                    m.next = "IDLE"
+
+        m.d.sync += [data_rdata.eq(self.req_port_i.data_rdata),
+                     data_rvalid.eq(self.req_port_i.data_rvalid)
+                    ]
+
+        return m
+
+    def set_grant_state(self, m):
+        # should we have flushed before we got an rvalid,
+        # wait for it until going back to IDLE
+        with m.If(self.flush_i):
+            with m.If (self.req_port_i.data_gnt):
+                m.next = "WAIT_RVALID"
+            with m.Else():
+                m.next = "IDLE"
+        with m.Else():
+            m.next = "WAIT_GRANT"
+
+    def idle(self, m, is_instr_ptw, ptw_lvl, global_mapping,
+                          ptw_pptr, vaddr, tlb_update_asid):
+        # by default we start with the top-most page table
+        m.d.sync += [is_instr_ptw.eq(0),
+                     ptw_lvl.eq(LVL1),
+                     global_mapping.eq(0),
+                     self.ptw_active_o.eq(0), # deactive (IDLE)
+                    ]
+        # work out itlb/dtlb miss
+        m.d.comb += self.itlb_miss_o.eq(self.enable_translation_i & \
+                                 self.itlb_access_i & \
+                                 ~self.itlb_hit_i & \
+                                 ~self.dtlb_access_i)
+        m.d.comb += self.dtlb_miss_o.eq(self.en_ld_st_translation_i & \
+                                        self.dtlb_access_i & \
+                                        ~self.dtlb_hit_i)
+        # we got an ITLB miss?
+        with m.If(self.itlb_miss_o):
+            pptr = Cat(Const(0, 3), self.itlb_vaddr_i[30:48],
+                       self.satp_ppn_i)
+            m.d.sync += [ptw_pptr.eq(pptr),
+                         is_instr_ptw.eq(1),
+                         vaddr.eq(self.itlb_vaddr_i),
+                         tlb_update_asid.eq(self.asid_i),
+                        ]
+            self.set_grant_state(m)
+
+        # we got a DTLB miss?
+        with m.Elif(self.dtlb_miss_o):
+            pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:48],
+                       self.satp_ppn_i)
+            m.d.sync += [ptw_pptr.eq(pptr),
+                         vaddr.eq(self.dtlb_vaddr_i),
+                         tlb_update_asid.eq(self.asid_i),
+                        ]
+            self.set_grant_state(m)
+
+    def grant(self, m, tag_valid, data_rvalid):
+        # we've got a data WAIT_GRANT so tell the
+        # cache that the tag is valid
+
+        # send a request out
+        m.d.comb += self.req_port_o.data_req.eq(1)
+        # wait for the WAIT_GRANT
+        with m.If(self.req_port_i.data_gnt):
+            # send the tag valid signal one cycle later
+            m.d.sync += tag_valid.eq(1)
+            # should we have flushed before we got an rvalid,
+            # wait for it until going back to IDLE
+            with m.If(self.flush_i):
+                with m.If (~data_rvalid):
+                    m.next = "WAIT_RVALID"
+                with m.Else():
+                    m.next = "IDLE"
+            with m.Else():
+                m.next = "PTE_LOOKUP"
+
+    def lookup(self, m, pte, ptw_lvl, ptw_lvl1, ptw_lvl2, ptw_lvl3, ptw_lvl4, 
+                            data_rvalid, global_mapping,
+                            is_instr_ptw, ptw_pptr):
+        # temporaries
+        pte_rx = Signal(reset_less=True)
+        pte_exe = Signal(reset_less=True)
+        pte_inv = Signal(reset_less=True)
+        pte_a = Signal(reset_less=True)
+        st_wd = Signal(reset_less=True)
+        m.d.comb += [pte_rx.eq(pte.r | pte.x),
+                    pte_exe.eq(~pte.x | ~pte.a),
+                    pte_inv.eq(~pte.v | (~pte.r & pte.w)),
+                    pte_a.eq(pte.a & (pte.r | (pte.x & self.mxr_i))),
+                    st_wd.eq(self.lsu_is_store_i & (~pte.w | ~pte.d))]
+
+        l1err = Signal(reset_less=True)
+        l2err = Signal(reset_less=True)
+        l3err = Signal(reset_less=True)
+        m.d.comb += [l3err.eq((ptw_lvl3) & pte.ppn[0:9] != Const(0,0)),
+                     l2err.eq((ptw_lvl2) & pte.ppn[0:18] != Const(0, 18)),
+                     l1err.eq((ptw_lvl1) & pte.ppn[0:27] != Const(0, 27))]
+
+        # check if the global mapping bit is set
+        with m.If (pte.g):
+            m.d.sync += global_mapping.eq(1)
+
+        m.next = "IDLE"
+
+        # -------------
+        # Invalid PTE
+        # -------------
+        # If pte.v = 0, or if pte.r = 0 and pte.w = 1,
+        # stop and raise a page-fault exception.
+        with m.If (pte_inv):
+            m.next = "PROPAGATE_ERROR"
+
+        # -----------
+        # Valid PTE
+        # -----------
+
+        # it is a valid PTE
+        # if pte.r = 1 or pte.x = 1 it is a valid PTE
+        with m.Elif (pte_rx):
+            # Valid translation found (either 1G, 2M or 4K)
+            with m.If(is_instr_ptw):
+                # ------------
+                # Update ITLB
+                # ------------
+                # If page not executable, we can directly raise error.
+                # This doesn't put a useless entry into the TLB.
+                # The same idea applies to the access flag since we let
+                # the access flag be managed by SW.
+                with m.If (pte_exe):
+                    m.next = "IDLE"
+                with m.Else():
+                    m.d.comb += self.itlb_update_o.valid.eq(1)
+
+            with m.Else():
+                # ------------
+                # Update DTLB
+                # ------------
+                # Check if the access flag has been set, otherwise
+                # throw page-fault and let software handle those bits.
+                # If page not readable (there are no write-only pages)
+                # directly raise an error. This doesn't put a useless
+                # entry into the TLB.
+                with m.If(pte_a):
+                    m.d.comb += self.dtlb_update_o.valid.eq(1)
+                with m.Else():
+                    m.next = "PROPAGATE_ERROR"
+                # Request is a store: perform additional checks
+                # If the request was a store and the page not
+                # write-able, raise an error
+                # the same applies if the dirty flag is not set
+                with m.If (st_wd):
+                    m.d.comb += self.dtlb_update_o.valid.eq(0)
+                    m.next = "PROPAGATE_ERROR"
+
+            # check if the ppn is correctly aligned: Case (6)
+            with m.If(l1err | l2err | l3err):
+                m.next = "PROPAGATE_ERROR"
+                m.d.comb += [self.dtlb_update_o.valid.eq(0),
+                             self.itlb_update_o.valid.eq(0)]
+
+        # this is a pointer to the next TLB level
+        with m.Else():
+            # pointer to next level of page table
+            with m.If (ptw_lvl1):
+                # we are in the second level now
+                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[30:39], pte.ppn)
+                m.d.sync += [ptw_pptr.eq(pptr),
+                             ptw_lvl.eq(LVL2)
+                            ]
+            with m.If(ptw_lvl2):
+                # here we received a pointer to the third level
+                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[21:30], pte.ppn)
+                m.d.sync += [ptw_pptr.eq(pptr),
+                             ptw_lvl.eq(LVL3)
+                            ]
+            with m.If(ptw_lvl3): #guess: shift page levels by one
+                # here we received a pointer to the fourth level
+                # the last one is near the page offset
+                pptr = Cat(Const(0, 3), self.dtlb_vaddr_i[12:21], pte.ppn)
+                m.d.sync += [ptw_pptr.eq(pptr),
+                             ptw_lvl.eq(LVL4)
+                            ]
+            self.set_grant_state(m)
+
+            with m.If (ptw_lvl4):
+                # Should already be the last level
+                # page table => Error
+                m.d.sync += ptw_lvl.eq(LVL4)
+                m.next = "PROPAGATE_ERROR"
+
+
+if __name__ == '__main__':
+    ptw = PTW()
+    vl = rtlil.convert(ptw, ports=ptw.ports())
+    with open("test_ptw.il", "w") as f:
+        f.write(vl)
diff --git a/unused_please_ignore_completely/TLB/ariane/test/__init__.py b/unused_please_ignore_completely/TLB/ariane/test/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/unused_please_ignore_completely/TLB/ariane/test/test_plru.py b/unused_please_ignore_completely/TLB/ariane/test/test_plru.py
new file mode 100644 (file)
index 0000000..9222d79
--- /dev/null
@@ -0,0 +1,13 @@
+import sys
+from soc.TLB.ariane.plru import PLRU
+from nmigen.compat.sim import run_simulation
+
+
+def tbench(dut):
+    yield
+
+
+if __name__ == "__main__":
+    dut = PLRU(4)
+    run_simulation(dut, tbench(dut), vcd_name="test_plru.vcd")
+    print("PLRU Unit Test Success")
diff --git a/unused_please_ignore_completely/TLB/ariane/test/test_ptw.py b/unused_please_ignore_completely/TLB/ariane/test/test_ptw.py
new file mode 100644 (file)
index 0000000..3969756
--- /dev/null
@@ -0,0 +1,127 @@
+from nmigen.compat.sim import run_simulation
+from soc.TLB.ariane.ptw import PTW, PTE
+
+# unit was changed, test needs to be changed
+
+
+def tbench(dut):
+
+    addr = 0x8000000
+
+    #pte = PTE()
+    # yield pte.v.eq(1)
+    # yield pte.r.eq(1)
+
+    yield dut.req_port_i.data_gnt.eq(1)
+    yield dut.req_port_i.data_rvalid.eq(1)
+    yield dut.req_port_i.data_rdata.eq(0x43)  # pte.flatten())
+
+    # data lookup
+    yield dut.en_ld_st_translation_i.eq(1)
+    yield dut.asid_i.eq(1)
+
+    yield dut.dtlb_access_i.eq(1)
+    yield dut.dtlb_hit_i.eq(0)
+    yield dut.dtlb_vaddr_i.eq(0x400000000)
+
+    yield
+    yield
+    yield
+
+    yield dut.dtlb_access_i.eq(1)
+    yield dut.dtlb_hit_i.eq(0)
+    yield dut.dtlb_vaddr_i.eq(0x200000)
+
+    yield
+    yield
+    yield
+
+    yield dut.req_port_i.data_gnt.eq(0)
+    yield dut.dtlb_access_i.eq(1)
+    yield dut.dtlb_hit_i.eq(0)
+    yield dut.dtlb_vaddr_i.eq(0x400000011)
+
+    yield
+    yield dut.req_port_i.data_gnt.eq(1)
+    yield
+    yield
+
+    # data lookup, PTW levels 1-2-3
+    addr = 0x4000000
+    yield dut.dtlb_vaddr_i.eq(addr)
+    yield dut.mxr_i.eq(0x1)
+    yield dut.req_port_i.data_gnt.eq(1)
+    yield dut.req_port_i.data_rvalid.eq(1)
+    # pte.flatten())
+    yield dut.req_port_i.data_rdata.eq(0x41 | (addr >> 12) << 10)
+
+    yield dut.en_ld_st_translation_i.eq(1)
+    yield dut.asid_i.eq(1)
+
+    yield dut.dtlb_access_i.eq(1)
+    yield dut.dtlb_hit_i.eq(0)
+    yield dut.dtlb_vaddr_i.eq(addr)
+
+    yield
+    yield
+    yield
+    yield
+    yield
+    yield
+    yield
+    yield
+
+    yield dut.req_port_i.data_gnt.eq(0)
+    yield dut.dtlb_access_i.eq(1)
+    yield dut.dtlb_hit_i.eq(0)
+    yield dut.dtlb_vaddr_i.eq(0x400000011)
+
+    yield
+    yield dut.req_port_i.data_gnt.eq(1)
+    yield
+    yield
+    yield
+    yield
+
+    # instruction lookup
+    yield dut.en_ld_st_translation_i.eq(0)
+    yield dut.enable_translation_i.eq(1)
+    yield dut.asid_i.eq(1)
+
+    yield dut.itlb_access_i.eq(1)
+    yield dut.itlb_hit_i.eq(0)
+    yield dut.itlb_vaddr_i.eq(0x800000)
+
+    yield
+    yield
+    yield
+
+    yield dut.itlb_access_i.eq(1)
+    yield dut.itlb_hit_i.eq(0)
+    yield dut.itlb_vaddr_i.eq(0x200000)
+
+    yield
+    yield
+    yield
+
+    yield dut.req_port_i.data_gnt.eq(0)
+    yield dut.itlb_access_i.eq(1)
+    yield dut.itlb_hit_i.eq(0)
+    yield dut.itlb_vaddr_i.eq(0x800011)
+
+    yield
+    yield dut.req_port_i.data_gnt.eq(1)
+    yield
+    yield
+
+    yield
+
+
+def test_ptw():
+    dut = PTW()
+    run_simulation(dut, tbench(dut), vcd_name="test_ptw.vcd")
+    print("PTW Unit Test Success")
+
+
+if __name__ == "__main__":
+    test_ptw()
diff --git a/unused_please_ignore_completely/TLB/ariane/test/test_tlb.py b/unused_please_ignore_completely/TLB/ariane/test/test_tlb.py
new file mode 100644 (file)
index 0000000..e1b17b8
--- /dev/null
@@ -0,0 +1,67 @@
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.ariane.tlb import TLB
+
+
+def set_vaddr(addr):
+    yield dut.lu_vaddr_i.eq(addr)
+    yield dut.update_i.vpn.eq(addr >> 12)
+
+
+def tbench(dut):
+    yield dut.lu_access_i.eq(1)
+    yield dut.lu_asid_i.eq(1)
+    yield dut.update_i.valid.eq(1)
+    yield dut.update_i.is_1G.eq(0)
+    yield dut.update_i.is_2M.eq(0)
+    yield dut.update_i.asid.eq(1)
+    yield dut.update_i.content.ppn.eq(0)
+    yield dut.update_i.content.rsw.eq(0)
+    yield dut.update_i.content.r.eq(1)
+
+    yield
+
+    addr = 0x80000
+    yield from set_vaddr(addr)
+    yield
+
+    addr = 0x90001
+    yield from set_vaddr(addr)
+    yield
+
+    addr = 0x28000000
+    yield from set_vaddr(addr)
+    yield
+
+    addr = 0x28000001
+    yield from set_vaddr(addr)
+
+    addr = 0x28000001
+    yield from set_vaddr(addr)
+    yield
+
+    addr = 0x1000040000
+    yield from set_vaddr(addr)
+    yield
+
+    addr = 0x1000040001
+    yield from set_vaddr(addr)
+    yield
+
+    yield dut.update_i.is_1G.eq(1)
+    addr = 0x2040000
+    yield from set_vaddr(addr)
+    yield
+
+    yield dut.update_i.is_1G.eq(1)
+    addr = 0x2040001
+    yield from set_vaddr(addr)
+    yield
+
+    yield
+
+
+if __name__ == "__main__":
+    dut = TLB()
+    run_simulation(dut, tbench(dut), vcd_name="test_tlb.vcd")
+    print("TLB Unit Test Success")
diff --git a/unused_please_ignore_completely/TLB/ariane/test/test_tlb_content.py b/unused_please_ignore_completely/TLB/ariane/test/test_tlb_content.py
new file mode 100644 (file)
index 0000000..1bc60d8
--- /dev/null
@@ -0,0 +1,63 @@
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.ariane.tlb_content import TLBContent
+from soc.TestUtil.test_helper import assert_op, assert_eq
+
+
+def update(dut, a, t, g, m):
+    yield dut.replace_en_i.eq(1)
+    yield dut.update_i.valid.eq(1)
+    yield dut.update_i.is_512G.eq(t)
+    yield dut.update_i.is_1G.eq(g)
+    yield dut.update_i.is_2M.eq(m)
+    yield dut.update_i.vpn.eq(a)
+    yield
+    yield
+
+
+def check_hit(dut, hit, pagesize):
+    hit_d = yield dut.lu_hit_o
+    assert_eq("hit", hit_d, hit)
+
+    if(hit):
+        if(pagesize == "t"):
+            hitp = yield dut.lu_is_512G_o
+            assert_eq("lu_is_512G_o", hitp, 1)
+        elif(pagesize == "g"):
+            hitp = yield dut.lu_is_1G_o
+            assert_eq("lu_is_1G_o", hitp, 1)
+        elif(pagesize == "m"):
+            hitp = yield dut.lu_is_2M_o
+            assert_eq("lu_is_2M_o", hitp, 1)
+
+
+def addr(a, b, c, d):
+    return a | b << 9 | c << 18 | d << 27
+
+
+def tbench(dut):
+    yield dut.vpn0.eq(0x0A)
+    yield dut.vpn1.eq(0x0B)
+    yield dut.vpn2.eq(0x0C)
+    yield dut.vpn3.eq(0x0D)
+    yield from update(dut, addr(0xFF, 0xFF, 0xFF, 0x0D), 1, 0, 0)
+    yield from check_hit(dut, 1, "t")
+
+    yield from update(dut, addr(0xFF, 0xFF, 0x0C, 0x0D), 0, 1, 0)
+    yield from check_hit(dut, 1, "g")
+
+    yield from update(dut, addr(0xFF, 0x0B, 0x0C, 0x0D), 0, 0, 1)
+    yield from check_hit(dut, 1, "m")
+
+    yield from update(dut, addr(0x0A, 0x0B, 0x0C, 0x0D), 0, 0, 0)
+    yield from check_hit(dut, 1, "")
+
+    yield from update(dut, addr(0xAA, 0xBB, 0xCC, 0xDD), 0, 0, 0)
+    yield from check_hit(dut, 0, "miss")
+
+
+if __name__ == "__main__":
+    dut = TLBContent(4, 4)
+    #
+    run_simulation(dut, tbench(dut), vcd_name="test_tlb_content.vcd")
+    print("TLBContent Unit Test Success")
diff --git a/unused_please_ignore_completely/TLB/ariane/tlb.py b/unused_please_ignore_completely/TLB/ariane/tlb.py
new file mode 100644 (file)
index 0000000..72b67a2
--- /dev/null
@@ -0,0 +1,176 @@
+"""
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+# http:#solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Author: David Schaffenrath, TU Graz
+# Author: Florian Zaruba, ETH Zurich
+# Date: 21.4.2017
+# Description: Translation Lookaside Buffer, SV48
+#              fully set-associative
+
+Implementation in c++:
+https://raw.githubusercontent.com/Tony-Hu/TreePLRU/master/TreePLRU.cpp
+
+Text description:
+https://people.cs.clemson.edu/~mark/464/p_lru.txt
+
+Online simulator:
+http://www.ntu.edu.sg/home/smitha/ParaCache/Paracache/vm.html
+"""
+from math import log2
+from nmigen import Signal, Module, Cat, Const, Array, Elaboratable
+from nmigen.cli import verilog, rtlil
+from nmigen.lib.coding import Encoder
+
+from soc.TLB.ariane.ptw import TLBUpdate, PTE, ASID_WIDTH
+from soc.TLB.ariane.plru import PLRU
+from soc.TLB.ariane.tlb_content import TLBContent
+
+TLB_ENTRIES = 8
+
+
+class TLB(Elaboratable):
+    def __init__(self, tlb_entries=8, asid_width=8):
+        self.tlb_entries = tlb_entries
+        self.asid_width = asid_width
+
+        self.flush_i = Signal()  # Flush signal
+        # Lookup signals
+        self.lu_access_i = Signal()
+        self.lu_asid_i = Signal(self.asid_width)
+        self.lu_vaddr_i = Signal(64)
+        self.lu_content_o = PTE()
+        self.lu_is_2M_o = Signal()
+        self.lu_is_1G_o = Signal()
+        self.lu_is_512G_o = Signal()
+        self.lu_hit_o = Signal()
+        # Update TLB
+        self.pte_width = len(self.lu_content_o.flatten())
+        self.update_i = TLBUpdate(asid_width)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        vpn3 = Signal(9)  # FIXME unused signal
+        vpn2 = Signal(9)
+        vpn1 = Signal(9)
+        vpn0 = Signal(9)
+
+        # -------------
+        # Translation
+        # -------------
+
+        # SV48 defines four levels of page tables
+        m.d.comb += [vpn0.eq(self.lu_vaddr_i[12:21]),
+                     vpn1.eq(self.lu_vaddr_i[21:30]),
+                     vpn2.eq(self.lu_vaddr_i[30:39]),
+                     vpn3.eq(self.lu_vaddr_i[39:48]),  # FIXME
+                     ]
+
+        tc = []
+        for i in range(self.tlb_entries):
+            tlc = TLBContent(self.pte_width, self.asid_width)
+            setattr(m.submodules, "tc%d" % i, tlc)
+            tc.append(tlc)
+            # connect inputs
+            tlc.update_i = self.update_i     # saves a lot of graphviz links
+            m.d.comb += [tlc.vpn0.eq(vpn0),
+                         tlc.vpn1.eq(vpn1),
+                         tlc.vpn2.eq(vpn2),
+                         # TODO 4th
+                         tlc.flush_i.eq(self.flush_i),
+                         # tlc.update_i.eq(self.update_i),
+                         tlc.lu_asid_i.eq(self.lu_asid_i)]
+        tc = Array(tc)
+
+        # --------------
+        # Select hit
+        # --------------
+
+        # use Encoder to select hit index
+        # XXX TODO: assert that there's only one valid entry (one lu_hit)
+        hitsel = Encoder(self.tlb_entries)
+        m.submodules.hitsel = hitsel
+
+        hits = []
+        for i in range(self.tlb_entries):
+            hits.append(tc[i].lu_hit_o)
+        m.d.comb += hitsel.i.eq(Cat(*hits))  # (goes into plru as well)
+        idx = hitsel.o
+
+        active = Signal(reset_less=True)
+        m.d.comb += active.eq(~hitsel.n)
+        with m.If(active):
+            # active hit, send selected as output
+            m.d.comb += [self.lu_is_512G_o.eq(tc[idx].lu_is_512G_o),
+                         self.lu_is_1G_o.eq(tc[idx].lu_is_1G_o),
+                         self.lu_is_2M_o.eq(tc[idx].lu_is_2M_o),
+                         self.lu_hit_o.eq(1),
+                         self.lu_content_o.flatten().eq(tc[idx].lu_content_o),
+                         ]
+
+        # --------------
+        # PLRU.
+        # --------------
+
+        p = PLRU(self.tlb_entries)
+        plru_tree = Signal(p.TLBSZ)
+        m.submodules.plru = p
+
+        # connect PLRU inputs/outputs
+        # XXX TODO: assert that there's only one valid entry (one replace_en)
+        en = []
+        for i in range(self.tlb_entries):
+            en.append(tc[i].replace_en_i)
+        m.d.comb += [Cat(*en).eq(p.replace_en_o),  # output from PLRU into tags
+                     p.lu_hit.eq(hitsel.i),
+                     p.lu_access_i.eq(self.lu_access_i),
+                     p.plru_tree.eq(plru_tree)]
+        m.d.sync += plru_tree.eq(p.plru_tree_o)
+
+        # --------------
+        # Sanity checks
+        # --------------
+
+        assert (self.tlb_entries % 2 == 0) and (self.tlb_entries > 1), \
+            "TLB size must be a multiple of 2 and greater than 1"
+        assert (self.asid_width >= 1), \
+            "ASID width must be at least 1"
+
+        return m
+
+        """
+        # Just for checking
+        function int countSetBits(logic[self.tlb_entries-1:0] vector);
+          automatic int count = 0;
+          foreach (vector[idx]) begin
+            count += vector[idx];
+          end
+          return count;
+        endfunction
+
+        assert property (@(posedge clk_i)(countSetBits(lu_hit) <= 1))
+          else $error("More then one hit in TLB!"); $stop(); end
+        assert property (@(posedge clk_i)(countSetBits(replace_en) <= 1))
+          else $error("More then one TLB entry selected for next replace!");
+        """
+
+    def ports(self):
+        return [self.flush_i, self.lu_access_i,
+                self.lu_asid_i, self.lu_vaddr_i,
+                self.lu_is_2M_o, self.lu_1G_o, self.lu_is_512G_o, self.lu_hit_o
+                ] + self.lu_content_o.ports() + self.update_i.ports()
+
+
+if __name__ == '__main__':
+    tlb = TLB()
+    vl = rtlil.convert(tlb, ports=tlb.ports())
+    with open("test_tlb.il", "w") as f:
+        f.write(vl)
diff --git a/unused_please_ignore_completely/TLB/ariane/tlb_content.py b/unused_please_ignore_completely/TLB/ariane/tlb_content.py
new file mode 100644 (file)
index 0000000..bfd17c1
--- /dev/null
@@ -0,0 +1,143 @@
+from nmigen import Signal, Module, Cat, Const, Elaboratable
+
+from soc.TLB.ariane.ptw import TLBUpdate, PTE
+
+
+class TLBEntry:
+    def __init__(self, asid_width):
+        self.asid = Signal(asid_width, name="ent_asid")
+        # SV48 defines four levels of page tables
+        self.vpn0 = Signal(9, name="ent_vpn0")
+        self.vpn1 = Signal(9, name="ent_vpn1")
+        self.vpn2 = Signal(9, name="ent_vpn2")
+        self.vpn3 = Signal(9, name="ent_vpn3")
+        self.is_2M = Signal(name="ent_is_2M")
+        self.is_1G = Signal(name="ent_is_1G")
+        self.is_512G = Signal(name="ent_is_512G")
+        self.valid = Signal(name="ent_valid")
+
+    def flatten(self):
+        return Cat(*self.ports())
+
+    def eq(self, x):
+        return self.flatten().eq(x.flatten())
+
+    def ports(self):
+        return [self.asid, self.vpn0, self.vpn1, self.vpn2,
+                self.is_2M, self.is_1G, self.valid]
+
+
+class TLBContent(Elaboratable):
+    def __init__(self, pte_width, asid_width):
+        self.asid_width = asid_width
+        self.pte_width = pte_width
+        self.flush_i = Signal()  # Flush signal
+        # Update TLB
+        self.update_i = TLBUpdate(asid_width)
+        self.vpn3 = Signal(9)
+        self.vpn2 = Signal(9)
+        self.vpn1 = Signal(9)
+        self.vpn0 = Signal(9)
+        self.replace_en_i = Signal()  # replace the following entry,
+        # set by replacement strategy
+        # Lookup signals
+        self.lu_asid_i = Signal(asid_width)
+        self.lu_content_o = Signal(pte_width)
+        self.lu_is_512G_o = Signal()
+        self.lu_is_2M_o = Signal()
+        self.lu_is_1G_o = Signal()
+        self.lu_hit_o = Signal()
+
+    def elaborate(self, platform):
+        m = Module()
+
+        tags = TLBEntry(self.asid_width)
+
+        content = Signal(self.pte_width)
+
+        m.d.comb += [self.lu_hit_o.eq(0),
+                     self.lu_is_512G_o.eq(0),
+                     self.lu_is_2M_o.eq(0),
+                     self.lu_is_1G_o.eq(0)]
+
+        # temporaries for lookup
+        asid_ok = Signal(reset_less=True)
+        # tags_ok = Signal(reset_less=True)
+
+        vpn3_ok = Signal(reset_less=True)
+        vpn2_ok = Signal(reset_less=True)
+        vpn1_ok = Signal(reset_less=True)
+        vpn0_ok = Signal(reset_less=True)
+
+        #tags_2M = Signal(reset_less=True)
+        vpn0_or_2M = Signal(reset_less=True)
+
+        m.d.comb += [
+            # compare asid and vpn*
+            asid_ok.eq(tags.asid == self.lu_asid_i),
+            vpn3_ok.eq(tags.vpn3 == self.vpn3),
+            vpn2_ok.eq(tags.vpn2 == self.vpn2),
+            vpn1_ok.eq(tags.vpn1 == self.vpn1),
+            vpn0_ok.eq(tags.vpn0 == self.vpn0),
+            vpn0_or_2M.eq(tags.is_2M | vpn0_ok)
+        ]
+
+        with m.If(asid_ok & tags.valid):
+            # first level, only vpn3 needs to match
+            with m.If(tags.is_512G & vpn3_ok):
+                m.d.comb += [self.lu_content_o.eq(content),
+                             self.lu_is_512G_o.eq(1),
+                             self.lu_hit_o.eq(1),
+                             ]
+            # second level , second level vpn2 and vpn3 need to match
+            with m.Elif(tags.is_1G & vpn2_ok & vpn3_ok):
+                m.d.comb += [self.lu_content_o.eq(content),
+                             self.lu_is_1G_o.eq(1),
+                             self.lu_hit_o.eq(1),
+                             ]
+            # not a giga page hit nor a tera page hit so check further
+            with m.Elif(vpn1_ok):
+                # this could be a 2 mega page hit or a 4 kB hit
+                # output accordingly
+                with m.If(vpn0_or_2M):
+                    m.d.comb += [self.lu_content_o.eq(content),
+                                 self.lu_is_2M_o.eq(tags.is_2M),
+                                 self.lu_hit_o.eq(1),
+                                 ]
+        # ------------------
+        # Update or Flush
+        # ------------------
+
+        # temporaries
+        replace_valid = Signal(reset_less=True)
+        m.d.comb += replace_valid.eq(self.update_i.valid & self.replace_en_i)
+
+        # flush
+        with m.If(self.flush_i):
+            # invalidate (flush) conditions: all if zero or just this ASID
+            with m.If(self.lu_asid_i == Const(0, self.asid_width) |
+                      (self.lu_asid_i == tags.asid)):
+                m.d.sync += tags.valid.eq(0)
+
+        # normal replacement
+        with m.Elif(replace_valid):
+            m.d.sync += [  # update tag array
+                tags.asid.eq(self.update_i.asid),
+                tags.vpn3.eq(self.update_i.vpn[27:36]),
+                tags.vpn2.eq(self.update_i.vpn[18:27]),
+                tags.vpn1.eq(self.update_i.vpn[9:18]),
+                tags.vpn0.eq(self.update_i.vpn[0:9]),
+                tags.is_512G.eq(self.update_i.is_512G),
+                tags.is_1G.eq(self.update_i.is_1G),
+                tags.is_2M.eq(self.update_i.is_2M),
+                tags.valid.eq(1),
+                # and content as well
+                content.eq(self.update_i.content.flatten())
+            ]
+        return m
+
+    def ports(self):
+        return [self.flush_i,
+                self.lu_asid_i,
+                self.lu_is_2M_o, self.lu_is_1G_o, self.lu_is_512G_o, self.lu_hit_o,
+                ] + self.update_i.content.ports() + self.update_i.ports()
diff --git a/unused_please_ignore_completely/TLB/test/__init__.py b/unused_please_ignore_completely/TLB/test/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/unused_please_ignore_completely/TLB/test/test_LFSR2.py b/unused_please_ignore_completely/TLB/test/test_LFSR2.py
new file mode 100644 (file)
index 0000000..33208f8
--- /dev/null
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# See Notices.txt for copyright information
+from soc.TLB.LFSR import LFSR, LFSRPolynomial, LFSR_POLY_3
+
+from nmigen.back.pysim import Simulator, Delay, Tick
+import unittest
+
+
+class TestLFSR(unittest.TestCase):
+    def test_poly(self):
+        v = LFSRPolynomial()
+        self.assertEqual(repr(v), "LFSRPolynomial([0])")
+        self.assertEqual(str(v), "1")
+        v = LFSRPolynomial([1])
+        self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
+        self.assertEqual(str(v), "x + 1")
+        v = LFSRPolynomial([0, 1])
+        self.assertEqual(repr(v), "LFSRPolynomial([1, 0])")
+        self.assertEqual(str(v), "x + 1")
+        v = LFSRPolynomial([1, 2])
+        self.assertEqual(repr(v), "LFSRPolynomial([2, 1, 0])")
+        self.assertEqual(str(v), "x^2 + x + 1")
+        v = LFSRPolynomial([2])
+        self.assertEqual(repr(v), "LFSRPolynomial([2, 0])")
+        self.assertEqual(str(v), "x^2 + 1")
+        self.assertEqual(str(LFSR_POLY_3), "x^3 + x^2 + 1")
+
+    def test_lfsr_3(self):
+        module = LFSR(LFSR_POLY_3)
+        traces = [module.state, module.enable]
+        with Simulator(module,
+                       vcd_file=open("Waveforms/test_LFSR2.vcd", "w"),
+                       gtkw_file=open("Waveforms/test_LFSR2.gtkw", "w"),
+                       traces=traces) as sim:
+            sim.add_clock(1e-6, phase=0.25e-6)
+            delay = Delay(1e-7)
+
+            def async_process():
+                yield module.enable.eq(0)
+                yield Tick()
+                self.assertEqual((yield module.state), 0x1)
+                yield Tick()
+                self.assertEqual((yield module.state), 0x1)
+                yield module.enable.eq(1)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x2)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x5)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x3)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x7)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x6)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x4)
+                yield Tick()
+                yield delay
+                self.assertEqual((yield module.state), 0x1)
+                yield Tick()
+
+            sim.add_process(async_process)
+            sim.run()
diff --git a/unused_please_ignore_completely/TLB/test/test_address_encoder.py b/unused_please_ignore_completely/TLB/test/test_address_encoder.py
new file mode 100644 (file)
index 0000000..70d435d
--- /dev/null
@@ -0,0 +1,116 @@
+from nmigen.compat.sim import run_simulation
+from soc.TLB.AddressEncoder import AddressEncoder
+from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
+
+
+# This function allows for the easy setting of values to the AddressEncoder
+# Arguments:
+#   dut: The AddressEncoder being tested
+#   i (Input): The array of single bits to be written
+def set_encoder(dut, i):
+    yield dut.i.eq(i)
+    yield
+
+# Checks the single match of the AddressEncoder
+# Arguments:
+#   dut: The AddressEncoder being tested
+#   sm (Single Match): The expected match result
+#   op (Operation): (0 => ==), (1 => !=)
+
+
+def check_single_match(dut, sm, op):
+    out_sm = yield dut.single_match
+    assert_op("Single Match", out_sm, sm, op)
+
+# Checks the multiple match of the AddressEncoder
+# Arguments:
+#   dut: The AddressEncoder being tested
+#   mm (Multiple Match): The expected match result
+#   op (Operation): (0 => ==), (1 => !=)
+
+
+def check_multiple_match(dut, mm, op):
+    out_mm = yield dut.multiple_match
+    assert_op("Multiple Match", out_mm, mm, op)
+
+# Checks the output of the AddressEncoder
+# Arguments:
+#   dut: The AddressEncoder being tested
+#   o (Output): The expected output
+#   op (Operation): (0 => ==), (1 => !=)
+
+
+def check_output(dut, o, op):
+    out_o = yield dut.o
+    assert_op("Output", out_o, o, op)
+
+# Checks the state of the AddressEncoder
+# Arguments:
+#   dut: The AddressEncoder being tested
+#   sm (Single Match): The expected match result
+#   mm (Multiple Match): The expected match result
+#   o (Output): The expected output
+#   ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+#   mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+#   o_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+
+
+def check_all(dut, sm, mm, o, sm_op, mm_op, o_op):
+    yield from check_single_match(dut, sm, sm_op)
+    yield from check_multiple_match(dut, mm, mm_op)
+    yield from check_output(dut, o, o_op)
+
+
+def tbench(dut):
+    # Check invalid input
+    in_val = 0b000
+    single_match = 0
+    multiple_match = 0
+    output = 0
+    yield from set_encoder(dut, in_val)
+    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+    # Check single bit
+    in_val = 0b001
+    single_match = 1
+    multiple_match = 0
+    output = 0
+    yield from set_encoder(dut, in_val)
+    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+    # Check another single bit
+    in_val = 0b100
+    single_match = 1
+    multiple_match = 0
+    output = 2
+    yield from set_encoder(dut, in_val)
+    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+    # Check multiple match
+    # We expected the lowest bit to be returned which is address 0
+    in_val = 0b101
+    single_match = 0
+    multiple_match = 1
+    output = 0
+    yield from set_encoder(dut, in_val)
+    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+    # Check another multiple match
+    # We expected the lowest bit to be returned which is address 1
+    in_val = 0b110
+    single_match = 0
+    multiple_match = 1
+    output = 1
+    yield from set_encoder(dut, in_val)
+    yield from check_all(dut, single_match, multiple_match, output, 0, 0, 0)
+
+
+def test_addr():
+    dut = AddressEncoder(4)
+    run_simulation(dut, tbench(dut),
+                   vcd_name="Waveforms/test_address_encoder.vcd")
+    print("AddressEncoder Unit Test Success")
+
+
+if __name__ == "__main__":
+    test_addr()
diff --git a/unused_please_ignore_completely/TLB/test/test_cam.py b/unused_please_ignore_completely/TLB/test/test_cam.py
new file mode 100644 (file)
index 0000000..d11cd97
--- /dev/null
@@ -0,0 +1,218 @@
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.Cam import Cam
+
+from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
+
+# This function allows for the easy setting of values to the Cam
+# Arguments:
+#   dut: The Cam being tested
+#   e (Enable): Whether the block is going to be enabled
+#   we (Write Enable): Whether the Cam will write on the next cycle
+#   a (Address): Where the data will be written if write enable is high
+#   d (Data): Either what we are looking for or will write to the address
+
+
+def set_cam(dut, e, we, a, d):
+    yield dut.enable.eq(e)
+    yield dut.write_enable.eq(we)
+    yield dut.address_in.eq(a)
+    yield dut.data_in.eq(d)
+    yield
+
+# Checks the multiple match of the Cam
+# Arguments:
+#   dut: The Cam being tested
+#   mm (Multiple Match): The expected match result
+#   op (Operation): (0 => ==), (1 => !=)
+
+
+def check_multiple_match(dut, mm, op):
+    out_mm = yield dut.multiple_match
+    assert_op("Multiple Match", out_mm, mm, op)
+
+# Checks the single match of the Cam
+# Arguments:
+#   dut: The Cam being tested
+#   sm (Single Match): The expected match result
+#   op (Operation): (0 => ==), (1 => !=)
+
+
+def check_single_match(dut, sm, op):
+    out_sm = yield dut.single_match
+    assert_op("Single Match", out_sm, sm, op)
+
+# Checks the address output of the Cam
+# Arguments:
+#   dut: The Cam being tested
+#   ma (Match Address): The expected match result
+#   op (Operation): (0 => ==), (1 => !=)
+
+
+def check_match_address(dut, ma, op):
+    out_ma = yield dut.match_address
+    assert_op("Match Address", out_ma, ma, op)
+
+# Checks the state of the Cam
+# Arguments:
+#   dut: The Cam being tested
+#   sm (Single Match): The expected match result
+#   mm (Multiple Match): The expected match result
+#   ma: (Match Address): The expected address output
+#   ss_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+#   mm_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+#   ma_op (Operation): Operation for the address assertion (0 => ==), (1 => !=)
+
+
+def check_all(dut, mm, sm, ma, mm_op, sm_op, ma_op):
+    yield from check_multiple_match(dut, mm, mm_op)
+    yield from check_single_match(dut, sm, sm_op)
+    yield from check_match_address(dut, ma, ma_op)
+
+
+def tbench(dut):
+    # NA
+    enable = 0
+    write_enable = 0
+    address = 0
+    data = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Read Miss Multiple
+    # Note that the default starting entry data bits are all 0
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 0
+    multiple_match = 1
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_multiple_match(dut, multiple_match, 0)
+
+    # Read Miss
+    # Note that the default starting entry data bits are all 0
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 1
+    multiple_match = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Write Entry 0
+    enable = 1
+    write_enable = 1
+    address = 0
+    data = 4
+    multiple_match = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Read Hit Entry 0
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 4
+    multiple_match = 0
+    single_match = 1
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
+
+    # Search Hit
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 4
+    multiple_match = 0
+    single_match = 1
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
+
+    # Search Miss
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 5
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Multiple Match test
+    # Write Entry 1
+    enable = 1
+    write_enable = 1
+    address = 1
+    data = 5
+    multiple_match = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Write Entry 2
+    # Same data as Entry 1
+    enable = 1
+    write_enable = 1
+    address = 2
+    data = 5
+    multiple_match = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    # Read Hit Data 5
+    enable = 1
+    write_enable = 0
+    address = 1
+    data = 5
+    multiple_match = 1
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_all(dut, multiple_match, single_match, address, 0, 0, 0)
+
+    # Verify read_warning is not caused
+    # Write Entry 0
+    enable = 1
+    write_enable = 1
+    address = 0
+    data = 7
+    multiple_match = 0
+    single_match = 0
+    yield from set_cam(dut, enable, write_enable, address, data)
+    # Note there is no yield we immediately attempt to read in the next cycle
+
+    # Read Hit Data 7
+    enable = 1
+    write_enable = 0
+    address = 0
+    data = 7
+    multiple_match = 0
+    single_match = 1
+    yield from set_cam(dut, enable, write_enable, address, data)
+    yield
+    yield from check_single_match(dut, single_match, 0)
+
+    yield
+
+
+def test_cam():
+    dut = Cam(4, 4)
+    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam.vcd")
+    print("Cam Unit Test Success")
+
+
+if __name__ == "__main__":
+    test_cam()
diff --git a/unused_please_ignore_completely/TLB/test/test_cam_entry.py b/unused_please_ignore_completely/TLB/test/test_cam_entry.py
new file mode 100644 (file)
index 0000000..961445b
--- /dev/null
@@ -0,0 +1,119 @@
+from nmigen.compat.sim import run_simulation
+
+from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
+from soc.TLB.CamEntry import CamEntry
+
+# This function allows for the easy setting of values to the Cam Entry
+# Arguments:
+#   dut: The CamEntry being tested
+#   c (command): NA (0), Read (1), Write (2), Reserve (3)
+#   d (data): The data to be set
+
+
+def set_cam_entry(dut, c, d):
+    # Write desired values
+    yield dut.command.eq(c)
+    yield dut.data_in.eq(d)
+    yield
+    # Reset all lines
+    yield dut.command.eq(0)
+    yield dut.data_in.eq(0)
+    yield
+
+# Checks the data state of the CAM entry
+# Arguments:
+#   dut: The CamEntry being tested
+#   d (Data): The expected data
+#   op (Operation): (0 => ==), (1 => !=)
+
+
+def check_data(dut, d, op):
+    out_d = yield dut.data
+    assert_op("Data", out_d, d, op)
+
+# Checks the match state of the CAM entry
+# Arguments:
+#   dut: The CamEntry being tested
+#   m (Match): The expected match
+#   op (Operation): (0 => ==), (1 => !=)
+
+
+def check_match(dut, m, op):
+    out_m = yield dut.match
+    assert_op("Match", out_m, m, op)
+
+# Checks the state of the CAM entry
+# Arguments:
+#   dut: The CamEntry being tested
+#   d (data): The expected data
+#   m (match): The expected match
+#   d_op (Operation): Operation for the data assertion (0 => ==), (1 => !=)
+#   m_op (Operation): Operation for the match assertion (0 => ==), (1 => !=)
+
+
+def check_all(dut, d, m, d_op, m_op):
+    yield from check_data(dut, d, d_op)
+    yield from check_match(dut, m, m_op)
+
+# This tbench goes through the paces of testing the CamEntry module
+# It is done by writing and then reading various combinations of key/data pairs
+# and reading the results with varying keys to verify the resulting stored
+# data is correct.
+
+
+def tbench(dut):
+    # Check write
+    command = 2
+    data = 1
+    match = 0
+    yield from set_cam_entry(dut, command, data)
+    yield from check_all(dut, data, match, 0, 0)
+
+    # Check read miss
+    command = 1
+    data = 2
+    match = 0
+    yield from set_cam_entry(dut, command, data)
+    yield from check_all(dut, data, match, 1, 0)
+
+    # Check read hit
+    command = 1
+    data = 1
+    match = 1
+    yield from set_cam_entry(dut, command, data)
+    yield from check_all(dut, data, match, 0, 0)
+
+    # Check overwrite
+    command = 2
+    data = 5
+    match = 0
+    yield from set_cam_entry(dut, command, data)
+    yield
+    yield from check_all(dut, data, match, 0, 0)
+
+    # Check read hit
+    command = 1
+    data = 5
+    match = 1
+    yield from set_cam_entry(dut, command, data)
+    yield from check_all(dut, data, match, 0, 0)
+
+    # Check reset
+    command = 3
+    data = 0
+    match = 0
+    yield from set_cam_entry(dut, command, data)
+    yield from check_all(dut, data, match, 0, 0)
+
+    # Extra clock cycle for waveform
+    yield
+
+
+def test_camentry():
+    dut = CamEntry(4)
+    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_cam_entry.vcd")
+    print("CamEntry Unit Test Success")
+
+
+if __name__ == "__main__":
+    test_camentry()
diff --git a/unused_please_ignore_completely/TLB/test/test_permission_validator.py b/unused_please_ignore_completely/TLB/test/test_permission_validator.py
new file mode 100644 (file)
index 0000000..b52b545
--- /dev/null
@@ -0,0 +1,150 @@
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.PermissionValidator import PermissionValidator
+
+from soc.TestUtil.test_helper import assert_op
+
+
+def set_validator(dut, d, xwr, sm, sa, asid):
+    yield dut.data.eq(d)
+    yield dut.xwr.eq(xwr)
+    yield dut.super_mode.eq(sm)
+    yield dut.super_access.eq(sa)
+    yield dut.asid.eq(asid)
+    yield
+
+
+def check_valid(dut, v, op):
+    out_v = yield dut.valid
+    assert_op("Valid", out_v, v, op)
+
+
+def tbench(dut):
+    # 80 bits represented. Ignore the MSB as it will be truncated
+    # ASID is bits first 4 hex values (bits 64 - 78)
+
+    # Test user mode entry valid
+    # Global Bit matching ASID
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000031
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 0
+    super_access = 0
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test user mode entry valid
+    # Global Bit nonmatching ASID
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000031
+    # Ignore MSB it will be truncated
+    asid = 0x7FF6
+    super_mode = 0
+    super_access = 0
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test user mode entry invalid
+    # Global Bit nonmatching ASID
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000021
+    # Ignore MSB it will be truncated
+    asid = 0x7FF6
+    super_mode = 0
+    super_access = 0
+    xwr = 0
+    valid = 0
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test user mode entry valid
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000011
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 0
+    super_access = 0
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test user mode entry invalid
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000011
+    # Ignore MSB it will be truncated
+    asid = 0x7FF6
+    super_mode = 0
+    super_access = 0
+    xwr = 0
+    valid = 0
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test supervisor mode entry valid
+    # The entry is NOT in user mode
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000001
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 1
+    super_access = 0
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test supervisor mode entry invalid
+    # The entry is in user mode
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000011
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 1
+    super_access = 0
+    xwr = 0
+    valid = 0
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test supervisor mode entry valid
+    # The entry is NOT in user mode with access
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000001
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 1
+    super_access = 1
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+    # Test supervisor mode entry valid
+    # The entry is in user mode with access
+    # Ensure that user mode and valid is enabled!
+    data = 0x7FFF0000000000000011
+    # Ignore MSB it will be truncated
+    asid = 0x7FFF
+    super_mode = 1
+    super_access = 1
+    xwr = 0
+    valid = 1
+    yield from set_validator(dut, data, xwr, super_mode, super_access, asid)
+    yield from check_valid(dut, valid, 0)
+
+
+def test_permv():
+    dut = PermissionValidator(15, 64)
+    run_simulation(dut, tbench(
+        dut), vcd_name="Waveforms/test_permission_validator.vcd")
+    print("PermissionValidator Unit Test Success")
+
+
+if __name__ == "__main__":
+    test_permv()
diff --git a/unused_please_ignore_completely/TLB/test/test_pte_entry.py b/unused_please_ignore_completely/TLB/test/test_pte_entry.py
new file mode 100644 (file)
index 0000000..51b3dcf
--- /dev/null
@@ -0,0 +1,114 @@
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.PteEntry import PteEntry
+
+from soc.TestUtil.test_helper import assert_op
+
+
+def set_entry(dut, i):
+    yield dut.i.eq(i)
+    yield
+
+
+def check_dirty(dut, d, op):
+    out_d = yield dut.d
+    assert_op("Dirty", out_d, d, op)
+
+
+def check_accessed(dut, a, op):
+    out_a = yield dut.a
+    assert_op("Accessed", out_a, a, op)
+
+
+def check_global(dut, o, op):
+    out = yield dut.g
+    assert_op("Global", out, o, op)
+
+
+def check_user(dut, o, op):
+    out = yield dut.u
+    assert_op("User Mode", out, o, op)
+
+
+def check_xwr(dut, o, op):
+    out = yield dut.xwr
+    assert_op("XWR", out, o, op)
+
+
+def check_asid(dut, o, op):
+    out = yield dut.asid
+    assert_op("ASID", out, o, op)
+
+
+def check_pte(dut, o, op):
+    out = yield dut.pte
+    assert_op("ASID", out, o, op)
+
+
+def check_valid(dut, v, op):
+    out_v = yield dut.v
+    assert_op("Valid", out_v, v, op)
+
+
+def check_all(dut, d, a, g, u, xwr, v, asid, pte):
+    yield from check_dirty(dut, d, 0)
+    yield from check_accessed(dut, a, 0)
+    yield from check_global(dut, g, 0)
+    yield from check_user(dut, u, 0)
+    yield from check_xwr(dut, xwr, 0)
+    yield from check_asid(dut, asid, 0)
+    yield from check_pte(dut, pte, 0)
+    yield from check_valid(dut, v, 0)
+
+
+def tbench(dut):
+    # 80 bits represented. Ignore the MSB as it will be truncated
+    # ASID is bits first 4 hex values (bits 64 - 78)
+
+    i = 0x7FFF0000000000000031
+    dirty = 0
+    access = 0
+    glob = 1
+    user = 1
+    xwr = 0
+    valid = 1
+    asid = 0x7FFF
+    pte = 0x0000000000000031
+    yield from set_entry(dut, i)
+    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
+
+    i = 0x0FFF00000000000000FF
+    dirty = 1
+    access = 1
+    glob = 1
+    user = 1
+    xwr = 7
+    valid = 1
+    asid = 0x0FFF
+    pte = 0x00000000000000FF
+    yield from set_entry(dut, i)
+    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
+
+    i = 0x0721000000001100001F
+    dirty = 0
+    access = 0
+    glob = 0
+    user = 1
+    xwr = 7
+    valid = 1
+    asid = 0x0721
+    pte = 0x000000001100001F
+    yield from set_entry(dut, i)
+    yield from check_all(dut, dirty, access, glob, user, xwr, valid, asid, pte)
+
+    yield
+
+
+def test_pteentry():
+    dut = PteEntry(15, 64)
+    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_pte_entry.vcd")
+    print("PteEntry Unit Test Success")
+
+
+if __name__ == "__main__":
+    test_pteentry()
diff --git a/unused_please_ignore_completely/TLB/test/test_set_associative_cache.py b/unused_please_ignore_completely/TLB/test/test_set_associative_cache.py
new file mode 100644 (file)
index 0000000..edec055
--- /dev/null
@@ -0,0 +1,43 @@
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.SetAssociativeCache import SetAssociativeCache
+
+from soc.TestUtil.test_helper import assert_eq, assert_ne, assert_op
+
+
+def set_sac(dut, e, c, s, t, d):
+    yield dut.enable.eq(e)
+    yield dut.command.eq(c)
+    yield dut.cset.eq(s)
+    yield dut.tag.eq(t)
+    yield dut.data_i.eq(d)
+    yield
+
+
+def tbench(dut):
+    enable = 1
+    command = 2
+    cset = 1
+    tag = 2
+    data = 3
+    yield from set_sac(dut, enable, command, cset, tag, data)
+    yield
+
+    enable = 1
+    command = 2
+    cset = 1
+    tag = 5
+    data = 8
+    yield from set_sac(dut, enable, command, cset, tag, data)
+    yield
+
+
+def test_assoc_cache():
+    dut = SetAssociativeCache(4, 4, 4, 4)
+    run_simulation(dut, tbench(
+        dut), vcd_name="Waveforms/test_set_associative_cache.vcd")
+    print("Set Associative Cache Unit Test Success")
+
+
+if __name__ == "__main__":
+    test_assoc_cache()
diff --git a/unused_please_ignore_completely/TLB/test/test_tlb.py b/unused_please_ignore_completely/TLB/test/test_tlb.py
new file mode 100644 (file)
index 0000000..3865662
--- /dev/null
@@ -0,0 +1,86 @@
+#import tracemalloc
+# tracemalloc.start()
+
+from nmigen.compat.sim import run_simulation
+
+from soc.TLB.TLB import TLB
+
+from soc.TestUtil.test_helper import assert_op, assert_eq
+
+# self.supermode = Signal(1) # Supervisor Mode
+# self.super_access = Signal(1) # Supervisor Access
+# self.command = Signal(2) # 00=None, 01=Search, 10=Write L1, 11=Write L2
+# self.xwr = Signal(3) # Execute, Write, Read
+# self.mode = Signal(4) # 4 bits for access to Sv48 on Rv64
+#self.address_L1 = Signal(range(L1_size))
+# self.asid = Signal(asid_size) # Address Space IDentifier (ASID)
+# self.vma = Signal(vma_size) # Virtual Memory Address (VMA)
+# self.pte_in = Signal(pte_size) # To be saved Page Table Entry (PTE)
+#
+# self.hit = Signal(1) # Denotes if the VMA had a mapped PTE
+# self.perm_valid = Signal(1) # Denotes if the permissions are correct
+# self.pte_out = Signal(pte_size) # PTE that was mapped to by the VMA
+
+COMMAND_READ = 1
+COMMAND_WRITE_L1 = 2
+
+# Checks the data state of the CAM entry
+# Arguments:
+#   dut: The CamEntry being tested
+#   d (Data): The expected data
+#   op (Operation): (0 => ==), (1 => !=)
+
+
+def check_hit(dut, d):
+    hit_d = yield dut.hit
+    #assert_eq("hit", hit_d, d)
+
+
+def tst_command(dut, cmd, xwr, cycles):
+    yield dut.command.eq(cmd)
+    yield dut.xwr.eq(xwr)
+    for i in range(0, cycles):
+        yield
+
+
+def tst_write_L1(dut, vma, address_L1, asid, pte_in):
+    yield dut.address_L1.eq(address_L1)
+    yield dut.asid.eq(asid)
+    yield dut.vma.eq(vma)
+    yield dut.pte_in.eq(pte_in)
+    yield from tst_command(dut, COMMAND_WRITE_L1, 7, 2)
+
+
+def tst_search(dut, vma, found):
+    yield dut.vma.eq(vma)
+    yield from tst_command(dut, COMMAND_READ, 7, 1)
+    yield from check_hit(dut, found)
+
+
+def zero(dut):
+    yield dut.supermode.eq(0)
+    yield dut.super_access.eq(0)
+    yield dut.mode.eq(0)
+    yield dut.address_L1.eq(0)
+    yield dut.asid.eq(0)
+    yield dut.vma.eq(0)
+    yield dut.pte_in.eq(0)
+
+
+def tbench(dut):
+    yield from zero(dut)
+    yield dut.mode.eq(0xF)  # enable TLB
+    # test hit
+    yield from tst_write_L1(dut, 0xFEEDFACE, 0, 0xFFFF, 0xF0F0)
+    yield from tst_search(dut, 0xFEEDFACE, 1)
+    yield from tst_search(dut, 0xFACEFEED, 0)
+
+
+def test_tlb():
+    dut = TLB(15, 36, 64, 8)
+    run_simulation(dut, tbench(dut), vcd_name="Waveforms/test_tlb.vcd")
+    print("TLB Unit Test Success")
+
+
+if __name__ == "__main__":
+    test_tlb()
diff --git a/unused_please_ignore_completely/__init__.py b/unused_please_ignore_completely/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/unused_please_ignore_completely/experiment/l0_cache.py b/unused_please_ignore_completely/experiment/l0_cache.py
new file mode 100644 (file)
index 0000000..3b8a7d9
--- /dev/null
@@ -0,0 +1,42 @@
+class DualPortSplitter(Elaboratable):
+    """DualPortSplitter
+
+    * one incoming PortInterface
+    * two *OUTGOING* PortInterfaces
+    * uses LDSTSplitter to do it
+
+    (actually, thinking about it LDSTSplitter could simply be
+     modified to conform to PortInterface: one in, two out)
+
+    once that is done each pair of ports may be wired directly
+    to the dual ports of L0CacheBuffer
+
+    The split is carried out so that, regardless of alignment or
+    mis-alignment, outgoing PortInterface[0] takes bit 4 == 0
+    of the address, whilst outgoing PortInterface[1] takes
+    bit 4 == 1.
+
+    PortInterface *may* need to be changed so that the length is
+    a binary number (accepting values 1-16).
+    """
+
+    def __init__(self,inp):
+        self.outp = [PortInterface(name="outp_0"),
+                     PortInterface(name="outp_1")]
+        print(self.outp)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        m.submodules.splitter = splitter = LDSTSplitter(64, 48, 4)
+        self.inp = splitter.pi
+        comb += splitter.addr_i.eq(self.inp.addr)  # XXX
+        #comb += splitter.len_i.eq()
+        #comb += splitter.valid_i.eq()
+        comb += splitter.is_ld_i.eq(self.inp.is_ld_i)
+        comb += splitter.is_st_i.eq(self.inp.is_st_i)
+        #comb += splitter.st_data_i.eq()
+        #comb += splitter.sld_valid_i.eq()
+        #comb += splitter.sld_data_i.eq()
+        #comb += splitter.sst_valid_i.eq()
+        return m
diff --git a/unused_please_ignore_completely/iommu/__init__.py b/unused_please_ignore_completely/iommu/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/unused_please_ignore_completely/iommu/axi_rab/__init__.py b/unused_please_ignore_completely/iommu/axi_rab/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi4_ar_buffer.py b/unused_please_ignore_completely/iommu/axi_rab/axi4_ar_buffer.py
new file mode 100644 (file)
index 0000000..1f3a5ff
--- /dev/null
@@ -0,0 +1,135 @@
+# Copyright 2018 ETH Zurich and University of Bologna.
+# Copyright and related rights are licensed under the Solderpad Hardware
+# License, Version 0.51 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+# http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# or agreed to in writing, software, hardware and materials distributed under
+# this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+# module axi4_ar_buffer
+#  #(
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_arid,
+#    input  logic               [31:0] s_axi4_araddr,
+#    input  logic                      s_axi4_arvalid,
+#    output logic                      s_axi4_arready,
+#    input  logic                [7:0] s_axi4_arlen,
+#    input  logic                [2:0] s_axi4_arsize,
+#    input  logic                [1:0] s_axi4_arburst,
+#    input  logic                      s_axi4_arlock,
+#    input  logic                [2:0] s_axi4_arprot,
+#    input  logic                [3:0] s_axi4_arcache,
+#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_arid,
+#    output logic               [31:0] m_axi4_araddr,
+#    output logic                      m_axi4_arvalid,
+#    input  logic                      m_axi4_arready,
+#    output logic                [7:0] m_axi4_arlen,
+#    output logic                [2:0] m_axi4_arsize,
+#    output logic                [1:0] m_axi4_arburst,
+#    output logic                      m_axi4_arlock,
+#    output logic                [2:0] m_axi4_arprot,
+#    output logic                [3:0] m_axi4_arcache,
+#    output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
+#  );
+
+
+class axi4_ar_buffer(Elaboratable):
+
+    def __init__(self):
+        # self.axi4_aclk = Signal() # input
+        # self.axi4_arstn = Signal() # input
+        self.s_axi4_arid = Signal(AXI_ID_WIDTH)  # input
+        self.s_axi4_araddr = Signal(32)  # input
+        self.s_axi4_arvalid = Signal()  # input
+        self.s_axi4_arready = Signal()  # output
+        self.s_axi4_arlen = Signal(8)  # input
+        self.s_axi4_arsize = Signal(3)  # input
+        self.s_axi4_arburst = Signal(2)  # input
+        self.s_axi4_arlock = Signal()  # input
+        self.s_axi4_arprot = Signal(3)  # input
+        self.s_axi4_arcache = Signal(4)  # input
+        self.s_axi4_aruser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_arid = Signal(AXI_ID_WIDTH)  # output
+        self.m_axi4_araddr = Signal(32)  # output
+        self.m_axi4_arvalid = Signal()  # output
+        self.m_axi4_arready = Signal()  # input
+        self.m_axi4_arlen = Signal(8)  # output
+        self.m_axi4_arsize = Signal(3)  # output
+        self.m_axi4_arburst = Signal(2)  # output
+        self.m_axi4_arlock = Signal()  # output
+        self.m_axi4_arprot = Signal(3)  # output
+        self.m_axi4_arcache = Signal(4)  # output
+        self.m_axi4_aruser = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        #  #TODO use record types here
+        #  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_in;
+        #  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+52:0] data_out;
+
+        # assign data_in                                           [3:0] = s_axi4_arcache;
+        # assign data_in                                           [6:4] = s_axi4_arprot;
+        # assign data_in                                             [7] = s_axi4_arlock;
+        # assign data_in                                           [9:8] = s_axi4_arburst;
+        # assign data_in                                         [12:10] = s_axi4_arsize;
+        # assign data_in                                         [20:13] = s_axi4_arlen;
+        # assign data_in                                         [52:21] = s_axi4_araddr;
+        # assign data_in                            [52+AXI_ID_WIDTH:53] = s_axi4_arid;
+        # assign data_in[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH] = s_axi4_aruser;
+        #
+        # assign m_axi4_arcache = data_out[3:0];
+        # assign m_axi4_arprot  = data_out[6:4];
+        # assign m_axi4_arlock  = data_out[7];
+        # assign m_axi4_arburst = data_out[9:8];
+        # assign m_axi4_arsize  = data_out[12:10];
+        # assign m_axi4_arlen   = data_out[20:13];
+        # assign m_axi4_araddr  = data_out[52:21];
+        # assign m_axi4_arid    = data_out[52+AXI_ID_WIDTH:53];
+        # assign m_axi4_aruser  = data_out[52+AXI_ID_WIDTH+AXI_USER_WIDTH:53+AXI_ID_WIDTH];
+
+        # m.d.comb += self.m_axi4_arcache.eq(..)
+        # m.d.comb += self.m_axi4_arprot.eq(..)
+        # m.d.comb += self.m_axi4_arlock.eq(..)
+        # m.d.comb += self.m_axi4_arburst.eq(..)
+        # m.d.comb += self.m_axi4_arsize.eq(..)
+        # m.d.comb += self.m_axi4_arlen.eq(..)
+        # m.d.comb += self.m_axi4_araddr.eq(..)
+        # m.d.comb += self.m_axi4_arid.eq(..)
+        # m.d.comb += self.m_axi4_aruser.eq(..)
+        return m
+
+# TODO convert axi_buffer_rab.sv
+#
+#  axi_buffer_rab
+#    #(
+#      .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+53  ),
+#      .BUFFER_DEPTH ( 4                               )
+#      )
+#    u_buffer
+#    (
+#      .clk       ( axi4_aclk      ),
+#      .rstn      ( axi4_arstn     ),
+#      .valid_out ( m_axi4_arvalid ),
+#      .data_out  ( data_out       ),
+#      .ready_in  ( m_axi4_arready ),
+#      .valid_in  ( s_axi4_arvalid ),
+#      .data_in   ( data_in        ),
+#      .ready_out ( s_axi4_arready )
+#    );
+#
+
+# endmodule
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi4_ar_sender.py b/unused_please_ignore_completely/iommu/axi_rab/axi4_ar_sender.py
new file mode 100644 (file)
index 0000000..4cbd97d
--- /dev/null
@@ -0,0 +1,232 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_ar_sender(Elaboratable):
+
+    def __init__(self):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.l1_done_o = Signal()  # output
+        self.l1_accept_i = Signal()  # input
+        self.l1_drop_i = Signal()  # input
+        self.l1_save_i = Signal()  # input
+        self.l2_done_o = Signal()  # output
+        self.l2_accept_i = Signal()  # input
+        self.l2_drop_i = Signal()  # input
+        self.l2_sending_o = Signal()  # output
+        self.l1_araddr_i = Signal(AXI_ADDR_WIDTH)  # input
+        self.l2_araddr_i = Signal(AXI_ADDR_WIDTH)  # input
+        self.s_axi4_arid = Signal(AXI_ID_WIDTH)  # input
+        self.s_axi4_arvalid = Signal()  # input
+        self.s_axi4_arready = Signal()  # output
+        self.s_axi4_arlen = Signal(8)  # input
+        self.s_axi4_arsize = Signal(3)  # input
+        self.s_axi4_arburst = Signal(2)  # input
+        self.s_axi4_arlock = Signal()  # input
+        self.s_axi4_arprot = Signal(3)  # input
+        self.s_axi4_arcache = Signal(4)  # input
+        self.s_axi4_aruser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_arid = Signal(AXI_ID_WIDTH)  # output
+        self.m_axi4_araddr = Signal(AXI_ADDR_WIDTH)  # output
+        self.m_axi4_arvalid = Signal()  # output
+        self.m_axi4_arready = Signal()  # input
+        self.m_axi4_arlen = Signal(8)  # output
+        self.m_axi4_arsize = Signal(3)  # output
+        self.m_axi4_arburst = Signal(2)  # output
+        self.m_axi4_arlock = Signal()  # output
+        self.m_axi4_arprot = Signal(3)  # output
+        self.m_axi4_arcache = Signal(4)  # output
+        self.m_axi4_aruser = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.d.comb += self.l1_save.eq(self.None)
+        m.d.comb += self.l1_done_o.eq(self.None)
+        m.d.comb += self.m_axi4_arvalid.eq(self.None)
+        m.d.comb += self.s_axi4_arready.eq(self.None)
+        m.d.comb += self.m_axi4_aruser.eq(self.None)
+        m.d.comb += self.m_axi4_arcache.eq(self.None)
+        m.d.comb += self.m_axi4_arprot.eq(self.None)
+        m.d.comb += self.m_axi4_arlock.eq(self.None)
+        m.d.comb += self.m_axi4_arburst.eq(self.None)
+        m.d.comb += self.m_axi4_arsize.eq(self.None)
+        m.d.comb += self.m_axi4_arlen.eq(self.None)
+        m.d.comb += self.m_axi4_araddr.eq(self.None)
+        m.d.comb += self.m_axi4_arid.eq(self.None)
+        m.d.comb += self.l2_sending_o.eq(self.None)
+        m.d.comb += self.l2_sent.eq(self.None)
+        m.d.comb += self.l2_done_o.eq(self.None)
+        m.d.comb += self.m_axi4_aruser.eq(self.s_axi4_aruser)
+        m.d.comb += self.m_axi4_arcache.eq(self.s_axi4_arcache)
+        m.d.comb += self.m_axi4_arprot.eq(self.s_axi4_arprot)
+        m.d.comb += self.m_axi4_arlock.eq(self.s_axi4_arlock)
+        m.d.comb += self.m_axi4_arburst.eq(self.s_axi4_arburst)
+        m.d.comb += self.m_axi4_arsize.eq(self.s_axi4_arsize)
+        m.d.comb += self.m_axi4_arlen.eq(self.s_axi4_arlen)
+        m.d.comb += self.m_axi4_araddr.eq(self.l1_araddr_i)
+        m.d.comb += self.m_axi4_arid.eq(self.s_axi4_arid)
+        m.d.comb += self.l2_sending_o.eq(self.1: 'b0)
+        m.d.comb += self.l2_available_q.eq(self.1: 'b0)
+        m.d.comb += self.l2_done_o.eq(self.1: 'b0)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_ar_sender
+#  #(
+#    parameter AXI_ADDR_WIDTH = 40,
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4,
+#    parameter ENABLE_L2TLB   = 0
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    output logic                      l1_done_o,
+#    input  logic                      l1_accept_i,
+#    input  logic                      l1_drop_i,
+#    input  logic                      l1_save_i,
+#
+#    output logic                      l2_done_o,
+#    input  logic                      l2_accept_i,
+#    input  logic                      l2_drop_i,
+#    output logic                      l2_sending_o,
+#
+#    input  logic [AXI_ADDR_WIDTH-1:0] l1_araddr_i,
+#    input  logic [AXI_ADDR_WIDTH-1:0] l2_araddr_i,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_arid,
+#    input  logic                      s_axi4_arvalid,
+#    output logic                      s_axi4_arready,
+#    input  logic                [7:0] s_axi4_arlen,
+#    input  logic                [2:0] s_axi4_arsize,
+#    input  logic                [1:0] s_axi4_arburst,
+#    input  logic                      s_axi4_arlock,
+#    input  logic                [2:0] s_axi4_arprot,
+#    input  logic                [3:0] s_axi4_arcache,
+#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_aruser,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_arid,
+#    output logic [AXI_ADDR_WIDTH-1:0] m_axi4_araddr,
+#    output logic                      m_axi4_arvalid,
+#    input  logic                      m_axi4_arready,
+#    output logic                [7:0] m_axi4_arlen,
+#    output logic                [2:0] m_axi4_arsize,
+#    output logic                [1:0] m_axi4_arburst,
+#    output logic                      m_axi4_arlock,
+#    output logic                [2:0] m_axi4_arprot,
+#    output logic                [3:0] m_axi4_arcache,
+#    output logic [AXI_USER_WIDTH-1:0] m_axi4_aruser
+#  );
+#
+#  logic l1_save;
+#
+#  logic l2_sent;
+#  logic l2_available_q;
+#
+#  assign l1_save      = l1_save_i & l2_available_q;
+#
+#  assign l1_done_o    = s_axi4_arvalid & s_axi4_arready ;
+#
+#  // if 1: accept and forward a transaction translated by L1
+#  //    2: drop or save request (if L2 slot not occupied already)
+#  assign m_axi4_arvalid = (s_axi4_arvalid & l1_accept_i) |
+#                          l2_sending_o;
+#  assign s_axi4_arready = (m_axi4_arvalid & m_axi4_arready & ~l2_sending_o) |
+#                          (s_axi4_arvalid & (l1_drop_i | l1_save));
+#
+# generate
+#  if (ENABLE_L2TLB == 1) begin
+#    logic [AXI_USER_WIDTH-1:0] l2_axi4_aruser  ;
+#    logic                [3:0] l2_axi4_arcache ;
+#    logic                [3:0] l2_axi4_arregion;
+#    logic                [3:0] l2_axi4_arqos   ;
+#    logic                [2:0] l2_axi4_arprot  ;
+#    logic                      l2_axi4_arlock  ;
+#    logic                [1:0] l2_axi4_arburst ;
+#    logic                [2:0] l2_axi4_arsize  ;
+#    logic                [7:0] l2_axi4_arlen   ;
+#    logic   [AXI_ID_WIDTH-1:0] l2_axi4_arid    ;
+#
+#    assign m_axi4_aruser  = l2_sending_o ? l2_axi4_aruser   : s_axi4_aruser;
+#    assign m_axi4_arcache = l2_sending_o ? l2_axi4_arcache  : s_axi4_arcache;
+#    assign m_axi4_arprot  = l2_sending_o ? l2_axi4_arprot   : s_axi4_arprot;
+#    assign m_axi4_arlock  = l2_sending_o ? l2_axi4_arlock   : s_axi4_arlock;
+#    assign m_axi4_arburst = l2_sending_o ? l2_axi4_arburst  : s_axi4_arburst;
+#    assign m_axi4_arsize  = l2_sending_o ? l2_axi4_arsize   : s_axi4_arsize;
+#    assign m_axi4_arlen   = l2_sending_o ? l2_axi4_arlen    : s_axi4_arlen;
+#    assign m_axi4_araddr  = l2_sending_o ? l2_araddr_i      : l1_araddr_i;
+#    assign m_axi4_arid    = l2_sending_o ? l2_axi4_arid     : s_axi4_arid;
+#
+#    // Buffer AXI signals in case of L1 miss
+#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
+#      if (axi4_arstn == 1'b0) begin
+#        l2_axi4_aruser  <=  'b0;
+#        l2_axi4_arcache <=  'b0;
+#        l2_axi4_arprot  <=  'b0;
+#        l2_axi4_arlock  <= 1'b0;
+#        l2_axi4_arburst <=  'b0;
+#        l2_axi4_arsize  <=  'b0;
+#        l2_axi4_arlen   <=  'b0;
+#        l2_axi4_arid    <=  'b0;
+#      end else if (l1_save) begin
+#        l2_axi4_aruser  <= s_axi4_aruser;
+#        l2_axi4_arcache <= s_axi4_arcache;
+#        l2_axi4_arprot  <= s_axi4_arprot;
+#        l2_axi4_arlock  <= s_axi4_arlock;
+#        l2_axi4_arburst <= s_axi4_arburst;
+#        l2_axi4_arsize  <= s_axi4_arsize;
+#        l2_axi4_arlen   <= s_axi4_arlen;
+#        l2_axi4_arid    <= s_axi4_arid;
+#      end
+#    end
+#
+#    // signal that an l1_save_i can be accepted
+#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
+#      if (axi4_arstn == 1'b0) begin
+#        l2_available_q <= 1'b1;
+#      end else if (l2_sent | l2_drop_i) begin
+#        l2_available_q <= 1'b1;
+#      end else if (l1_save) begin
+#        l2_available_q <= 1'b0;
+#      end
+#    end
+#
+#    assign l2_sending_o = l2_accept_i & ~l2_available_q;
+#    assign l2_sent      = l2_sending_o & m_axi4_arvalid & m_axi4_arready;
+#
+#    // if 1: having sent out a transaction translated by L2
+#    //    2: drop request (L2 slot is available again)
+#    assign l2_done_o    = l2_sent | l2_drop_i;
+#
+#  end else begin // !`ifdef ENABLE_L2TLB
+#    assign m_axi4_aruser  =  s_axi4_aruser;
+#    assign m_axi4_arcache =  s_axi4_arcache;
+#    assign m_axi4_arprot  =  s_axi4_arprot;
+#    assign m_axi4_arlock  =  s_axi4_arlock;
+#    assign m_axi4_arburst =  s_axi4_arburst;
+#    assign m_axi4_arsize  =  s_axi4_arsize;
+#    assign m_axi4_arlen   =  s_axi4_arlen;
+#    assign m_axi4_araddr  =  l1_araddr_i;
+#    assign m_axi4_arid    =  s_axi4_arid;
+#
+#    assign l2_sending_o   = 1'b0;
+#    assign l2_available_q = 1'b0;
+#    assign l2_done_o      = 1'b0;
+#  end // else: !if(ENABLE_L2TLB == 1)
+# endgenerate
+#
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi4_aw_buffer.py b/unused_please_ignore_completely/iommu/axi_rab/axi4_aw_buffer.py
new file mode 100644 (file)
index 0000000..f5ca37d
--- /dev/null
@@ -0,0 +1,157 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_aw_buffer(Elaboratable):
+
+    def __init__(self):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.s_axi4_awid = Signal(AXI_ID_WIDTH)  # input
+        self.s_axi4_awaddr = Signal(32)  # input
+        self.s_axi4_awvalid = Signal()  # input
+        self.s_axi4_awready = Signal()  # output
+        self.s_axi4_awlen = Signal(8)  # input
+        self.s_axi4_awsize = Signal(3)  # input
+        self.s_axi4_awburst = Signal(2)  # input
+        self.s_axi4_awlock = Signal()  # input
+        self.s_axi4_awprot = Signal(3)  # input
+        self.s_axi4_awcache = Signal(4)  # input
+        self.s_axi4_awregion = Signal(4)  # input
+        self.s_axi4_awqos = Signal(4)  # input
+        self.s_axi4_awuser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_awid = Signal(AXI_ID_WIDTH)  # output
+        self.m_axi4_awaddr = Signal(32)  # output
+        self.m_axi4_awvalid = Signal()  # output
+        self.m_axi4_awready = Signal()  # input
+        self.m_axi4_awlen = Signal(8)  # output
+        self.m_axi4_awsize = Signal(3)  # output
+        self.m_axi4_awburst = Signal(2)  # output
+        self.m_axi4_awlock = Signal()  # output
+        self.m_axi4_awprot = Signal(3)  # output
+        self.m_axi4_awcache = Signal(4)  # output
+        self.m_axi4_awregion = Signal(4)  # output
+        self.m_axi4_awqos = Signal(4)  # output
+        self.m_axi4_awuser = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.d.comb += self.None.eq(self.s_axi4_awcache)
+        m.d.comb += self.None.eq(self.s_axi4_awprot)
+        m.d.comb += self.None.eq(self.s_axi4_awlock)
+        m.d.comb += self.None.eq(self.s_axi4_awburst)
+        m.d.comb += self.None.eq(self.s_axi4_awsize)
+        m.d.comb += self.None.eq(self.s_axi4_awlen)
+        m.d.comb += self.None.eq(self.s_axi4_awaddr)
+        m.d.comb += self.None.eq(self.s_axi4_awregion)
+        m.d.comb += self.None.eq(self.s_axi4_awqos)
+        m.d.comb += self.None.eq(self.s_axi4_awid)
+        m.d.comb += self.None.eq(self.s_axi4_awuser)
+        m.d.comb += self.m_axi4_awcache.eq(self.None)
+        m.d.comb += self.m_axi4_awprot.eq(self.None)
+        m.d.comb += self.m_axi4_awlock.eq(self.None)
+        m.d.comb += self.m_axi4_awburst.eq(self.None)
+        m.d.comb += self.m_axi4_awsize.eq(self.None)
+        m.d.comb += self.m_axi4_awlen.eq(self.None)
+        m.d.comb += self.m_axi4_awaddr.eq(self.None)
+        m.d.comb += self.m_axi4_awregion.eq(self.None)
+        m.d.comb += self.m_axi4_awqos.eq(self.None)
+        m.d.comb += self.m_axi4_awid.eq(self.None)
+        m.d.comb += self.m_axi4_awuser.eq(self.None)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_aw_buffer
+#  #(
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_awid,
+#    input  logic               [31:0] s_axi4_awaddr,
+#    input  logic                      s_axi4_awvalid,
+#    output logic                      s_axi4_awready,
+#    input  logic                [7:0] s_axi4_awlen,
+#    input  logic                [2:0] s_axi4_awsize,
+#    input  logic                [1:0] s_axi4_awburst,
+#    input  logic                      s_axi4_awlock,
+#    input  logic                [2:0] s_axi4_awprot,
+#    input  logic                [3:0] s_axi4_awcache,
+#    input  logic                [3:0] s_axi4_awregion,
+#    input  logic                [3:0] s_axi4_awqos,
+#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_awid,
+#    output logic               [31:0] m_axi4_awaddr,
+#    output logic                      m_axi4_awvalid,
+#    input  logic                      m_axi4_awready,
+#    output logic                [7:0] m_axi4_awlen,
+#    output logic                [2:0] m_axi4_awsize,
+#    output logic                [1:0] m_axi4_awburst,
+#    output logic                      m_axi4_awlock,
+#    output logic                [2:0] m_axi4_awprot,
+#    output logic                [3:0] m_axi4_awcache,
+#    output logic                [3:0] m_axi4_awregion,
+#    output logic                [3:0] m_axi4_awqos,
+#    output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
+#  );
+#
+#  wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_in;
+#  wire [AXI_USER_WIDTH+AXI_ID_WIDTH+60:0] data_out;
+#
+#  assign data_in                                            [3:0] = s_axi4_awcache;
+#  assign data_in                                            [6:4] = s_axi4_awprot;
+#  assign data_in                                              [7] = s_axi4_awlock;
+#  assign data_in                                            [9:8] = s_axi4_awburst;
+#  assign data_in                                          [12:10] = s_axi4_awsize;
+#  assign data_in                                          [20:13] = s_axi4_awlen;
+#  assign data_in                                          [52:21] = s_axi4_awaddr;
+#  assign data_in                                          [56:53] = s_axi4_awregion;
+#  assign data_in                                          [60:57] = s_axi4_awqos;
+#  assign data_in                             [60+AXI_ID_WIDTH:61] = s_axi4_awid;
+#  assign data_in [60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH] = s_axi4_awuser;
+#
+#  assign m_axi4_awcache  = data_out[3:0];
+#  assign m_axi4_awprot   = data_out[6:4];
+#  assign m_axi4_awlock   = data_out[7];
+#  assign m_axi4_awburst  = data_out[9:8];
+#  assign m_axi4_awsize   = data_out[12:10];
+#  assign m_axi4_awlen    = data_out[20:13];
+#  assign m_axi4_awaddr   = data_out[52:21];
+#  assign m_axi4_awregion = data_out[56:53];
+#  assign m_axi4_awqos    = data_out[60:57];
+#  assign m_axi4_awid     = data_out[60+AXI_ID_WIDTH:61];
+#  assign m_axi4_awuser   = data_out[60+AXI_ID_WIDTH+AXI_USER_WIDTH:61+AXI_ID_WIDTH];
+#
+#  axi_buffer_rab
+#    #(
+#      .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+61  ),
+#      .BUFFER_DEPTH ( 4                               )
+#    )
+#    u_buffer
+#    (
+#      .clk       ( axi4_aclk      ),
+#      .rstn      ( axi4_arstn     ),
+#      .valid_out ( m_axi4_awvalid ),
+#      .data_out  ( data_out       ),
+#      .ready_in  ( m_axi4_awready ),
+#      .valid_in  ( s_axi4_awvalid ),
+#      .data_in   ( data_in        ),
+#      .ready_out ( s_axi4_awready )
+#    );
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi4_aw_sender.py b/unused_please_ignore_completely/iommu/axi_rab/axi4_aw_sender.py
new file mode 100644 (file)
index 0000000..fbc917d
--- /dev/null
@@ -0,0 +1,252 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_aw_sender(Elaboratable):
+
+    def __init__(self):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.l1_done_o = Signal()  # output
+        self.l1_accept_i = Signal()  # input
+        self.l1_drop_i = Signal()  # input
+        self.l1_save_i = Signal()  # input
+        self.l2_done_o = Signal()  # output
+        self.l2_accept_i = Signal()  # input
+        self.l2_drop_i = Signal()  # input
+        self.l2_sending_o = Signal()  # output
+        self.l1_awaddr_i = Signal(AXI_ADDR_WIDTH)  # input
+        self.l2_awaddr_i = Signal(AXI_ADDR_WIDTH)  # input
+        self.s_axi4_awid = Signal(AXI_ID_WIDTH)  # input
+        self.s_axi4_awvalid = Signal()  # input
+        self.s_axi4_awready = Signal()  # output
+        self.s_axi4_awlen = Signal(8)  # input
+        self.s_axi4_awsize = Signal(3)  # input
+        self.s_axi4_awburst = Signal(2)  # input
+        self.s_axi4_awlock = Signal()  # input
+        self.s_axi4_awprot = Signal(3)  # input
+        self.s_axi4_awcache = Signal(4)  # input
+        self.s_axi4_awregion = Signal(4)  # input
+        self.s_axi4_awqos = Signal(4)  # input
+        self.s_axi4_awuser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_awid = Signal(AXI_ID_WIDTH)  # output
+        self.m_axi4_awaddr = Signal(AXI_ADDR_WIDTH)  # output
+        self.m_axi4_awvalid = Signal()  # output
+        self.m_axi4_awready = Signal()  # input
+        self.m_axi4_awlen = Signal(8)  # output
+        self.m_axi4_awsize = Signal(3)  # output
+        self.m_axi4_awburst = Signal(2)  # output
+        self.m_axi4_awlock = Signal()  # output
+        self.m_axi4_awprot = Signal(3)  # output
+        self.m_axi4_awcache = Signal(4)  # output
+        self.m_axi4_awregion = Signal(4)  # output
+        self.m_axi4_awqos = Signal(4)  # output
+        self.m_axi4_awuser = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.d.comb += self.l1_save.eq(self.None)
+        m.d.comb += self.l1_done_o.eq(self.None)
+        m.d.comb += self.m_axi4_awvalid.eq(self.None)
+        m.d.comb += self.s_axi4_awready.eq(self.None)
+        m.d.comb += self.m_axi4_awuser.eq(self.None)
+        m.d.comb += self.m_axi4_awcache.eq(self.None)
+        m.d.comb += self.m_axi4_awregion.eq(self.None)
+        m.d.comb += self.m_axi4_awqos.eq(self.None)
+        m.d.comb += self.m_axi4_awprot.eq(self.None)
+        m.d.comb += self.m_axi4_awlock.eq(self.None)
+        m.d.comb += self.m_axi4_awburst.eq(self.None)
+        m.d.comb += self.m_axi4_awsize.eq(self.None)
+        m.d.comb += self.m_axi4_awlen.eq(self.None)
+        m.d.comb += self.m_axi4_awaddr.eq(self.None)
+        m.d.comb += self.m_axi4_awid.eq(self.None)
+        m.d.comb += self.l2_sending_o.eq(self.None)
+        m.d.comb += self.l2_sent.eq(self.None)
+        m.d.comb += self.l2_done_o.eq(self.None)
+        m.d.comb += self.m_axi4_awuser.eq(self.s_axi4_awuser)
+        m.d.comb += self.m_axi4_awcache.eq(self.s_axi4_awcache)
+        m.d.comb += self.m_axi4_awregion.eq(self.s_axi4_awregion)
+        m.d.comb += self.m_axi4_awqos.eq(self.s_axi4_awqos)
+        m.d.comb += self.m_axi4_awprot.eq(self.s_axi4_awprot)
+        m.d.comb += self.m_axi4_awlock.eq(self.s_axi4_awlock)
+        m.d.comb += self.m_axi4_awburst.eq(self.s_axi4_awburst)
+        m.d.comb += self.m_axi4_awsize.eq(self.s_axi4_awsize)
+        m.d.comb += self.m_axi4_awlen.eq(self.s_axi4_awlen)
+        m.d.comb += self.m_axi4_awaddr.eq(self.l1_awaddr_i)
+        m.d.comb += self.m_axi4_awid.eq(self.s_axi4_awid)
+        m.d.comb += self.l2_sending_o.eq(self.1: 'b0)
+        m.d.comb += self.l2_available_q.eq(self.1: 'b0)
+        m.d.comb += self.l2_done_o.eq(self.1: 'b0)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_aw_sender
+#  #(
+#    parameter AXI_ADDR_WIDTH   = 40,
+#    parameter AXI_ID_WIDTH     = 4,
+#    parameter AXI_USER_WIDTH   = 4,
+#    parameter ENABLE_L2TLB     = 0
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    output logic                      l1_done_o,
+#    input  logic                      l1_accept_i,
+#    input  logic                      l1_drop_i,
+#    input  logic                      l1_save_i,
+#
+#    output logic                      l2_done_o,
+#    input  logic                      l2_accept_i,
+#    input  logic                      l2_drop_i,
+#    output logic                      l2_sending_o,
+#
+#    input  logic [AXI_ADDR_WIDTH-1:0] l1_awaddr_i,
+#    input  logic [AXI_ADDR_WIDTH-1:0] l2_awaddr_i,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] s_axi4_awid,
+#    input  logic                      s_axi4_awvalid,
+#    output logic                      s_axi4_awready,
+#    input  logic                [7:0] s_axi4_awlen,
+#    input  logic                [2:0] s_axi4_awsize,
+#    input  logic                [1:0] s_axi4_awburst,
+#    input  logic                      s_axi4_awlock,
+#    input  logic                [2:0] s_axi4_awprot,
+#    input  logic                [3:0] s_axi4_awcache,
+#    input  logic                [3:0] s_axi4_awregion,
+#    input  logic                [3:0] s_axi4_awqos,
+#    input  logic [AXI_USER_WIDTH-1:0] s_axi4_awuser,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] m_axi4_awid,
+#    output logic [AXI_ADDR_WIDTH-1:0] m_axi4_awaddr,
+#    output logic                      m_axi4_awvalid,
+#    input  logic                      m_axi4_awready,
+#    output logic                [7:0] m_axi4_awlen,
+#    output logic                [2:0] m_axi4_awsize,
+#    output logic                [1:0] m_axi4_awburst,
+#    output logic                      m_axi4_awlock,
+#    output logic                [2:0] m_axi4_awprot,
+#    output logic                [3:0] m_axi4_awcache,
+#    output logic                [3:0] m_axi4_awregion,
+#    output logic                [3:0] m_axi4_awqos,
+#    output logic [AXI_USER_WIDTH-1:0] m_axi4_awuser
+#  );
+#
+#  logic l1_save;
+#
+#  logic l2_sent;
+#  logic l2_available_q;
+#
+#  assign l1_save      = l1_save_i & l2_available_q;
+#
+#  assign l1_done_o    = s_axi4_awvalid & s_axi4_awready ;
+#
+#  // if 1: accept and forward a transaction translated by L1
+#  //    2: drop or save request (if L2 slot not occupied already)
+#  assign m_axi4_awvalid = (s_axi4_awvalid & l1_accept_i) |
+#                          l2_sending_o;
+#  assign s_axi4_awready = (m_axi4_awvalid & m_axi4_awready & ~l2_sending_o) |
+#                          (s_axi4_awvalid & (l1_drop_i | l1_save));
+#
+# generate
+#  if (ENABLE_L2TLB    == 1) begin
+#    logic [AXI_USER_WIDTH-1:0] l2_axi4_awuser  ;
+#    logic                [3:0] l2_axi4_awcache ;
+#    logic                [3:0] l2_axi4_awregion;
+#    logic                [3:0] l2_axi4_awqos   ;
+#    logic                [2:0] l2_axi4_awprot  ;
+#    logic                      l2_axi4_awlock  ;
+#    logic                [1:0] l2_axi4_awburst ;
+#    logic                [2:0] l2_axi4_awsize  ;
+#    logic                [7:0] l2_axi4_awlen   ;
+#    logic   [AXI_ID_WIDTH-1:0] l2_axi4_awid    ;
+#
+#    assign m_axi4_awuser   = l2_sending_o ? l2_axi4_awuser   : s_axi4_awuser;
+#    assign m_axi4_awcache  = l2_sending_o ? l2_axi4_awcache  : s_axi4_awcache;
+#    assign m_axi4_awregion = l2_sending_o ? l2_axi4_awregion : s_axi4_awregion;
+#    assign m_axi4_awqos    = l2_sending_o ? l2_axi4_awqos    : s_axi4_awqos;
+#    assign m_axi4_awprot   = l2_sending_o ? l2_axi4_awprot   : s_axi4_awprot;
+#    assign m_axi4_awlock   = l2_sending_o ? l2_axi4_awlock   : s_axi4_awlock;
+#    assign m_axi4_awburst  = l2_sending_o ? l2_axi4_awburst  : s_axi4_awburst;
+#    assign m_axi4_awsize   = l2_sending_o ? l2_axi4_awsize   : s_axi4_awsize;
+#    assign m_axi4_awlen    = l2_sending_o ? l2_axi4_awlen    : s_axi4_awlen;
+#    assign m_axi4_awaddr   = l2_sending_o ? l2_awaddr_i      : l1_awaddr_i;
+#    assign m_axi4_awid     = l2_sending_o ? l2_axi4_awid     : s_axi4_awid;
+#
+#    // buffer AXI signals in case of L1 miss
+#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
+#      if (axi4_arstn == 1'b0) begin
+#        l2_axi4_awuser   <=  'b0;
+#        l2_axi4_awcache  <=  'b0;
+#        l2_axi4_awregion <=  'b0;
+#        l2_axi4_awqos    <=  'b0;
+#        l2_axi4_awprot   <=  'b0;
+#        l2_axi4_awlock   <= 1'b0;
+#        l2_axi4_awburst  <=  'b0;
+#        l2_axi4_awsize   <=  'b0;
+#        l2_axi4_awlen    <=  'b0;
+#        l2_axi4_awid     <=  'b0;
+#      end else if (l1_save) begin
+#        l2_axi4_awuser   <= s_axi4_awuser;
+#        l2_axi4_awcache  <= s_axi4_awcache;
+#        l2_axi4_awregion <= s_axi4_awregion;
+#        l2_axi4_awqos    <= s_axi4_awqos;
+#        l2_axi4_awprot   <= s_axi4_awprot;
+#        l2_axi4_awlock   <= s_axi4_awlock;
+#        l2_axi4_awburst  <= s_axi4_awburst;
+#        l2_axi4_awsize   <= s_axi4_awsize;
+#        l2_axi4_awlen    <= s_axi4_awlen;
+#        l2_axi4_awid     <= s_axi4_awid;
+#      end
+#    end
+#
+#    // signal that an l1_save_i can be accepted
+#    always @(posedge axi4_aclk or negedge axi4_arstn) begin
+#      if (axi4_arstn == 1'b0) begin
+#        l2_available_q <= 1'b1;
+#      end else if (l2_sent | l2_drop_i) begin
+#        l2_available_q <= 1'b1;
+#      end else if (l1_save) begin
+#        l2_available_q <= 1'b0;
+#      end
+#    end
+#
+#    assign l2_sending_o = l2_accept_i & ~l2_available_q;
+#    assign l2_sent      = l2_sending_o & m_axi4_awvalid & m_axi4_awready;
+#
+#    // if 1: having sent out a transaction translated by L2
+#    //    2: drop request (L2 slot is available again)
+#    assign l2_done_o    = l2_sent | l2_drop_i;
+#
+#  end else begin // !`ifdef ENABLE_L2TLB
+#    assign m_axi4_awuser   =  s_axi4_awuser;
+#    assign m_axi4_awcache  =  s_axi4_awcache;
+#    assign m_axi4_awregion =  s_axi4_awregion;
+#    assign m_axi4_awqos    =  s_axi4_awqos;
+#    assign m_axi4_awprot   =  s_axi4_awprot;
+#    assign m_axi4_awlock   =  s_axi4_awlock;
+#    assign m_axi4_awburst  =  s_axi4_awburst;
+#    assign m_axi4_awsize   =  s_axi4_awsize;
+#    assign m_axi4_awlen    =  s_axi4_awlen;
+#    assign m_axi4_awaddr   =  l1_awaddr_i;
+#    assign m_axi4_awid     =  s_axi4_awid;
+#
+#    assign l2_sending_o    = 1'b0;
+#    assign l2_available_q  = 1'b0;
+#    assign l2_done_o       = 1'b0;
+#  end // !`ifdef ENABLE_L2TLB
+# endgenerate
+#
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi4_b_buffer.py b/unused_please_ignore_completely/iommu/axi_rab/axi4_b_buffer.py
new file mode 100644 (file)
index 0000000..42fce1a
--- /dev/null
@@ -0,0 +1,94 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_b_buffer(Elaboratable):
+
+    def __init__(self):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.s_axi4_bid = Signal(AXI_ID_WIDTH)  # output
+        self.s_axi4_bresp = Signal(2)  # output
+        self.s_axi4_bvalid = Signal()  # output
+        self.s_axi4_buser = Signal(AXI_USER_WIDTH)  # output
+        self.s_axi4_bready = Signal()  # input
+        self.m_axi4_bid = Signal(AXI_ID_WIDTH)  # input
+        self.m_axi4_bresp = Signal(2)  # input
+        self.m_axi4_bvalid = Signal()  # input
+        self.m_axi4_buser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_bready = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.d.comb += self.None.eq(self.m_axi4_bresp)
+        m.d.comb += self.None.eq(self.m_axi4_bid)
+        m.d.comb += self.None.eq(self.m_axi4_buser)
+        m.d.comb += self.s_axi4_buser.eq(self.None)
+        m.d.comb += self.s_axi4_bid.eq(self.None)
+        m.d.comb += self.s_axi4_bresp.eq(self.None)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_b_buffer
+#  #(
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_bid,
+#    output logic                [1:0] s_axi4_bresp,
+#    output logic                      s_axi4_bvalid,
+#    output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
+#    input  logic                      s_axi4_bready,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_bid,
+#    input  logic                [1:0] m_axi4_bresp,
+#    input  logic                      m_axi4_bvalid,
+#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
+#    output logic                      m_axi4_bready
+#  );
+#
+#  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_in;
+#  wire [AXI_ID_WIDTH+AXI_USER_WIDTH+1:0] data_out;
+#
+#  assign data_in                                         [1:0] = m_axi4_bresp;
+#  assign data_in                            [AXI_ID_WIDTH+1:2] = m_axi4_bid;
+#  assign data_in[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2] = m_axi4_buser;
+#
+#  assign s_axi4_buser = data_out[AXI_ID_WIDTH+AXI_USER_WIDTH+1:AXI_ID_WIDTH+2];
+#  assign s_axi4_bid   = data_out[AXI_ID_WIDTH+1:2];
+#  assign s_axi4_bresp = data_out[1:0];
+#
+#  axi_buffer_rab
+#  #(
+#    .DATA_WIDTH   ( AXI_ID_WIDTH+AXI_USER_WIDTH+2 ),
+#    .BUFFER_DEPTH ( 4                             )
+#    )
+#  u_buffer
+#  (
+#    .clk      ( axi4_aclk     ),
+#    .rstn     ( axi4_arstn    ),
+#    .valid_out( s_axi4_bvalid ),
+#    .data_out ( data_out      ),
+#    .ready_in ( s_axi4_bready ),
+#    .valid_in ( m_axi4_bvalid ),
+#    .data_in  ( data_in       ),
+#    .ready_out( m_axi4_bready )
+#  );
+#
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi4_b_sender.py b/unused_please_ignore_completely/iommu/axi_rab/axi4_b_sender.py
new file mode 100644 (file)
index 0000000..1c61a2a
--- /dev/null
@@ -0,0 +1,136 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_b_sender(Elaboratable):
+
+    def __init__(self):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.drop_i = Signal()  # input
+        self.done_o = Signal()  # output
+        self.id_i = Signal(AXI_ID_WIDTH)  # input
+        self.prefetch_i = Signal()  # input
+        self.hit_i = Signal()  # input
+        self.s_axi4_bid = Signal(AXI_ID_WIDTH)  # output
+        self.s_axi4_bresp = Signal(2)  # output
+        self.s_axi4_bvalid = Signal()  # output
+        self.s_axi4_buser = Signal(AXI_USER_WIDTH)  # output
+        self.s_axi4_bready = Signal()  # input
+        self.m_axi4_bid = Signal(AXI_ID_WIDTH)  # input
+        self.m_axi4_bresp = Signal(2)  # input
+        self.m_axi4_bvalid = Signal()  # input
+        self.m_axi4_buser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_bready = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.d.comb += self.fifo_push.eq(self.None)
+        m.d.comb += self.done_o.eq(self.fifo_push)
+        m.d.comb += self.fifo_pop.eq(self.None)
+        m.d.comb += self.s_axi4_buser.eq(self.None)
+        m.d.comb += self.s_axi4_bid.eq(self.None)
+        m.d.comb += self.s_axi4_bresp.eq(self.None)
+        m.d.comb += self.s_axi4_bvalid.eq(self.None)
+        m.d.comb += self.m_axi4_bready.eq(self.None)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_b_sender
+#  #(
+#    parameter AXI_ID_WIDTH   = 10,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    input  logic                      drop_i,
+#    output logic                      done_o,
+#    input  logic   [AXI_ID_WIDTH-1:0] id_i,
+#    input  logic                      prefetch_i,
+#    input  logic                      hit_i,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_bid,
+#    output logic                [1:0] s_axi4_bresp,
+#    output logic                      s_axi4_bvalid,
+#    output logic [AXI_USER_WIDTH-1:0] s_axi4_buser,
+#    input  logic                      s_axi4_bready,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_bid,
+#    input  logic                [1:0] m_axi4_bresp,
+#    input  logic                      m_axi4_bvalid,
+#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_buser,
+#    output logic                      m_axi4_bready
+#  );
+#
+#  logic                    fifo_valid;
+#  logic                    fifo_pop;
+#  logic                    fifo_push;
+#  logic                    fifo_ready;
+#  logic [AXI_ID_WIDTH-1:0] id;
+#  logic                    prefetch;
+#  logic                    hit;
+#
+#  logic                    dropping;
+#
+#  axi_buffer_rab
+#    #(
+#      .DATA_WIDTH   ( 2+AXI_ID_WIDTH  ),
+#      .BUFFER_DEPTH ( 4               )
+#      )
+#    u_fifo
+#      (
+#        .clk       ( axi4_aclk                 ),
+#        .rstn      ( axi4_arstn                ),
+#        // Pop
+#        .data_out  ( {prefetch,   hit,   id}   ),
+#        .valid_out ( fifo_valid                ),
+#        .ready_in  ( fifo_pop                  ),
+#        // Push
+#        .valid_in  ( fifo_push                 ),
+#        .data_in   ( {prefetch_i, hit_i, id_i} ),
+#        .ready_out ( fifo_ready                )
+#      );
+#
+#  assign fifo_push = drop_i & fifo_ready;
+#  assign done_o    = fifo_push;
+#
+#  assign fifo_pop  = dropping & s_axi4_bready;
+#
+#  always @ (posedge axi4_aclk or negedge axi4_arstn) begin
+#    if (axi4_arstn == 1'b0) begin
+#      dropping <= 1'b0;
+#    end else begin
+#      if (fifo_valid && ~dropping)
+#        dropping <= 1'b1;
+#      else if (fifo_pop)
+#        dropping <= 1'b0;
+#    end
+#  end
+#
+#  assign s_axi4_buser  = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_buser;
+#  assign s_axi4_bid    = dropping ? id : m_axi4_bid;
+#
+#  assign s_axi4_bresp  = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, mutli, prot
+#                         (dropping & prefetch      ) ? 2'b10 : // prefetch miss
+#                         (dropping            & hit) ? 2'b10 : // non-prefetch multi, prot
+#                         (dropping                 ) ? 2'b10 : // non-prefetch miss
+#                         m_axi4_bresp;
+#
+#  assign s_axi4_bvalid =  dropping | m_axi4_bvalid;
+#  assign m_axi4_bready = ~dropping & s_axi4_bready;
+#
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi4_r_buffer.py b/unused_please_ignore_completely/iommu/axi_rab/axi4_r_buffer.py
new file mode 100644 (file)
index 0000000..91bdf0a
--- /dev/null
@@ -0,0 +1,120 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_r_buffer(Elaboratable):
+
+    def __init__(self):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.s_axi4_rid = Signal(AXI_ID_WIDTH)  # output
+        self.s_axi4_rresp = Signal(2)  # output
+        self.s_axi4_rdata = Signal(AXI_DATA_WIDTH)  # output
+        self.s_axi4_rlast = Signal()  # output
+        self.s_axi4_rvalid = Signal()  # output
+        self.s_axi4_ruser = Signal(AXI_USER_WIDTH)  # output
+        self.s_axi4_rready = Signal()  # input
+        self.m_axi4_rid = Signal(AXI_ID_WIDTH)  # input
+        self.m_axi4_rresp = Signal(2)  # input
+        self.m_axi4_rdata = Signal(AXI_DATA_WIDTH)  # input
+        self.m_axi4_rlast = Signal()  # input
+        self.m_axi4_rvalid = Signal()  # input
+        self.m_axi4_ruser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_rready = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.d.comb += self.None.eq(self.m_axi4_rresp)
+        m.d.comb += self.None.eq(self.m_axi4_rlast)
+        m.d.comb += self.None.eq(self.m_axi4_rid)
+        m.d.comb += self.None.eq(self.m_axi4_rdata)
+        m.d.comb += self.None.eq(self.m_axi4_ruser)
+        m.d.comb += self.s_axi4_rresp.eq(self.None)
+        m.d.comb += self.s_axi4_rlast.eq(self.None)
+        m.d.comb += self.s_axi4_rid.eq(self.None)
+        m.d.comb += self.s_axi4_rdata.eq(self.None)
+        m.d.comb += self.s_axi4_ruser.eq(self.None)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_r_buffer
+#  #(
+#    parameter AXI_DATA_WIDTH = 32,
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_rid,
+#    output logic                [1:0] s_axi4_rresp,
+#    output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
+#    output logic                      s_axi4_rlast,
+#    output logic                      s_axi4_rvalid,
+#    output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
+#    input  logic                      s_axi4_rready,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_rid,
+#    input  logic                [1:0] m_axi4_rresp,
+#    input  logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
+#    input  logic                      m_axi4_rlast,
+#    input  logic                      m_axi4_rvalid,
+#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
+#    output logic                      m_axi4_rready
+#  );
+#
+#  wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_in;
+#  wire [AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3-1:0] data_out;
+#
+#  localparam ID_START   = 3;
+#  localparam ID_END     = AXI_ID_WIDTH-1 + ID_START;
+#  localparam DATA_START = ID_END + 1;
+#  localparam DATA_END   = AXI_DATA_WIDTH-1 + DATA_START;
+#  localparam USER_START = DATA_END + 1;
+#  localparam USER_END   = AXI_USER_WIDTH-1 + USER_START;
+#
+#  assign data_in                [1:0] = m_axi4_rresp;
+#  assign data_in                  [2] = m_axi4_rlast;
+#  assign data_in    [ID_END:ID_START] = m_axi4_rid;
+#  assign data_in[DATA_END:DATA_START] = m_axi4_rdata;
+#  assign data_in[USER_END:USER_START] = m_axi4_ruser;
+#
+#  assign s_axi4_rresp  = data_out                [1:0];
+#  assign s_axi4_rlast  = data_out                  [2];
+#  assign s_axi4_rid    = data_out    [ID_END:ID_START];
+#  assign s_axi4_rdata  = data_out[DATA_END:DATA_START];
+#  assign s_axi4_ruser  = data_out[USER_END:USER_START];
+#
+#  axi_buffer_rab
+#  #(
+#    .DATA_WIDTH   ( AXI_DATA_WIDTH+AXI_ID_WIDTH+AXI_USER_WIDTH+3  ),
+#    .BUFFER_DEPTH ( 4                                             )
+#    )
+#  u_buffer
+#  (
+#    .clk       ( axi4_aclk     ),
+#    .rstn      ( axi4_arstn    ),
+#    // Pop
+#    .valid_out ( s_axi4_rvalid ),
+#    .data_out  ( data_out      ),
+#    .ready_in  ( s_axi4_rready ),
+#    // Push
+#    .valid_in  ( m_axi4_rvalid ),
+#    .data_in   ( data_in       ),
+#    .ready_out ( m_axi4_rready )
+#  );
+#
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi4_r_sender.py b/unused_please_ignore_completely/iommu/axi_rab/axi4_r_sender.py
new file mode 100644 (file)
index 0000000..d4e22bb
--- /dev/null
@@ -0,0 +1,206 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_r_sender(Elaboratable):
+
+    def __init__(self):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.drop_i = Signal()  # input
+        self.drop_len_i = Signal(8)  # input
+        self.done_o = Signal()  # output
+        self.id_i = Signal(AXI_ID_WIDTH)  # input
+        self.prefetch_i = Signal()  # input
+        self.hit_i = Signal()  # input
+        self.s_axi4_rid = Signal(AXI_ID_WIDTH)  # output
+        self.s_axi4_rresp = Signal(2)  # output
+        self.s_axi4_rdata = Signal(AXI_DATA_WIDTH)  # output
+        self.s_axi4_rlast = Signal()  # output
+        self.s_axi4_rvalid = Signal()  # output
+        self.s_axi4_ruser = Signal(AXI_USER_WIDTH)  # output
+        self.s_axi4_rready = Signal()  # input
+        self.m_axi4_rid = Signal(AXI_ID_WIDTH)  # input
+        self.m_axi4_rresp = Signal(2)  # input
+        self.m_axi4_rdata = Signal(AXI_DATA_WIDTH)  # input
+        self.m_axi4_rlast = Signal()  # input
+        self.m_axi4_rvalid = Signal()  # input
+        self.m_axi4_ruser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_rready = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.d.comb += self.fifo_push.eq(self.None)
+        m.d.comb += self.done_o.eq(self.fifo_push)
+        m.d.comb += self.s_axi4_rdata.eq(self.m_axi4_rdata)
+        m.d.comb += self.s_axi4_ruser.eq(self.None)
+        m.d.comb += self.s_axi4_rid.eq(self.None)
+        m.d.comb += self.s_axi4_rresp.eq(self.None)
+        m.d.comb += self.s_axi4_rvalid.eq(self.None)
+        m.d.comb += self.m_axi4_rready.eq(self.None)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //import CfMath::log2;
+#
+# module axi4_r_sender
+#  #(
+#    parameter AXI_DATA_WIDTH = 32,
+#    parameter AXI_ID_WIDTH   = 4,
+#    parameter AXI_USER_WIDTH = 4
+#  )
+#  (
+#    input  logic                      axi4_aclk,
+#    input  logic                      axi4_arstn,
+#
+#    input  logic                      drop_i,
+#    input  logic                [7:0] drop_len_i,
+#    output logic                      done_o,
+#    input  logic   [AXI_ID_WIDTH-1:0] id_i,
+#    input  logic                      prefetch_i,
+#    input  logic                      hit_i,
+#
+#    output logic   [AXI_ID_WIDTH-1:0] s_axi4_rid,
+#    output logic                [1:0] s_axi4_rresp,
+#    output logic [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
+#    output logic                      s_axi4_rlast,
+#    output logic                      s_axi4_rvalid,
+#    output logic [AXI_USER_WIDTH-1:0] s_axi4_ruser,
+#    input  logic                      s_axi4_rready,
+#
+#    input  logic   [AXI_ID_WIDTH-1:0] m_axi4_rid,
+#    input  logic                [1:0] m_axi4_rresp,
+#    input  logic [AXI_DATA_WIDTH-1:0] m_axi4_rdata,
+#    input  logic                      m_axi4_rlast,
+#    input  logic                      m_axi4_rvalid,
+#    input  logic [AXI_USER_WIDTH-1:0] m_axi4_ruser,
+#    output logic                      m_axi4_rready
+#  );
+#
+#  localparam BUFFER_DEPTH = 16;
+#
+#  logic                    fifo_valid;
+#  logic                    fifo_pop;
+#  logic                    fifo_push;
+#  logic                    fifo_ready;
+#  logic [AXI_ID_WIDTH-1:0] id;
+#  logic              [7:0] len;
+#  logic                    prefetch;
+#  logic                    hit;
+#
+#  logic                    dropping;
+#
+#  enum logic [1:0]  { FORWARDING, DROPPING }
+#                            state_d,                state_q;
+#  logic                     burst_ongoing_d,        burst_ongoing_q;
+#  logic [7:0]               drop_cnt_d,             drop_cnt_q;
+#
+#  axi_buffer_rab
+#    #(
+#      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8  ),
+#      .BUFFER_DEPTH     ( BUFFER_DEPTH      )
+#      )
+#    u_fifo
+#      (
+#        .clk       ( axi4_aclk                              ),
+#        .rstn      ( axi4_arstn                             ),
+#        // Pop
+#        .data_out  ( {prefetch,   hit,   id,   len}         ),
+#        .valid_out ( fifo_valid                             ),
+#        .ready_in  ( fifo_pop                               ),
+#        // Push
+#        .valid_in  ( fifo_push                              ),
+#        .data_in   ( {prefetch_i, hit_i, id_i, drop_len_i}  ),
+#        .ready_out ( fifo_ready                             )
+#      );
+#
+#  assign fifo_push = drop_i & fifo_ready;
+#  assign done_o    = fifo_push;
+#
+#  always_comb begin
+#    burst_ongoing_d = burst_ongoing_q;
+#    drop_cnt_d      = drop_cnt_q;
+#    dropping        = 1'b0;
+#    s_axi4_rlast    = 1'b0;
+#    fifo_pop        = 1'b0;
+#    state_d         = state_q;
+#
+#    case (state_q)
+#      FORWARDING: begin
+#        s_axi4_rlast = m_axi4_rlast;
+#        // Remember whether there is currently a burst ongoing.
+#        if (m_axi4_rvalid && m_axi4_rready) begin
+#          if (m_axi4_rlast) begin
+#            burst_ongoing_d = 1'b0;
+#          end else begin
+#            burst_ongoing_d = 1'b1;
+#          end
+#        end
+#        // If there is no burst ongoing and the FIFO has a drop request ready, process it.
+#        if (!burst_ongoing_d && fifo_valid) begin
+#          drop_cnt_d  = len;
+#          state_d     = DROPPING;
+#        end
+#      end
+#
+#      DROPPING: begin
+#        dropping      = 1'b1;
+#        s_axi4_rlast  = (drop_cnt_q == '0);
+#        // Handshake on slave interface
+#        if (s_axi4_rready) begin
+#          drop_cnt_d -= 1;
+#          if (drop_cnt_q == '0) begin
+#            drop_cnt_d  = '0;
+#            fifo_pop    = 1'b1;
+#            state_d     = FORWARDING;
+#          end
+#        end
+#      end
+#
+#      default: begin
+#        state_d = FORWARDING;
+#      end
+#    endcase
+#  end
+#
+#  assign s_axi4_rdata  = m_axi4_rdata;
+#
+#  assign s_axi4_ruser  = dropping ? {AXI_USER_WIDTH{1'b0}} : m_axi4_ruser;
+#  assign s_axi4_rid    = dropping ? id : m_axi4_rid;
+#
+#  assign s_axi4_rresp  = (dropping & prefetch & hit) ? 2'b00 : // prefetch hit, mutli, prot
+#                         (dropping & prefetch      ) ? 2'b10 : // prefetch miss
+#                         (dropping            & hit) ? 2'b10 : // non-prefetch multi, prot
+#                         (dropping                 ) ? 2'b10 : // non-prefetch miss
+#                         m_axi4_rresp;
+#
+#  assign s_axi4_rvalid =  dropping | m_axi4_rvalid;
+#  assign m_axi4_rready = ~dropping & s_axi4_rready;
+#
+#  always_ff @(posedge axi4_aclk, negedge axi4_arstn) begin
+#    if (axi4_arstn == 1'b0) begin
+#      burst_ongoing_q <= 1'b0;
+#      drop_cnt_q      <=  'b0;
+#      state_q         <= FORWARDING;
+#    end else begin
+#      burst_ongoing_q <= burst_ongoing_d;
+#      drop_cnt_q      <= drop_cnt_d;
+#      state_q         <= state_d;
+#    end
+#  end
+#
+# endmodule
+#
+#
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi4_w_buffer.py b/unused_please_ignore_completely/iommu/axi_rab/axi4_w_buffer.py
new file mode 100644 (file)
index 0000000..aa06dc2
--- /dev/null
@@ -0,0 +1,777 @@
+# this file has been generated by sv2nmigen
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_w_buffer(Elaboratable):
+
+    def __init__(self):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.l1_done_o = Signal()  # output
+        self.l1_accept_i = Signal()  # input
+        self.l1_save_i = Signal()  # input
+        self.l1_drop_i = Signal()  # input
+        self.l1_master_i = Signal()  # input
+        self.l1_id_i = Signal(AXI_ID_WIDTH)  # input
+        self.l1_len_i = Signal(8)  # input
+        self.l1_prefetch_i = Signal()  # input
+        self.l1_hit_i = Signal()  # input
+        self.l2_done_o = Signal()  # output
+        self.l2_accept_i = Signal()  # input
+        self.l2_drop_i = Signal()  # input
+        self.l2_master_i = Signal()  # input
+        self.l2_id_i = Signal(AXI_ID_WIDTH)  # input
+        self.l2_len_i = Signal(8)  # input
+        self.l2_prefetch_i = Signal()  # input
+        self.l2_hit_i = Signal()  # input
+        self.master_select_o = Signal()  # output
+        self.input_stall_o = Signal()  # output
+        self.output_stall_o = Signal()  # output
+        self.b_drop_o = Signal()  # output
+        self.b_done_i = Signal()  # input
+        self.id_o = Signal(AXI_ID_WIDTH)  # output
+        self.prefetch_o = Signal()  # output
+        self.hit_o = Signal()  # output
+        self.s_axi4_wdata = Signal(AXI_DATA_WIDTH)  # input
+        self.s_axi4_wvalid = Signal()  # input
+        self.s_axi4_wready = Signal()  # output
+        self.s_axi4_wstrb = Signal(1+ERROR p_expression_25)  # input
+        self.s_axi4_wlast = Signal()  # input
+        self.s_axi4_wuser = Signal(AXI_USER_WIDTH)  # input
+        self.m_axi4_wdata = Signal(AXI_DATA_WIDTH)  # output
+        self.m_axi4_wvalid = Signal()  # output
+        self.m_axi4_wready = Signal()  # input
+        self.m_axi4_wstrb = Signal(1+ERROR p_expression_25)  # output
+        self.m_axi4_wlast = Signal()  # output
+        self.m_axi4_wuser = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        return m
+
+
+#
+# //import CfMath::log2;
+#
+# module axi4_w_buffer
+#  #(
+#    parameter AXI_DATA_WIDTH   = 32,
+#    parameter AXI_ID_WIDTH     = 4,
+#    parameter AXI_USER_WIDTH   = 4,
+#    parameter ENABLE_L2TLB     = 0,
+#    parameter HUM_BUFFER_DEPTH = 16
+#  )
+#  (
+#    input  logic                        axi4_aclk,
+#    input  logic                        axi4_arstn,
+#
+#    // L1 & L2 interfaces
+#    output logic                        l1_done_o,
+#    input  logic                        l1_accept_i,
+#    input  logic                        l1_save_i,
+#    input  logic                        l1_drop_i,
+#    input  logic                        l1_master_i,
+#    input  logic     [AXI_ID_WIDTH-1:0] l1_id_i,
+#    input  logic                  [7:0] l1_len_i,
+#    input  logic                        l1_prefetch_i,
+#    input  logic                        l1_hit_i,
+#
+#    output logic                        l2_done_o,
+#    input  logic                        l2_accept_i,
+#    input  logic                        l2_drop_i,
+#    input  logic                        l2_master_i,
+#    input  logic     [AXI_ID_WIDTH-1:0] l2_id_i,
+#    input  logic                  [7:0] l2_len_i,
+#    input  logic                        l2_prefetch_i,
+#    input  logic                        l2_hit_i,
+#
+#    output logic                        master_select_o,
+#    output logic                        input_stall_o,
+#    output logic                        output_stall_o,
+#
+#    // B sender interface
+#    output logic                        b_drop_o,
+#    input  logic                        b_done_i,
+#    output logic     [AXI_ID_WIDTH-1:0] id_o,
+#    output logic                        prefetch_o,
+#    output logic                        hit_o,
+#
+#    // AXI W channel interfaces
+#    input  logic   [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
+#    input  logic                        s_axi4_wvalid,
+#    output logic                        s_axi4_wready,
+#    input  logic [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
+#    input  logic                        s_axi4_wlast,
+#    input  logic   [AXI_USER_WIDTH-1:0] s_axi4_wuser,
+#
+#    output logic   [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
+#    output logic                        m_axi4_wvalid,
+#    input  logic                        m_axi4_wready,
+#    output logic [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
+#    output logic                        m_axi4_wlast,
+#    output logic   [AXI_USER_WIDTH-1:0] m_axi4_wuser
+#  );
+#
+"""
+
+  localparam BUFFER_WIDTH  = AXI_DATA_WIDTH+AXI_USER_WIDTH+AXI_DATA_WIDTH/8+1;
+
+  localparam INPUT_BUFFER_DEPTH = 4;
+  localparam L1_FIFO_DEPTH      = 8;
+  localparam L2_FIFO_DEPTH      = 4;
+
+  logic      [AXI_DATA_WIDTH-1:0] axi4_wdata;
+  logic                           axi4_wvalid;
+  logic                           axi4_wready;
+  logic    [AXI_DATA_WIDTH/8-1:0] axi4_wstrb;
+  logic                           axi4_wlast;
+  logic      [AXI_USER_WIDTH-1:0] axi4_wuser;
+
+  logic                           l1_fifo_valid_out;
+  logic                           l1_fifo_ready_in;
+  logic                           l1_fifo_valid_in;
+  logic                           l1_fifo_ready_out;
+
+  logic                           l1_req;
+  logic                           l1_accept_cur, l1_save_cur, l1_drop_cur;
+  logic                           l1_master_cur;
+  logic        [AXI_ID_WIDTH-1:0] l1_id_cur;
+  logic                     [7:0] l1_len_cur;
+  logic                           l1_hit_cur, l1_prefetch_cur;
+  logic                           l1_save_in, l1_save_out;
+  logic [log2(L1_FIFO_DEPTH)-1:0] n_l1_save_SP;
+
+  logic                           l2_fifo_valid_out;
+  logic                           l2_fifo_ready_in;
+  logic                           l2_fifo_valid_in;
+  logic                           l2_fifo_ready_out;
+
+  logic                           l2_req;
+  logic                           l2_accept_cur, l2_drop_cur;
+  logic                           l2_master_cur;
+  logic        [AXI_ID_WIDTH-1:0] l2_id_cur;
+  logic                     [7:0] l2_len_cur;
+  logic                           l2_hit_cur, l2_prefetch_cur;
+
+  logic                           fifo_select, fifo_select_SN, fifo_select_SP;
+  logic                           w_done;
+  logic                           b_drop_set;
+
+  // HUM buffer signals
+  logic                           hum_buf_ready_out;
+  logic                           hum_buf_valid_in;
+  logic                           hum_buf_ready_in;
+  logic                           hum_buf_valid_out;
+  logic                           hum_buf_underfull;
+
+  logic      [AXI_DATA_WIDTH-1:0] hum_buf_wdata;
+  logic    [AXI_DATA_WIDTH/8-1:0] hum_buf_wstrb;
+  logic                           hum_buf_wlast;
+  logic      [AXI_USER_WIDTH-1:0] hum_buf_wuser;
+
+  logic                           hum_buf_drop_req_SN, hum_buf_drop_req_SP;
+  logic                     [7:0] hum_buf_drop_len_SN, hum_buf_drop_len_SP;
+  logic                           hum_buf_almost_full;
+
+  logic                           stop_store;
+  logic                           wlast_in, wlast_out;
+  logic signed              [3:0] n_wlast_SN,          n_wlast_SP;
+  logic                           block_forwarding;
+
+  // Search FSM
+  typedef enum logic        [3:0] {STORE,                       BYPASS,
+                                   WAIT_L1_BYPASS_YES,          WAIT_L2_BYPASS_YES,
+                                   WAIT_L1_BYPASS_NO,           WAIT_L2_BYPASS_NO,
+                                   FLUSH,                       DISCARD,
+                                   DISCARD_FINISH}
+                                  hum_buf_state_t;
+  hum_buf_state_t                 hum_buf_SP; // Present state
+  hum_buf_state_tbg                 hum_buf_SN; // Next State
+
+  axi_buffer_rab
+    #(
+      .DATA_WIDTH       ( BUFFER_WIDTH        ),
+      .BUFFER_DEPTH     ( INPUT_BUFFER_DEPTH  )
+      )
+    u_input_buf
+    (
+      .clk       ( axi4_aclk                                                ),
+      .rstn      ( axi4_arstn                                               ),
+      // Push
+      .data_in   ( {s_axi4_wuser, s_axi4_wstrb, s_axi4_wdata, s_axi4_wlast} ),
+      .valid_in  ( s_axi4_wvalid                                            ),
+      .ready_out ( s_axi4_wready                                            ),
+      // Pop
+      .data_out  ( {axi4_wuser,   axi4_wstrb,   axi4_wdata,   axi4_wlast}   ),
+      .valid_out ( axi4_wvalid                                              ),
+      .ready_in  ( axi4_wready                                              )
+    );
+
+  axi_buffer_rab
+    #(
+      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8+4  ),
+      .BUFFER_DEPTH     ( L1_FIFO_DEPTH       )
+      )
+    u_l1_fifo
+    (
+      .clk       ( axi4_aclk                                                                                                    ),
+      .rstn      ( axi4_arstn                                                                                                   ),
+      // Push
+      .data_in   ( {l1_prefetch_i,   l1_hit_i,   l1_id_i,   l1_len_i,   l1_master_i,   l1_accept_i,   l1_save_i,   l1_drop_i}   ),
+      .valid_in  ( l1_fifo_valid_in                                                                                             ),
+      .ready_out ( l1_fifo_ready_out                                                                                            ),
+      // Pop
+      .data_out  ( {l1_prefetch_cur, l1_hit_cur, l1_id_cur, l1_len_cur, l1_master_cur, l1_accept_cur, l1_save_cur, l1_drop_cur} ),
+      .valid_out ( l1_fifo_valid_out                                                                                            ),
+      .ready_in  ( l1_fifo_ready_in                                                                                             )
+    );
+
+    // Push upon receiving new requests from the TLB.
+    assign l1_req           = l1_accept_i | l1_save_i | l1_drop_i;
+    assign l1_fifo_valid_in = l1_req & l1_fifo_ready_out;
+
+    // Signal handshake
+    assign l1_done_o  = l1_fifo_valid_in;
+    assign l2_done_o  = l2_fifo_valid_in;
+
+    // Stall AW input of L1 TLB
+    assign input_stall_o = ~(l1_fifo_ready_out & l2_fifo_ready_out);
+
+    // Interface b_drop signals + handshake
+    always_comb begin
+      if (fifo_select == 1'b0) begin
+        prefetch_o       = l1_prefetch_cur;
+        hit_o            = l1_hit_cur;
+        id_o             = l1_id_cur;
+
+        l1_fifo_ready_in = w_done | b_done_i;
+        l2_fifo_ready_in = 1'b0;
+      end else begin
+        prefetch_o       = l2_prefetch_cur;
+        hit_o            = l2_hit_cur;
+        id_o             = l2_id_cur;
+
+        l1_fifo_ready_in = 1'b0;
+        l2_fifo_ready_in = w_done | b_done_i;
+      end
+    end
+
+    // Detect when an L1 transaction save request enters or exits the L1 FIFO.
+    assign l1_save_in  = l1_fifo_valid_in & l1_save_i;
+    assign l1_save_out = l1_fifo_ready_in & l1_save_cur;
+
+    // Count the number of L1 transaction to save in the L1 FIFO.
+    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
+      if (axi4_arstn == 0) begin
+        n_l1_save_SP <= '0;
+      end else if (l1_save_in ^ l1_save_out) begin
+        if (l1_save_in) begin
+          n_l1_save_SP <= n_l1_save_SP + 1'b1;
+        end else if (l1_save_out) begin
+          n_l1_save_SP <= n_l1_save_SP - 1'b1;
+        end
+      end
+    end
+
+    // Stall forwarding of AW L1 hits if:
+    // 1. The HUM buffer does not allow to be bypassed.
+    // 2. There are multiple L1 save requests in the FIFO, i.e., multiple L2 outputs pending.
+    assign output_stall_o = (n_l1_save_SP > 1) || (block_forwarding == 1'b1);
+
+  generate
+  if (ENABLE_L2TLB == 1) begin : HUM_BUFFER
+
+    axi_buffer_rab_bram
+    #(
+      .DATA_WIDTH       ( BUFFER_WIDTH      ),
+      .BUFFER_DEPTH     ( HUM_BUFFER_DEPTH  )
+      )
+    u_hum_buf
+    (
+      .clk           ( axi4_aclk                                                    ),
+      .rstn          ( axi4_arstn                                                   ),
+      // Push
+      .data_in       ( {axi4_wuser,    axi4_wstrb,    axi4_wdata,    axi4_wlast}    ),
+      .valid_in      ( hum_buf_valid_in                                             ),
+      .ready_out     ( hum_buf_ready_out                                            ),
+      // Pop
+      .data_out      ( {hum_buf_wuser, hum_buf_wstrb, hum_buf_wdata, hum_buf_wlast} ),
+      .valid_out     ( hum_buf_valid_out                                            ),
+      .ready_in      ( hum_buf_ready_in                                             ),
+      // Clear
+      .almost_full   ( hum_buf_almost_full                                          ),
+      .underfull     ( hum_buf_underfull                                            ),
+      .drop_req      ( hum_buf_drop_req_SP                                          ),
+      .drop_len      ( hum_buf_drop_len_SP                                          )
+    );
+
+    axi_buffer_rab
+    #(
+      .DATA_WIDTH       ( 2+AXI_ID_WIDTH+8+3  ),
+      .BUFFER_DEPTH     ( L2_FIFO_DEPTH       )
+      )
+    u_l2_fifo
+    (
+      .clk       ( axi4_aclk                                                                                        ),
+      .rstn      ( axi4_arstn                                                                                       ),
+      // Push
+      .data_in   ( {l2_prefetch_i,   l2_hit_i,   l2_id_i,   l2_len_i,   l2_master_i,   l2_accept_i,   l2_drop_i}    ),
+      .valid_in  ( l2_fifo_valid_in                                                                                 ),
+      .ready_out ( l2_fifo_ready_out                                                                                ),
+      // Pop
+      .data_out  ( {l2_prefetch_cur, l2_hit_cur, l2_id_cur, l2_len_cur, l2_master_cur, l2_accept_cur, l2_drop_cur}  ),
+      .valid_out ( l2_fifo_valid_out                                                                                ),
+      .ready_in  ( l2_fifo_ready_in                                                                                 )
+    );
+
+    // Push upon receiving new result from TLB.
+    assign l2_req           = l2_accept_i | l2_drop_i;
+    assign l2_fifo_valid_in = l2_req & l2_fifo_ready_out;
+
+    assign wlast_in  =    axi4_wlast & hum_buf_valid_in  & hum_buf_ready_out;
+    assign wlast_out = hum_buf_wlast & hum_buf_valid_out & hum_buf_ready_in;
+
+    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
+      if (axi4_arstn == 0) begin
+        fifo_select_SP      <= 1'b0;
+        hum_buf_drop_len_SP <=  'b0;
+        hum_buf_drop_req_SP <= 1'b0;
+        hum_buf_SP          <= STORE;
+        n_wlast_SP          <=  'b0;
+      end else begin
+        fifo_select_SP      <= fifo_select_SN;
+        hum_buf_drop_len_SP <= hum_buf_drop_len_SN;
+        hum_buf_drop_req_SP <= hum_buf_drop_req_SN;
+        hum_buf_SP          <= hum_buf_SN;
+        n_wlast_SP          <= n_wlast_SN;
+      end
+    end
+
+    always_comb begin
+      n_wlast_SN = n_wlast_SP;
+      if (hum_buf_drop_req_SP) begin  // Happens exactly once per burst to be dropped.
+        n_wlast_SN -= 1;
+      end
+      if (wlast_in) begin
+        n_wlast_SN += 1;
+      end
+      if (wlast_out) begin
+        n_wlast_SN -= 1;
+      end
+    end
+
+    always_comb begin : HUM_BUFFER_FSM
+      hum_buf_SN       = hum_buf_SP;
+
+      m_axi4_wlast     = 1'b0;
+      m_axi4_wdata     =  'b0;
+      m_axi4_wstrb     =  'b0;
+      m_axi4_wuser     =  'b0;
+
+      m_axi4_wvalid    = 1'b0;
+      axi4_wready      = 1'b0;
+
+      hum_buf_valid_in = 1'b0;
+      hum_buf_ready_in = 1'b0;
+
+      hum_buf_drop_req_SN = hum_buf_drop_req_SP;
+      hum_buf_drop_len_SN = hum_buf_drop_len_SP;
+      master_select_o  = 1'b0;
+
+      w_done           = 1'b0; // read from FIFO without handshake with B sender
+      b_drop_o         = 1'b0; // send data from FIFO to B sender (with handshake)
+      fifo_select      = 1'b0;
+
+      fifo_select_SN   = fifo_select_SP;
+      stop_store       = 1'b0;
+
+      block_forwarding = 1'b0;
+
+      unique case (hum_buf_SP)
+
+        STORE : begin
+          // Simply store the data in the buffer.
+          hum_buf_valid_in = axi4_wvalid & hum_buf_ready_out;
+          axi4_wready      = hum_buf_ready_out;
+
+          // We have got a full burst in the HUM buffer, thus stop storing.
+          if (wlast_in & !hum_buf_underfull | (n_wlast_SP > $signed(0))) begin
+            hum_buf_SN = WAIT_L1_BYPASS_YES;
+
+          // The buffer is full, thus wait for decision.
+          end else if (~hum_buf_ready_out) begin
+            hum_buf_SN = WAIT_L1_BYPASS_NO;
+          end
+
+          // Avoid the forwarding of L1 hits until we know whether we can bypass.
+          if (l1_fifo_valid_out & l1_save_cur) begin
+            block_forwarding = 1'b1;
+          end
+        end
+
+        WAIT_L1_BYPASS_YES : begin
+          // Wait for orders from L1 TLB.
+          if (l1_fifo_valid_out) begin
+
+            // L1 hit - forward data from buffer
+            if (l1_accept_cur) begin
+              m_axi4_wlast       = hum_buf_wlast;
+              m_axi4_wdata       = hum_buf_wdata;
+              m_axi4_wstrb       = hum_buf_wstrb;
+              m_axi4_wuser       = hum_buf_wuser;
+
+              m_axi4_wvalid      = hum_buf_valid_out;
+              hum_buf_ready_in   = m_axi4_wready;
+
+              master_select_o    = l1_master_cur;
+
+              // Detect last data beat.
+              if (wlast_out) begin
+                fifo_select      = 1'b0;
+                w_done           = 1'b1;
+                hum_buf_SN       = STORE;
+              end
+
+            // L1 miss - wait for L2
+            end else if (l1_save_cur) begin
+              fifo_select        = 1'b0;
+              w_done             = 1'b1;
+              hum_buf_SN         = WAIT_L2_BYPASS_YES;
+
+            // L1 prefetch, prot, multi - drop data
+            end else if (l1_drop_cur) begin
+              fifo_select_SN      = 1'b0; // L1
+              hum_buf_drop_req_SN = 1'b1;
+              hum_buf_drop_len_SN = l1_len_cur;
+              hum_buf_SN          = FLUSH;
+            end
+          end
+        end
+
+        WAIT_L2_BYPASS_YES : begin
+          // Wait for orders from L2 TLB.
+          if (l2_fifo_valid_out) begin
+
+            // L2 hit - forward data from buffer
+            if (l2_accept_cur) begin
+              m_axi4_wlast       = hum_buf_wlast;
+              m_axi4_wdata       = hum_buf_wdata;
+              m_axi4_wstrb       = hum_buf_wstrb;
+              m_axi4_wuser       = hum_buf_wuser;
+
+              m_axi4_wvalid      = hum_buf_valid_out;
+              hum_buf_ready_in   = m_axi4_wready;
+
+              master_select_o    = l2_master_cur;
+
+              // Detect last data beat.
+              if (wlast_out) begin
+                fifo_select      = 1'b1;
+                w_done           = 1'b1;
+                hum_buf_SN       = STORE;
+              end
+
+            // L2 miss/prefetch hit
+            end else if (l2_drop_cur) begin
+              fifo_select_SN      = 1'b1; // L2
+              hum_buf_drop_req_SN = 1'b1;
+              hum_buf_drop_len_SN = l2_len_cur;
+              hum_buf_SN          = FLUSH;
+            end
+
+          // While we wait for orders from L2 TLB, we can still drop and accept L1 transactions.
+          end else if (l1_fifo_valid_out) begin
+
+            // L1 hit
+            if (l1_accept_cur) begin
+              hum_buf_SN         = BYPASS;
+
+            // L1 prefetch/prot/multi
+            end else if (l1_drop_cur) begin
+              hum_buf_SN         = DISCARD;
+            end
+          end
+        end
+
+        FLUSH : begin
+          // Clear HUM buffer flush request.
+          hum_buf_drop_req_SN = 1'b0;
+
+          // perform handshake with B sender
+          fifo_select      = fifo_select_SP;
+          b_drop_o         = 1'b1;
+          if (b_done_i) begin
+            hum_buf_SN     = STORE;
+          end
+        end
+
+        BYPASS : begin
+          // Forward one full transaction from input buffer.
+          m_axi4_wlast       = axi4_wlast;
+          m_axi4_wdata       = axi4_wdata;
+          m_axi4_wstrb       = axi4_wstrb;
+          m_axi4_wuser       = axi4_wuser;
+
+          m_axi4_wvalid      = axi4_wvalid;
+          axi4_wready        = m_axi4_wready;
+
+          master_select_o    = l1_master_cur;
+
+          // We have got a full transaction.
+          if (axi4_wlast & axi4_wready & axi4_wvalid) begin
+            fifo_select      = 1'b0;
+            w_done           = 1'b1;
+            hum_buf_SN       = WAIT_L2_BYPASS_YES;
+          end
+        end
+
+        DISCARD : begin
+          // Discard one full transaction from input buffer.
+          axi4_wready        = 1'b1;
+
+          // We have got a full transaction.
+          if (axi4_wlast & axi4_wready & axi4_wvalid) begin
+            // Try to perform handshake with B sender.
+            fifo_select      = 1'b0;
+            b_drop_o         = 1'b1;
+            // We cannot wait here due to axi4_wready.
+            if (b_done_i) begin
+              hum_buf_SN     = WAIT_L2_BYPASS_YES;
+            end else begin
+              hum_buf_SN     = DISCARD_FINISH;
+            end
+          end
+        end
+
+        DISCARD_FINISH : begin
+          // Perform handshake with B sender.
+          fifo_select      = 1'b0;
+          b_drop_o         = 1'b1;
+          if (b_done_i) begin
+            hum_buf_SN     = WAIT_L2_BYPASS_YES;
+          end
+        end
+
+        WAIT_L1_BYPASS_NO : begin
+          // Do not allow the forwarding of L1 hits.
+          block_forwarding       = 1'b1;
+
+          // Wait for orders from L1 TLB.
+          if (l1_fifo_valid_out) begin
+
+            // L1 hit - forward data from/through HUM buffer and refill the buffer
+            if (l1_accept_cur) begin
+              // Forward data from HUM buffer.
+              m_axi4_wlast       = hum_buf_wlast;
+              m_axi4_wdata       = hum_buf_wdata;
+              m_axi4_wstrb       = hum_buf_wstrb;
+              m_axi4_wuser       = hum_buf_wuser;
+
+              m_axi4_wvalid      = hum_buf_valid_out;
+              hum_buf_ready_in   = m_axi4_wready;
+
+              master_select_o    = l1_master_cur;
+
+              // Refill the HUM buffer. Stop when buffer full.
+              stop_store         = ~hum_buf_ready_out;
+              hum_buf_valid_in   = stop_store ? 1'b0 : axi4_wvalid      ;
+              axi4_wready        = stop_store ? 1'b0 : hum_buf_ready_out;
+
+              // Detect last data beat.
+              if (wlast_out) begin
+                fifo_select      = 1'b0;
+                w_done           = 1'b1;
+                if (~hum_buf_ready_out | hum_buf_almost_full) begin
+                  hum_buf_SN     = WAIT_L1_BYPASS_NO;
+                end else begin
+                  hum_buf_SN     = STORE;
+                end
+              end
+
+              // Allow the forwarding of L1 hits.
+              block_forwarding   = 1'b0;
+
+            // L1 miss - wait for L2
+            end else if (l1_save_cur) begin
+              fifo_select        = 1'b0;
+              w_done             = 1'b1;
+              hum_buf_SN         = WAIT_L2_BYPASS_NO;
+
+            // L1 prefetch, prot, multi - drop data
+            end else if (l1_drop_cur) begin
+              fifo_select_SN      = 1'b0; // L1
+              hum_buf_drop_req_SN = 1'b1;
+              hum_buf_drop_len_SN = l1_len_cur;
+              hum_buf_SN          = FLUSH;
+
+              // Allow the forwarding of L1 hits.
+              block_forwarding   = 1'b0;
+            end
+          end
+        end
+
+        WAIT_L2_BYPASS_NO : begin
+          // Do not allow the forwarding of L1 hits.
+          block_forwarding       = 1'b1;
+
+          // Wait for orders from L2 TLB.
+          if (l2_fifo_valid_out) begin
+
+            // L2 hit - forward first part from HUM buffer, rest from input buffer
+            if (l2_accept_cur) begin
+              // Forward data from HUM buffer.
+              m_axi4_wlast       = hum_buf_wlast;
+              m_axi4_wdata       = hum_buf_wdata;
+              m_axi4_wstrb       = hum_buf_wstrb;
+              m_axi4_wuser       = hum_buf_wuser;
+
+              m_axi4_wvalid      = hum_buf_valid_out;
+              hum_buf_ready_in   = m_axi4_wready;
+
+              master_select_o    = l2_master_cur;
+
+              // Refill the HUM buffer. Stop when buffer full.
+              stop_store         = ~hum_buf_ready_out;
+              hum_buf_valid_in   = stop_store ? 1'b0 : axi4_wvalid      ;
+              axi4_wready        = stop_store ? 1'b0 : hum_buf_ready_out;
+
+              // Detect last data beat.
+              if (wlast_out) begin
+                fifo_select      = 1'b1;
+                w_done           = 1'b1;
+                if (~hum_buf_ready_out | hum_buf_almost_full) begin
+                  hum_buf_SN     = WAIT_L1_BYPASS_NO;
+                end else begin
+                  hum_buf_SN     = STORE;
+                end
+              end
+
+              // Allow the forwarding of L1 hits.
+              block_forwarding   = 1'b0;
+
+            // L2 miss/prefetch hit - drop data
+            end else if (l2_drop_cur) begin
+              fifo_select_SN      = 1'b1; // L2
+              hum_buf_drop_req_SN = 1'b1;
+              hum_buf_drop_len_SN = l2_len_cur;
+              hum_buf_SN          = FLUSH;
+
+              // Allow the forwarding of L1 hits.
+              block_forwarding   = 1'b0;
+            end
+          end
+        end
+
+
+        default: begin
+          hum_buf_SN = STORE;
+        end
+
+      endcase // hum_buf_SP
+    end // HUM_BUFFER_FSM
+
+    assign b_drop_set = 1'b0;
+
+  end else begin // HUM_BUFFER
+
+    // register to perform the handshake with B sender
+    always_ff @(posedge axi4_aclk or negedge axi4_arstn) begin
+      if (axi4_arstn == 0) begin
+        b_drop_o <= 1'b0;
+      end else if (b_done_i) begin
+        b_drop_o <= 1'b0;
+      end else if (b_drop_set) begin
+        b_drop_o <= 1'b1;;
+      end
+    end
+
+    always_comb begin : OUTPUT_CTRL
+
+      fifo_select   = 1'b0;
+      w_done        = 1'b0;
+      b_drop_set    = 1'b0;
+
+      m_axi4_wlast  = 1'b0;
+      m_axi4_wdata  =  'b0;
+      m_axi4_wstrb  =  'b0;
+      m_axi4_wuser  =  'b0;
+
+      m_axi4_wvalid = 1'b0;
+      axi4_wready   = 1'b0;
+
+      if (l1_fifo_valid_out) begin
+        // forward data
+        if (l1_accept_cur) begin
+          m_axi4_wlast  = axi4_wlast;
+          m_axi4_wdata  = axi4_wdata;
+          m_axi4_wstrb  = axi4_wstrb;
+          m_axi4_wuser  = axi4_wuser;
+
+          m_axi4_wvalid = axi4_wvalid;
+          axi4_wready   = m_axi4_wready;
+
+          // Simply pop from FIFO upon last data beat.
+          w_done        = axi4_wlast & axi4_wvalid & axi4_wready;
+
+        // discard entire burst
+        end else if (b_drop_o == 1'b0) begin
+          axi4_wready   = 1'b1;
+
+          // Simply pop from FIFO upon last data beat. Perform handshake with B sender.
+          if (axi4_wlast & axi4_wvalid & axi4_wready)
+            b_drop_set  = 1'b1;
+        end
+      end
+
+    end // OUTPUT_CTRL
+
+    assign master_select_o     = l1_master_cur;
+    assign l2_fifo_ready_out   = 1'b1;
+    assign block_forwarding    = 1'b0;
+
+    // unused signals
+    assign hum_buf_ready_out   = 1'b0;
+    assign hum_buf_valid_in    = 1'b0;
+    assign hum_buf_ready_in    = 1'b0;
+    assign hum_buf_valid_out   = 1'b0;
+    assign hum_buf_wdata       =  'b0;
+    assign hum_buf_wstrb       =  'b0;
+    assign hum_buf_wlast       = 1'b0;
+    assign hum_buf_wuser       =  'b0;
+    assign hum_buf_drop_len_SN =  'b0;
+    assign hum_buf_drop_req_SN = 1'b0;
+    assign hum_buf_almost_full = 1'b0;
+
+    assign l2_fifo_valid_in    = 1'b0;
+    assign l2_fifo_valid_out   = 1'b0;
+    assign l2_prefetch_cur     = 1'b0;
+    assign l2_hit_cur          = 1'b0;
+    assign l2_id_cur           =  'b0;
+    assign l2_len_cur          =  'b0;
+    assign l2_master_cur       = 1'b0;
+    assign l2_accept_cur       = 1'b0;
+    assign l2_drop_cur         = 1'b0;
+
+    assign l2_req              = 1'b0;
+
+    assign fifo_select_SN      = 1'b0;
+    assign fifo_select_SP      = 1'b0;
+
+    assign stop_store          = 1'b0;
+    assign n_wlast_SP          =  'b0;
+    assign wlast_in            = 1'b0;
+    assign wlast_out           = 1'b0;
+
+  end // HUM_BUFFER
+
+  endgenerate
+"""
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi4_w_sender.py b/unused_please_ignore_completely/iommu/axi_rab/axi4_w_sender.py
new file mode 100644 (file)
index 0000000..9916334
--- /dev/null
@@ -0,0 +1,78 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi4_w_sender(Elaboratable):
+
+    def __init__(self):
+        self.axi4_aclk = Signal()  # input
+        self.axi4_arstn = Signal()  # input
+        self.s_axi4_wdata = Signal()  # input
+        self.s_axi4_wvalid = Signal()  # input
+        self.s_axi4_wready = Signal()  # output
+        self.s_axi4_wstrb = Signal()  # input
+        self.s_axi4_wlast = Signal()  # input
+        self.s_axi4_wuser = Signal()  # input
+        self.m_axi4_wdata = Signal()  # output
+        self.m_axi4_wvalid = Signal()  # output
+        self.m_axi4_wready = Signal()  # input
+        self.m_axi4_wstrb = Signal()  # output
+        self.m_axi4_wlast = Signal()  # output
+        self.m_axi4_wuser = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.d.comb += self.m_axi4_wdata.eq(self.s_axi4_wdata)
+        m.d.comb += self.m_axi4_wstrb.eq(self.s_axi4_wstrb)
+        m.d.comb += self.m_axi4_wlast.eq(self.s_axi4_wlast)
+        m.d.comb += self.m_axi4_wuser.eq(self.s_axi4_wuser)
+        m.d.comb += self.m_axi4_wvalid.eq(self.s_axi4_wvalid)
+        m.d.comb += self.s_axi4_wready.eq(self.m_axi4_wready)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module axi4_w_sender
+#  #(
+#    parameter AXI_DATA_WIDTH = 32,
+#    parameter AXI_USER_WIDTH = 2
+#  )
+#  (
+#    input                         axi4_aclk,
+#    input                         axi4_arstn,
+#
+#    input    [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
+#    input                         s_axi4_wvalid,
+#    output                        s_axi4_wready,
+#    input  [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
+#    input                         s_axi4_wlast,
+#    input    [AXI_USER_WIDTH-1:0] s_axi4_wuser,
+#
+#    output   [AXI_DATA_WIDTH-1:0] m_axi4_wdata,
+#    output                        m_axi4_wvalid,
+#    input                         m_axi4_wready,
+#    output [AXI_DATA_WIDTH/8-1:0] m_axi4_wstrb,
+#    output                        m_axi4_wlast,
+#    output   [AXI_USER_WIDTH-1:0] m_axi4_wuser
+#  );
+#
+#  assign m_axi4_wdata  = s_axi4_wdata;
+#  assign m_axi4_wstrb  = s_axi4_wstrb;
+#  assign m_axi4_wlast  = s_axi4_wlast;
+#  assign m_axi4_wuser  = s_axi4_wuser;
+#
+#  assign m_axi4_wvalid = s_axi4_wvalid;
+#  assign s_axi4_wready = m_axi4_wready;
+#
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi_buffer_rab.py b/unused_please_ignore_completely/iommu/axi_rab/axi_buffer_rab.py
new file mode 100644 (file)
index 0000000..b4d9929
--- /dev/null
@@ -0,0 +1,151 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_buffer_rab(Elaboratable):
+
+    def __init__(self):
+        self.clk = Signal()  # input
+        self.rstn = Signal()  # input
+        self.data_out = Signal(DATA_WIDTH)  # output
+        self.valid_out = Signal()  # output
+        self.ready_in = Signal()  # input
+        self.valid_in = Signal()  # input
+        self.data_in = Signal(DATA_WIDTH)  # input
+        self.ready_out = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.d.comb += self.full.eq(self.None)
+        m.d.comb += self.data_out.eq(self.None)
+        m.d.comb += self.valid_out.eq(self.None)
+        m.d.comb += self.ready_out.eq(self.None)
+        return m
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //import CfMath::log2;
+#
+# module axi_buffer_rab
+#  //#(
+#  //  parameter DATA_WIDTH,
+#  //  parameter BUFFER_DEPTH
+#  //)
+#  (
+#    input logic                   clk,
+#    input logic                   rstn,
+#
+#    // Downstream port
+#    output logic [DATA_WIDTH-1:0] data_out,
+#    output logic                  valid_out,
+#    input  logic                  ready_in,
+#
+#    // Upstream port
+#    input  logic                  valid_in,
+#    input  logic [DATA_WIDTH-1:0] data_in,
+#    output logic                  ready_out
+#  );
+#
+#  localparam integer LOG_BUFFER_DEPTH = log2(BUFFER_DEPTH);
+#
+#    // Internal data structures
+#    reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_in;   // location to which we last wrote
+#    reg [LOG_BUFFER_DEPTH - 1 : 0] pointer_out;  // location from which we last sent
+#    reg     [LOG_BUFFER_DEPTH : 0] elements;     // number of elements in the buffer
+#    reg       [DATA_WIDTH - 1 : 0] buffer [BUFFER_DEPTH - 1 : 0];
+#
+#    wire full;
+#
+#    integer loop1;
+#
+#    assign full = (elements == BUFFER_DEPTH);
+#
+#    always @(posedge clk or negedge rstn)
+#      begin: elements_sequential
+#        if (rstn == 1'b0)
+#          elements <= 0;
+#        else
+#        begin
+#          // ------------------
+#          // Are we filling up?
+#          // ------------------
+#          // One out, none in
+#          if (ready_in && valid_out && (!valid_in || full))
+#            elements <= elements - 1;
+#          // None out, one in
+#          else if ((!valid_out || !ready_in) && valid_in && !full)
+#            elements <= elements + 1;
+#          // Else, either one out and one in, or none out and none in - stays unchanged
+#        end
+#      end
+#
+#    always @(posedge clk or negedge rstn)
+#      begin: buffers_sequential
+#        if (rstn == 1'b0)
+#        begin
+#          for (loop1 = 0 ; loop1 < BUFFER_DEPTH ; loop1 = loop1 + 1)
+#            buffer[loop1] <= 0;
+#        end
+#        else
+#        begin
+#          // Update the memory
+#          if (valid_in && !full)
+#            buffer[pointer_in] <= data_in;
+#        end
+#      end
+#
+#    always @(posedge clk or negedge rstn)
+#      begin: sequential
+#        if (rstn == 1'b0)
+#        begin
+#          pointer_out <= 0;
+#          pointer_in <= 0;
+#        end
+#        else
+#        begin
+#          // ------------------------------------
+#          // Check what to do with the input side
+#          // ------------------------------------
+#          // We have some input, increase by 1 the input pointer
+#          if (valid_in && !full)
+#          begin
+#            if (pointer_in == $unsigned(BUFFER_DEPTH - 1))
+#              pointer_in <= 0;
+#            else
+#              pointer_in <= pointer_in + 1;
+#          end
+#          // Else we don't have any input, the input pointer stays the same
+#
+#          // -------------------------------------
+#          // Check what to do with the output side
+#          // -------------------------------------
+#          // We had pushed one flit out, we can try to go for the next one
+#          if (ready_in && valid_out)
+#          begin
+#            if (pointer_out == $unsigned(BUFFER_DEPTH - 1))
+#              pointer_out <= 0;
+#            else
+#              pointer_out <= pointer_out + 1;
+#          end
+#          // Else stay on the same output location
+#        end
+#      end
+#
+#    // Update output ports
+#    assign data_out = buffer[pointer_out];
+#    assign valid_out = (elements != 0);
+#
+#    assign ready_out = ~full;
+#
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi_buffer_rab_bram.py b/unused_please_ignore_completely/iommu/axi_rab/axi_buffer_rab_bram.py
new file mode 100644 (file)
index 0000000..349b314
--- /dev/null
@@ -0,0 +1,209 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_buffer_rab_bram(Elaboratable):
+
+    def __init__(self):
+        self.clk = Signal()  # input
+        self.rstn = Signal()  # input
+        self.data_out = Signal(DATA_WIDTH)  # output
+        self.valid_out = Signal()  # output
+        self.ready_in = Signal()  # input
+        self.valid_in = Signal()  # input
+        self.data_in = Signal(DATA_WIDTH)  # input
+        self.ready_out = Signal()  # output
+        self.almost_full = Signal()  # output
+        self.underfull = Signal()  # output
+        self.drop_req = Signal()  # input
+        self.drop_len = Signal(8)  # input
+
+    def elaborate(self, platform=None):
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# ////import CfMath::log2;
+#
+# module axi_buffer_rab_bram
+#  //#(
+#  //  parameter DATA_WIDTH,
+#  //  parameter BUFFER_DEPTH
+#  //  )
+#   (
+#    input logic                   clk,
+#    input logic                   rstn,
+#
+#    // Downstream port
+#    output logic [DATA_WIDTH-1:0] data_out,
+#    output logic                  valid_out,
+#    input  logic                  ready_in,
+#
+#    // Upstream port
+#    input  logic                  valid_in,
+#    input  logic [DATA_WIDTH-1:0] data_in,
+#    output logic                  ready_out,
+#
+#    // Status and drop control
+#    output logic                  almost_full,
+#    output logic                  underfull,
+#    input  logic                  drop_req,
+#    // Number of items to drop.  As for AXI lengths, counting starts at zero, i.e., `drop_len == 0`
+#    // and `drop_req` means drop one item.
+#    input  logic [7:0]            drop_len
+#    );
+#
+"""  #docstring_begin
+  // The BRAM needs to be in "write-first" mode for first-word fall-through FIFO behavior.
+  // To still push and pop simultaneously if the buffer is full, we internally increase the
+  // buffer depth by 1.
+  localparam ACT_BUFFER_DEPTH     = BUFFER_DEPTH+1;
+  localparam ACT_LOG_BUFFER_DEPTH = log2(ACT_BUFFER_DEPTH+1);
+
+  /**
+    * Internal data structures
+    */
+  // Location to which we last wrote
+  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_in_d,                   ptr_in_q;
+  // Location from which we last sent
+  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_d,                  ptr_out_q;
+  // Required for fall-through behavior on the first word
+  logic        [ACT_LOG_BUFFER_DEPTH-1:0] ptr_out_bram;
+  // Number of elements in the buffer.  Can be negative if elements that have been dropped have not
+  // yet been written.
+  logic signed   [ACT_LOG_BUFFER_DEPTH:0] n_elems_d,                  n_elems_q;
+
+  logic           [DATA_WIDTH-1:0]        data_out_bram, data_out_q;
+  logic                                   valid_out_q;
+
+  logic full;
+
+  assign almost_full = (n_elems_q == BUFFER_DEPTH-1);
+  assign full        = (n_elems_q == BUFFER_DEPTH);
+
+  always_ff @(posedge clk, negedge rstn) begin
+    if (~rstn) begin
+      n_elems_q <= '0;
+      ptr_in_q  <= '0;
+      ptr_out_q <= '0;
+    end else begin
+      n_elems_q <= n_elems_d;
+      ptr_in_q  <= ptr_in_d;
+      ptr_out_q <= ptr_out_d;
+    end
+  end
+
+  // Update the number of elements.
+  always_comb begin
+    n_elems_d = n_elems_q;
+    if (drop_req) begin
+      n_elems_d -= (drop_len + 1);
+    end
+    if (valid_in && ready_out) begin
+      n_elems_d += 1;
+    end
+    if (valid_out && ready_in) begin
+      n_elems_d -= 1;
+    end
+  end
+
+  // Update the output pointer.
+  always_comb begin
+    ptr_out_d = ptr_out_q;
+    if (drop_req) begin
+      if ((ptr_out_q + drop_len + 1) > (ACT_BUFFER_DEPTH - 1)) begin
+        ptr_out_d = drop_len + 1 - (ACT_BUFFER_DEPTH - ptr_out_q);
+      end else begin
+        ptr_out_d += (drop_len + 1);
+      end
+    end
+    if (valid_out && ready_in) begin
+      if (ptr_out_d == (ACT_BUFFER_DEPTH - 1)) begin
+        ptr_out_d = '0;
+      end else begin
+        ptr_out_d += 1;
+      end
+    end
+  end
+
+  // The BRAM has a read latency of one cycle, so apply the new address one cycle earlier for
+  // first-word fall-through FIFO behavior.
+  //assign ptr_out_bram = (ptr_out_q == (ACT_BUFFER_DEPTH-1)) ? '0 : (ptr_out_q + 1);
+  assign ptr_out_bram = ptr_out_d;
+
+  // Update the input pointer.
+  always_comb begin
+    ptr_in_d = ptr_in_q;
+    if (valid_in && ready_out) begin
+      if (ptr_in_d == (ACT_BUFFER_DEPTH - 1)) begin
+        ptr_in_d = '0;
+      end else begin
+        ptr_in_d += 1;
+      end
+    end
+  end
+
+  // Update output ports.
+  assign valid_out = (n_elems_q > $signed(0));
+  assign underfull = (n_elems_q < $signed(0));
+  assign ready_out = ~full;
+
+  ram_tp_write_first #(
+    .ADDR_WIDTH ( ACT_LOG_BUFFER_DEPTH ),
+    .DATA_WIDTH ( DATA_WIDTH           )
+  )
+  ram_tp_write_first_0
+  (
+    .clk   ( clk              ),
+    .we    ( valid_in & ~full ),
+    .addr0 ( ptr_in_q         ),
+    .addr1 ( ptr_out_bram     ),
+    .d_i   ( data_in          ),
+    .d0_o  (                  ),
+    .d1_o  ( data_out_bram    )
+  );
+
+  // When reading from/writing two the same address on both ports ("Write-Read Collision"),
+  // the data on the read port is invalid (during the write cycle). In this implementation,
+  // this can happen only when the buffer is empty. Thus, we forward the data from an
+  // register in this case.
+  always @(posedge clk) begin
+    if (rstn == 1'b0) begin
+      data_out_q <= 'b0;
+    end else if ( (ptr_out_bram == ptr_in_q) && (valid_in && !full) ) begin
+      data_out_q <= data_in;
+    end
+  end
+
+  always @(posedge clk) begin
+    if (rstn == 1'b0) begin
+      valid_out_q <= 'b0;
+    end else begin
+      valid_out_q <= valid_out;
+    end
+  end
+
+  // Drive output data
+  always_comb begin
+    if (valid_out && !valid_out_q) begin // We have just written to an empty FIFO
+      data_out = data_out_q;
+    end else begin
+      data_out = data_out_bram;
+    end
+  end
+
+"""
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi_rab_cfg.py b/unused_please_ignore_completely/iommu/axi_rab/axi_rab_cfg.py
new file mode 100644 (file)
index 0000000..43843b9
--- /dev/null
@@ -0,0 +1,707 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_rab_cfg(Elaboratable):
+
+    def __init__(self):
+        self.Clk_CI = Signal()  # input
+        self.Rst_RBI = Signal()  # input
+        self.s_axi_awaddr = Signal(AXI_ADDR_WIDTH)  # input
+        self.s_axi_awvalid = Signal()  # input
+        self.s_axi_awready = Signal()  # output
+        self.s_axi_wdata = Signal()  # input
+        self.s_axi_wstrb = Signal(1+ERROR p_expression_25)  # input
+        self.s_axi_wvalid = Signal()  # input
+        self.s_axi_wready = Signal()  # output
+        self.s_axi_bresp = Signal(2)  # output
+        self.s_axi_bvalid = Signal()  # output
+        self.s_axi_bready = Signal()  # input
+        self.s_axi_araddr = Signal(AXI_ADDR_WIDTH)  # input
+        self.s_axi_arvalid = Signal()  # input
+        self.s_axi_arready = Signal()  # output
+        self.s_axi_rdata = Signal(AXI_DATA_WIDTH)  # output
+        self.s_axi_rresp = Signal(2)  # output
+        self.s_axi_rvalid = Signal()  # output
+        self.s_axi_rready = Signal()  # input
+        self.L1Cfg_DO = Signal()  # output
+        self.L1AllowMultiHit_SO = Signal()  # output
+        self.MissAddr_DI = Signal(ADDR_WIDTH_VIRT)  # input
+        self.MissMeta_DI = Signal(MISS_META_WIDTH)  # input
+        self.Miss_SI = Signal()  # input
+        self.MhFifoFull_SO = Signal()  # output
+        self.wdata_l2 = Signal()  # output
+        self.waddr_l2 = Signal()  # output
+        self.wren_l2 = Signal(N_PORTS)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# // --=========================================================================--
+# //
+# //  █████╗ ██╗  ██╗██╗    ██████╗  █████╗ ██████╗      ██████╗███████╗ ██████╗
+# // ██╔══██╗╚██╗██╔╝██║    ██╔══██╗██╔══██╗██╔══██╗    ██╔════╝██╔════╝██╔════╝
+# // ███████║ ╚███╔╝ ██║    ██████╔╝███████║██████╔╝    ██║     █████╗  ██║  ███╗
+# // ██╔══██║ ██╔██╗ ██║    ██╔══██╗██╔══██║██╔══██╗    ██║     ██╔══╝  ██║   ██║
+# // ██║  ██║██╔╝ ██╗██║    ██║  ██║██║  ██║██████╔╝    ╚██████╗██║     ╚██████╔╝
+# // ╚═╝  ╚═╝╚═╝  ╚═╝╚═╝    ╚═╝  ╚═╝╚═╝  ╚═╝╚═════╝      ╚═════╝╚═╝      ╚═════╝
+# //
+# //
+# // Author: Pirmin Vogel - vogelpi@iis.ee.ethz.ch
+# //
+# // Purpose : AXI4-Lite configuration and miss handling interface for RAB
+# //
+# // --=========================================================================--
+#
+# //import CfMath::log2;
+#
+# module axi_rab_cfg
+#  #(
+#    parameter N_PORTS         =   3,
+#    parameter N_REGS          = 196,
+#    parameter N_L2_SETS       =  32,
+#    parameter N_L2_SET_ENTRIES=  32,
+#    parameter ADDR_WIDTH_PHYS =  40,
+#    parameter ADDR_WIDTH_VIRT =  32,
+#    parameter N_FLAGS         =   4,
+#    parameter AXI_DATA_WIDTH  =  64,
+#    parameter AXI_ADDR_WIDTH  =  32,
+#    parameter MISS_META_WIDTH =  10,  // <= FIFO_WIDTH
+#    parameter MH_FIFO_DEPTH   =  16
+#    )
+#   (
+#    input  logic                                    Clk_CI,
+#    input  logic                                    Rst_RBI,
+#
+#    // AXI Lite interface
+#    input  logic [AXI_ADDR_WIDTH-1:0]               s_axi_awaddr,
+#    input  logic                                    s_axi_awvalid,
+#    output logic                                    s_axi_awready,
+#    input  logic [AXI_DATA_WIDTH/8-1:0][7:0]        s_axi_wdata,
+#    input  logic [AXI_DATA_WIDTH/8-1:0]             s_axi_wstrb,
+#    input  logic                                    s_axi_wvalid,
+#    output logic                                    s_axi_wready,
+#    output logic [1:0]                              s_axi_bresp,
+#    output logic                                    s_axi_bvalid,
+#    input  logic                                    s_axi_bready,
+#    input  logic [AXI_ADDR_WIDTH-1:0]               s_axi_araddr,
+#    input  logic                                    s_axi_arvalid,
+#    output logic                                    s_axi_arready,
+#    output logic [AXI_DATA_WIDTH-1:0]               s_axi_rdata,
+#    output logic [1:0]                              s_axi_rresp,
+#    output logic                                    s_axi_rvalid,
+#    input  logic                                    s_axi_rready,
+#
+#    // Slice configuration
+#    output logic [N_REGS-1:0][63:0]                 L1Cfg_DO,
+#    output logic                                    L1AllowMultiHit_SO,
+#
+#    // Miss handling
+#    input  logic [ADDR_WIDTH_VIRT-1:0]              MissAddr_DI,
+#    input  logic [MISS_META_WIDTH-1:0]              MissMeta_DI,
+#    input  logic                                    Miss_SI,
+#    output logic                                    MhFifoFull_SO,
+#
+#    // L2 TLB
+#    output logic [N_PORTS-1:0] [AXI_DATA_WIDTH-1:0] wdata_l2,
+#    output logic [N_PORTS-1:0] [AXI_ADDR_WIDTH-1:0] waddr_l2,
+#    output logic [N_PORTS-1:0]                      wren_l2
+#  );
+#
+"""  #docstring_begin
+
+  localparam ADDR_LSB = log2(64/8); // 64 even if the AXI Lite interface is 32,
+                                    // because RAB slices are 64 bit wide.
+  localparam ADDR_MSB = log2(N_REGS)+ADDR_LSB-1;
+
+  localparam L2SINGLE_AMAP_SIZE = 16'h4000; // Maximum 2048 TLB entries in L2
+
+  localparam integer N_L2_ENTRIES = N_L2_SETS * N_L2_SET_ENTRIES;
+
+  localparam logic [AXI_ADDR_WIDTH-1:0] L2_VA_MAX_ADDR = (N_L2_ENTRIES-1) << 2;
+
+  logic [AXI_DATA_WIDTH/8-1:0][7:0] L1Cfg_DP[N_REGS]; // [Byte][Bit]
+  genvar j;
+
+  //  █████╗ ██╗  ██╗██╗██╗  ██╗      ██╗     ██╗████████╗███████╗
+  // ██╔══██╗╚██╗██╔╝██║██║  ██║      ██║     ██║╚══██╔══╝██╔════╝
+  // ███████║ ╚███╔╝ ██║███████║█████╗██║     ██║   ██║   █████╗
+  // ██╔══██║ ██╔██╗ ██║╚════██║╚════╝██║     ██║   ██║   ██╔══╝
+  // ██║  ██║██╔╝ ██╗██║     ██║      ███████╗██║   ██║   ███████╗
+  // ╚═╝  ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝      ╚══════╝╚═╝   ╚═╝   ╚══════╝
+  //
+  logic [AXI_ADDR_WIDTH-1:0]        awaddr_reg;
+  logic                             awaddr_done_rise;
+  logic                             awaddr_done_reg;
+  logic                             awaddr_done_reg_dly;
+
+  logic [AXI_DATA_WIDTH/8-1:0][7:0] wdata_reg;
+  logic [AXI_DATA_WIDTH/8-1:0]      wstrb_reg;
+  logic                             wdata_done_rise;
+  logic                             wdata_done_reg;
+  logic                             wdata_done_reg_dly;
+
+  logic                             wresp_done_reg;
+  logic                             wresp_running_reg;
+
+  logic [AXI_ADDR_WIDTH-1:0]        araddr_reg;
+  logic                             araddr_done_reg;
+
+  logic [AXI_DATA_WIDTH-1:0]        rdata_reg;
+  logic                             rresp_done_reg;
+  logic                             rresp_running_reg;
+
+  logic                             awready;
+  logic                             wready;
+  logic                             bvalid;
+
+  logic                             arready;
+  logic                             rvalid;
+
+  logic                             wren;
+  logic                             wren_l1;
+
+  assign wren = ( wdata_done_rise & awaddr_done_reg ) | ( awaddr_done_rise & wdata_done_reg );
+  assign wdata_done_rise  = wdata_done_reg  & ~wdata_done_reg_dly;
+  assign awaddr_done_rise = awaddr_done_reg & ~awaddr_done_reg_dly;
+
+  // reg_dly
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            wdata_done_reg_dly  <= 1'b0;
+            awaddr_done_reg_dly <= 1'b0;
+         end
+       else
+         begin
+            wdata_done_reg_dly  <= wdata_done_reg;
+            awaddr_done_reg_dly <= awaddr_done_reg;
+         end
+    end
+
+  // AW Channel
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            awaddr_done_reg <= 1'b0;
+            awaddr_reg      <= '0;
+            awready         <= 1'b1;
+         end
+       else
+         begin
+            if (awready && s_axi_awvalid)
+              begin
+                 awready         <= 1'b0;
+                 awaddr_done_reg <= 1'b1;
+                 awaddr_reg      <= s_axi_awaddr;
+              end
+            else if (awaddr_done_reg && wresp_done_reg)
+              begin
+                 awready         <= 1'b1;
+                 awaddr_done_reg <= 1'b0;
+              end
+         end
+    end
+
+  // W Channel
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            wdata_done_reg <= 1'b0;
+            wready         <= 1'b1;
+            wdata_reg      <= '0;
+            wstrb_reg      <= '0;
+         end
+       else
+         begin
+            if (wready && s_axi_wvalid)
+              begin
+                 wready         <= 1'b0;
+                 wdata_done_reg <= 1'b1;
+                 wdata_reg      <= s_axi_wdata;
+                 wstrb_reg      <= s_axi_wstrb;
+              end
+            else if (wdata_done_reg && wresp_done_reg)
+              begin
+                 wready         <= 1'b1;
+                 wdata_done_reg <= 1'b0;
+              end
+         end
+    end
+
+  // B Channel
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            bvalid            <= 1'b0;
+            wresp_done_reg    <= 1'b0;
+            wresp_running_reg <= 1'b0;
+         end
+       else
+         begin
+            if (awaddr_done_reg && wdata_done_reg && !wresp_done_reg)
+              begin
+                 if (!wresp_running_reg)
+                   begin
+                      bvalid            <= 1'b1;
+                      wresp_running_reg <= 1'b1;
+                   end
+                 else if (s_axi_bready)
+                   begin
+                      bvalid            <= 1'b0;
+                      wresp_done_reg    <= 1'b1;
+                      wresp_running_reg <= 1'b0;
+                   end
+              end
+            else
+              begin
+                 bvalid            <= 1'b0;
+                 wresp_done_reg    <= 1'b0;
+                 wresp_running_reg <= 1'b0;
+              end
+         end
+    end
+
+  // AR Channel
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            araddr_done_reg <= 1'b0;
+            arready         <= 1'b1;
+            araddr_reg       <= '0;
+         end
+       else
+         begin
+            if (arready && s_axi_arvalid)
+              begin
+                 arready         <= 1'b0;
+                 araddr_done_reg <= 1'b1;
+                 araddr_reg      <= s_axi_araddr;
+              end
+            else if (araddr_done_reg && rresp_done_reg)
+              begin
+                 arready         <= 1'b1;
+                 araddr_done_reg <= 1'b0;
+              end
+         end
+    end
+
+  // R Channel
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin
+       if (!Rst_RBI)
+         begin
+            rresp_done_reg    <= 1'b0;
+            rvalid            <= 1'b0;
+            rresp_running_reg <= 1'b0;
+         end
+       else
+         begin
+            if (araddr_done_reg && !rresp_done_reg)
+              begin
+                 if (!rresp_running_reg)
+                   begin
+                      rvalid            <= 1'b1;
+                      rresp_running_reg <= 1'b1;
+                   end
+                 else if (s_axi_rready)
+                   begin
+                      rvalid            <= 1'b0;
+                      rresp_done_reg    <= 1'b1;
+                      rresp_running_reg <= 1'b0;
+                   end
+              end
+            else
+              begin
+                 rvalid            <= 1'b0;
+                 rresp_done_reg    <= 1'b0;
+                 rresp_running_reg <= 1'b0;
+              end
+         end
+    end
+
+  // ██╗     ██╗     ██████╗███████╗ ██████╗     ██████╗ ███████╗ ██████╗
+  // ██║    ███║    ██╔════╝██╔════╝██╔════╝     ██╔══██╗██╔════╝██╔════╝
+  // ██║    ╚██║    ██║     █████╗  ██║  ███╗    ██████╔╝█████╗  ██║  ███╗
+  // ██║     ██║    ██║     ██╔══╝  ██║   ██║    ██╔══██╗██╔══╝  ██║   ██║
+  // ███████╗██║    ╚██████╗██║     ╚██████╔╝    ██║  ██║███████╗╚██████╔╝
+  // ╚══════╝╚═╝     ╚═════╝╚═╝      ╚═════╝     ╚═╝  ╚═╝╚══════╝ ╚═════╝
+  //
+  assign wren_l1 = wren && (awaddr_reg < L2SINGLE_AMAP_SIZE);
+
+  always @( posedge Clk_CI or negedge Rst_RBI )
+    begin
+      var integer idx_reg, idx_byte;
+      if ( Rst_RBI == 1'b0 )
+        begin
+          for ( idx_reg = 0; idx_reg < N_REGS; idx_reg++ )
+            L1Cfg_DP[idx_reg] <= '0;
+        end
+      else if ( wren_l1 )
+          begin
+            if ( awaddr_reg[ADDR_LSB+1] == 1'b0 ) begin                     // VIRT_ADDR
+              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
+                if ( (idx_byte < ADDR_WIDTH_VIRT/8) ) begin
+                  if ( wstrb_reg[idx_byte] ) begin
+                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
+                  end
+                end
+                else begin  // Let synthesizer optimize away unused registers.
+                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
+                end
+              end
+            end
+            else if ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b10 ) begin      // PHYS_ADDR
+              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
+                if ( (idx_byte < ADDR_WIDTH_PHYS/8) ) begin
+                  if ( wstrb_reg[idx_byte] ) begin
+                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte];
+                  end
+                end
+                else begin  // Let synthesizer optimize away unused registers.
+                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
+                end
+              end
+            end
+            else begin // ( awaddr_reg[ADDR_LSB+1:ADDR_LSB] == 2'b11 )      // FLAGS
+              for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ ) begin
+                if ( (idx_byte < 1) ) begin
+                  if ( wstrb_reg[idx_byte] ) begin
+                    L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= wdata_reg[idx_byte] & { {{8-N_FLAGS}{1'b0}}, {{N_FLAGS}{1'b1}} };
+                  end
+                end
+                else begin  // Let synthesizer optimize away unused registers.
+                  L1Cfg_DP[awaddr_reg[ADDR_MSB:ADDR_LSB]][idx_byte] <= '0;
+                end
+              end
+            end
+          end
+    end // always @ ( posedge Clk_CI or negedge Rst_RBI )
+
+  generate
+    // Mask unused bits -> Synthesizer should optimize away unused registers
+    for( j=0; j<N_REGS; j++ ) begin
+      if ( j[1] == 1'b0 ) // VIRT_ADDR
+        assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_VIRT}{1'b0}},{ADDR_WIDTH_VIRT{1'b1}} } & L1Cfg_DP[j];
+      else if ( j[1:0] == 2'b10 ) // PHYS_ADDR
+        assign L1Cfg_DO[j] = { {{64-ADDR_WIDTH_PHYS}{1'b0}},{ADDR_WIDTH_PHYS{1'b1}} } & L1Cfg_DP[j];
+      else // if ( j[1:0] == 2'b11 ) // FLAGS
+        assign L1Cfg_DO[j] = { {{64-N_FLAGS}{1'b0}},{N_FLAGS{1'b1}} } & L1Cfg_DP[j];
+    end
+  endgenerate
+
+  always_comb
+    begin
+      if ( araddr_reg[ADDR_LSB-1] == 1'b1 ) // read upper 32 bit, for debugging over 32-bit interface
+        rdata_reg = { {32'h00000000},{L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]][63:32]} };
+      else
+        rdata_reg = L1Cfg_DO[araddr_reg[ADDR_MSB:ADDR_LSB]];
+    end
+
+  assign s_axi_awready = awready;
+  assign s_axi_wready  = wready;
+
+  assign s_axi_bresp   = 2'b00;
+  assign s_axi_bvalid  = bvalid;
+
+  assign s_axi_arready = arready;
+  assign s_axi_rresp   = 2'b00;
+  assign s_axi_rvalid  = rvalid;
+
+  // ██╗     ██████╗      ██████╗███████╗ ██████╗
+  // ██║     ╚════██╗    ██╔════╝██╔════╝██╔════╝
+  // ██║      █████╔╝    ██║     █████╗  ██║  ███╗
+  // ██║     ██╔═══╝     ██║     ██╔══╝  ██║   ██║
+  // ███████╗███████╗    ╚██████╗██║     ╚██████╔╝
+  // ╚══════╝╚══════╝     ╚═════╝╚═╝      ╚═════╝
+  //
+  logic [N_PORTS-1:0] l2_addr_is_in_va_rams;
+  logic [N_PORTS-1:0] upper_word_is_written;
+  logic [N_PORTS-1:0] lower_word_is_written;
+  generate
+    for( j=0; j< N_PORTS; j++)
+      begin
+        if (AXI_DATA_WIDTH == 64) begin
+          assign l2_addr_is_in_va_rams[j] = (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg[log2(L2SINGLE_AMAP_SIZE)-1:0] <= L2_VA_MAX_ADDR);
+          assign upper_word_is_written[j] = (wstrb_reg[7:4] != 4'b0000);
+          assign lower_word_is_written[j] = (wstrb_reg[3:0] != 4'b0000);
+        end else begin
+          assign l2_addr_is_in_va_rams[j] = 1'b0;
+          assign upper_word_is_written[j] = 1'b0;
+          assign lower_word_is_written[j] = 1'b0;
+        end
+
+        always @( posedge Clk_CI or negedge Rst_RBI ) begin
+          var integer idx_byte, off_byte;
+          if ( Rst_RBI == 1'b0 )
+            begin
+              wren_l2[j]  <= 1'b0;
+              wdata_l2[j] <= '0;
+            end
+          else if (wren)
+            begin
+              if ( (awaddr_reg >= (j+1)*L2SINGLE_AMAP_SIZE) && (awaddr_reg < (j+2)*L2SINGLE_AMAP_SIZE) && (|wstrb_reg) )
+                wren_l2[j] <= 1'b1;
+              if      (AXI_DATA_WIDTH == 32) begin
+                for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8; idx_byte++ )
+                  wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte] & {8{wstrb_reg[idx_byte]}};
+              end
+              else if (AXI_DATA_WIDTH == 64) begin
+                if (lower_word_is_written[j] == 1'b1)
+                  off_byte = 0;
+                else
+                  off_byte = 4;
+                // always put the payload in the lower word and set upper word to 0
+                for ( idx_byte = 0; idx_byte < AXI_DATA_WIDTH/8/2; idx_byte++ )
+                    wdata_l2[j][idx_byte*8 +: 8] <= wdata_reg[idx_byte+off_byte] & {8{wstrb_reg[idx_byte+off_byte]}};
+                wdata_l2[j][AXI_DATA_WIDTH-1:AXI_DATA_WIDTH/2] <= 'b0;
+              end
+              // pragma translate_off
+              else
+                $fatal(1, "Unsupported AXI_DATA_WIDTH!");
+              // pragma translate_on
+            end
+          else
+            wren_l2[j] <= '0;
+        end // always @ ( posedge Clk_CI or negedge Rst_RBI )
+
+        // Properly align the 32-bit word address when writing from 64-bit interface:
+        // Depending on the system, the incoming address is (non-)aligned to the 64-bit
+        // word when writing the upper 32-bit word.
+        always_comb begin
+          waddr_l2[j] = (awaddr_reg -(j+1)*L2SINGLE_AMAP_SIZE)/4;
+          if (wren_l2[j]) begin
+            if (AXI_DATA_WIDTH == 64) begin
+              if (upper_word_is_written[j] == 1'b1) begin
+                // address must be non-aligned
+                waddr_l2[j][0] = 1'b1;
+              end
+            end
+            // pragma translate_off
+            else if (AXI_DATA_WIDTH != 32) begin
+              $fatal(1, "Unsupported AXI_DATA_WIDTH!");
+            end
+            // pragma translate_on
+          end
+        end
+
+        // Assert that only one 32-bit word is ever written at a time to VA RAMs on 64-bit data
+        // systems.
+        // pragma translate_off
+        always_ff @ (posedge Clk_CI) begin
+          if (AXI_DATA_WIDTH == 64) begin
+            if  (l2_addr_is_in_va_rams[j]) begin
+              if (upper_word_is_written[j]) begin
+                assert (!lower_word_is_written[j])
+                  else $error("Unsupported write across two 32-bit words to VA RAMs!");
+              end
+              else if (lower_word_is_written[j]) begin
+                assert (!upper_word_is_written[j])
+                  else $error("Unsupported write across two 32-bit words to VA RAMs!");
+              end
+            end
+          end
+        end
+        // pragma translate_on
+
+      end // for (j=0; j< N_PORTS; j++)
+   endgenerate
+
+  // ███╗   ███╗██╗  ██╗    ███████╗██╗███████╗ ██████╗ ███████╗
+  // ████╗ ████║██║  ██║    ██╔════╝██║██╔════╝██╔═══██╗██╔════╝
+  // ██╔████╔██║███████║    █████╗  ██║█████╗  ██║   ██║███████╗
+  // ██║╚██╔╝██║██╔══██║    ██╔══╝  ██║██╔══╝  ██║   ██║╚════██║
+  // ██║ ╚═╝ ██║██║  ██║    ██║     ██║██║     ╚██████╔╝███████║
+  // ╚═╝     ╚═╝╚═╝  ╚═╝    ╚═╝     ╚═╝╚═╝      ╚═════╝ ╚══════╝
+  //
+  logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDin_D;
+  logic                       AddrFifoWen_S;
+  logic                       AddrFifoRen_S;
+  logic [ADDR_WIDTH_VIRT-1:0] AddrFifoDout_D;
+  logic                       AddrFifoFull_S;
+  logic                       AddrFifoEmpty_S;
+  logic                       AddrFifoEmpty_SB;
+  logic                       AddrFifoFull_SB;
+
+  logic [MISS_META_WIDTH-1:0] MetaFifoDin_D;
+  logic                       MetaFifoWen_S;
+  logic                       MetaFifoRen_S;
+  logic [MISS_META_WIDTH-1:0] MetaFifoDout_D;
+  logic                       MetaFifoFull_S;
+  logic                       MetaFifoEmpty_S;
+  logic                       MetaFifoEmpty_SB;
+  logic                       MetaFifoFull_SB;
+
+  logic                       FifosDisabled_S;
+  logic                       ConfRegWen_S;
+  logic                 [1:0] ConfReg_DN;
+  logic                 [1:0] ConfReg_DP;
+
+  logic [AXI_DATA_WIDTH-1:0] wdata_reg_vec;
+
+  assign FifosDisabled_S    = ConfReg_DP[0];
+  assign L1AllowMultiHit_SO = ConfReg_DP[1];
+
+  assign AddrFifoEmpty_S = ~AddrFifoEmpty_SB;
+  assign MetaFifoEmpty_S = ~MetaFifoEmpty_SB;
+
+  assign AddrFifoFull_S = ~AddrFifoFull_SB;
+  assign MetaFifoFull_S = ~MetaFifoFull_SB;
+
+  assign MhFifoFull_SO = (AddrFifoWen_S & AddrFifoFull_S) | (MetaFifoWen_S & MetaFifoFull_S);
+
+  generate
+     for ( j=0; j<AXI_DATA_WIDTH/8; j++ )
+       assign wdata_reg_vec[(j+1)*8-1:j*8] = wdata_reg[j];
+  endgenerate
+
+  // write address FIFO
+  always_comb
+    begin
+       AddrFifoWen_S = 1'b0;
+       AddrFifoDin_D = 'b0;
+       if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
+         begin
+            AddrFifoWen_S = 1'b1;
+            AddrFifoDin_D = MissAddr_DI;
+         end
+       else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 'b0) && (FifosDisabled_S == 1'b0)) // write request from AXI interface
+         begin
+            AddrFifoWen_S = 1'b1;
+            AddrFifoDin_D = wdata_reg_vec[ADDR_WIDTH_VIRT-1:0];
+         end
+    end
+
+  // write meta FIFO
+  always_comb
+    begin
+       MetaFifoWen_S = 1'b0;
+       MetaFifoDin_D = 'b0;
+       if ( (Miss_SI == 1'b1) && (FifosDisabled_S == 1'b0) ) // register a new miss
+         begin
+            MetaFifoWen_S                      = 1'b1;
+            MetaFifoDin_D[MISS_META_WIDTH-1:0] = MissMeta_DI;
+         end
+       else if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 4'h8) && (FifosDisabled_S == 1'b0) ) // write request from AXI interface
+         begin
+            MetaFifoWen_S = 1'b1;
+            MetaFifoDin_D = wdata_reg_vec[MISS_META_WIDTH-1:0];
+         end
+    end
+
+  // write configuration register
+  always_comb
+    begin
+       ConfRegWen_S = 1'b0;
+       ConfReg_DN   = 1'b0;
+       if ( (wren_l1 == 1'b1) && (awaddr_reg[ADDR_MSB:0] == 8'h10) ) // write request from AXI interface
+         begin
+            ConfRegWen_S = 1'b1;
+            ConfReg_DN   = wdata_reg_vec[$high(ConfReg_DN):0];
+         end
+    end
+
+  // AXI read data
+  always_comb
+    begin
+       s_axi_rdata   = rdata_reg; // read L1 config
+       AddrFifoRen_S = 1'b0;
+       MetaFifoRen_S = 1'b0;
+       if ( rvalid == 1'b1 )
+         begin
+            // read address FIFO
+            if ( araddr_reg[ADDR_MSB:0] == 'b0 )
+              begin
+                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
+                s_axi_rdata[ADDR_WIDTH_VIRT-1:0] = AddrFifoDout_D;
+                if ( AddrFifoEmpty_S == 1'b0 )
+                  AddrFifoRen_S = 1'b1;
+              end
+            // read meta FIFO
+            else if ( araddr_reg[ADDR_MSB:0] == 4'h8 )
+              begin
+                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
+                s_axi_rdata[31]                  = MetaFifoEmpty_S;
+                s_axi_rdata[MISS_META_WIDTH-1:0] = MetaFifoDout_D;
+                if ( MetaFifoEmpty_S == 1'b0 )
+                  MetaFifoRen_S = 1'b1;
+              end
+            // read configuration register
+            else if ( araddr_reg[ADDR_MSB:0] == 8'h10 )
+              begin
+                s_axi_rdata                      = {AXI_DATA_WIDTH{1'b0}};
+                s_axi_rdata[$high(ConfReg_DP):0] = ConfReg_DP;
+              end
+         end // if ( rvalid == 1'b1 )
+    end // always_comb begin
+
+  // configuration register
+  always_ff @(posedge Clk_CI or negedge Rst_RBI) begin
+    if (Rst_RBI == 1'b0)
+      begin
+        ConfReg_DP <= 'b0;
+      end
+    else if (ConfRegWen_S == 1'b1)
+      begin
+        ConfReg_DP <= ConfReg_DN;
+      end
+  end
+
+  generic_fifo
+    #(
+      .DATA_WIDTH ( ADDR_WIDTH_VIRT ),
+      .DATA_DEPTH ( MH_FIFO_DEPTH   )
+      )
+    fifo_addr_i
+    (
+      .clk         ( Clk_CI                          ),
+      .rst_n       ( Rst_RBI                         ),
+      .data_i      ( AddrFifoDin_D                   ),
+      .valid_i     ( AddrFifoWen_S & AddrFifoFull_SB ),
+      .grant_o     ( AddrFifoFull_SB                 ),
+      .data_o      ( AddrFifoDout_D                  ),
+      .valid_o     ( AddrFifoEmpty_SB                ),
+      .grant_i     ( AddrFifoRen_S                   ),
+      .test_mode_i ( 1'b0                            )
+    );
+
+  generic_fifo
+    #(
+      .DATA_WIDTH ( MISS_META_WIDTH ),
+      .DATA_DEPTH ( MH_FIFO_DEPTH   )
+      )
+    fifo_meta_i
+    (
+      .clk         ( Clk_CI                          ),
+      .rst_n       ( Rst_RBI                         ),
+      .data_i      ( MetaFifoDin_D                   ),
+      .valid_i     ( MetaFifoWen_S & MetaFifoFull_SB ),
+      .grant_o     ( MetaFifoFull_SB                 ),
+      .data_o      ( MetaFifoDout_D                  ),
+      .valid_o     ( MetaFifoEmpty_SB                ),
+      .grant_i     ( MetaFifoRen_S                   ),
+      .test_mode_i ( 1'b0                            )
+    );
+"""
+#
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/axi_rab_top.py b/unused_please_ignore_completely/iommu/axi_rab/axi_rab_top.py
new file mode 100644 (file)
index 0000000..ea1a802
--- /dev/null
@@ -0,0 +1,2642 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class axi_rab_top(Elaboratable):
+
+    def __init__(self):
+        self.Clk_CI = Signal()  # input
+        self.NonGatedClk_CI = Signal()  # input
+        self.Rst_RBI = Signal()  # input
+        self.s_axi4_awid = Signal()  # input
+        self.s_axi4_awaddr = Signal()  # input
+        self.s_axi4_awvalid = Signal(N_PORTS)  # input
+        self.s_axi4_awready = Signal(N_PORTS)  # output
+        self.s_axi4_awlen = Signal()  # input
+        self.s_axi4_awsize = Signal()  # input
+        self.s_axi4_awburst = Signal()  # input
+        self.s_axi4_awlock = Signal(N_PORTS)  # input
+        self.s_axi4_awprot = Signal()  # input
+        self.s_axi4_awcache = Signal()  # input
+        self.s_axi4_awregion = Signal()  # input
+        self.s_axi4_awqos = Signal()  # input
+        self.s_axi4_awuser = Signal()  # input
+        self.s_axi4_wdata = Signal()  # input
+        self.s_axi4_wvalid = Signal(N_PORTS)  # input
+        self.s_axi4_wready = Signal(N_PORTS)  # output
+        self.s_axi4_wstrb = Signal()  # input
+        self.s_axi4_wlast = Signal(N_PORTS)  # input
+        self.s_axi4_wuser = Signal()  # input
+        self.s_axi4_bid = Signal()  # output
+        self.s_axi4_bresp = Signal()  # output
+        self.s_axi4_bvalid = Signal(N_PORTS)  # output
+        self.s_axi4_buser = Signal()  # output
+        self.s_axi4_bready = Signal(N_PORTS)  # input
+        self.s_axi4_arid = Signal()  # input
+        self.s_axi4_araddr = Signal()  # input
+        self.s_axi4_arvalid = Signal(N_PORTS)  # input
+        self.s_axi4_arready = Signal(N_PORTS)  # output
+        self.s_axi4_arlen = Signal()  # input
+        self.s_axi4_arsize = Signal()  # input
+        self.s_axi4_arburst = Signal()  # input
+        self.s_axi4_arlock = Signal(N_PORTS)  # input
+        self.s_axi4_arprot = Signal()  # input
+        self.s_axi4_arcache = Signal()  # input
+        self.s_axi4_aruser = Signal()  # input
+        self.s_axi4_rid = Signal()  # output
+        self.s_axi4_rdata = Signal()  # output
+        self.s_axi4_rresp = Signal()  # output
+        self.s_axi4_rvalid = Signal(N_PORTS)  # output
+        self.s_axi4_rready = Signal(N_PORTS)  # input
+        self.s_axi4_rlast = Signal(N_PORTS)  # output
+        self.s_axi4_ruser = Signal()  # output
+        self.m0_axi4_awid = Signal()  # output
+        self.m0_axi4_awaddr = Signal()  # output
+        self.m0_axi4_awvalid = Signal(N_PORTS)  # output
+        self.m0_axi4_awready = Signal(N_PORTS)  # input
+        self.m0_axi4_awlen = Signal()  # output
+        self.m0_axi4_awsize = Signal()  # output
+        self.m0_axi4_awburst = Signal()  # output
+        self.m0_axi4_awlock = Signal(N_PORTS)  # output
+        self.m0_axi4_awprot = Signal()  # output
+        self.m0_axi4_awcache = Signal()  # output
+        self.m0_axi4_awregion = Signal()  # output
+        self.m0_axi4_awqos = Signal()  # output
+        self.m0_axi4_awuser = Signal()  # output
+        self.m0_axi4_wdata = Signal()  # output
+        self.m0_axi4_wvalid = Signal(N_PORTS)  # output
+        self.m0_axi4_wready = Signal(N_PORTS)  # input
+        self.m0_axi4_wstrb = Signal()  # output
+        self.m0_axi4_wlast = Signal(N_PORTS)  # output
+        self.m0_axi4_wuser = Signal()  # output
+        self.m0_axi4_bid = Signal()  # input
+        self.m0_axi4_bresp = Signal()  # input
+        self.m0_axi4_bvalid = Signal(N_PORTS)  # input
+        self.m0_axi4_buser = Signal()  # input
+        self.m0_axi4_bready = Signal(N_PORTS)  # output
+        self.m0_axi4_arid = Signal()  # output
+        self.m0_axi4_araddr = Signal()  # output
+        self.m0_axi4_arvalid = Signal(N_PORTS)  # output
+        self.m0_axi4_arready = Signal(N_PORTS)  # input
+        self.m0_axi4_arlen = Signal()  # output
+        self.m0_axi4_arsize = Signal()  # output
+        self.m0_axi4_arburst = Signal()  # output
+        self.m0_axi4_arlock = Signal(N_PORTS)  # output
+        self.m0_axi4_arprot = Signal()  # output
+        self.m0_axi4_arcache = Signal()  # output
+        self.m0_axi4_aruser = Signal()  # output
+        self.m0_axi4_rid = Signal()  # input
+        self.m0_axi4_rdata = Signal()  # input
+        self.m0_axi4_rresp = Signal()  # input
+        self.m0_axi4_rvalid = Signal(N_PORTS)  # input
+        self.m0_axi4_rready = Signal(N_PORTS)  # output
+        self.m0_axi4_rlast = Signal(N_PORTS)  # input
+        self.m0_axi4_ruser = Signal()  # input
+        self.m1_axi4_awid = Signal()  # output
+        self.m1_axi4_awaddr = Signal()  # output
+        self.m1_axi4_awvalid = Signal(N_PORTS)  # output
+        self.m1_axi4_awready = Signal(N_PORTS)  # input
+        self.m1_axi4_awlen = Signal()  # output
+        self.m1_axi4_awsize = Signal()  # output
+        self.m1_axi4_awburst = Signal()  # output
+        self.m1_axi4_awlock = Signal(N_PORTS)  # output
+        self.m1_axi4_awprot = Signal()  # output
+        self.m1_axi4_awcache = Signal()  # output
+        self.m1_axi4_awregion = Signal()  # output
+        self.m1_axi4_awqos = Signal()  # output
+        self.m1_axi4_awuser = Signal()  # output
+        self.m1_axi4_wdata = Signal()  # output
+        self.m1_axi4_wvalid = Signal(N_PORTS)  # output
+        self.m1_axi4_wready = Signal(N_PORTS)  # input
+        self.m1_axi4_wstrb = Signal()  # output
+        self.m1_axi4_wlast = Signal(N_PORTS)  # output
+        self.m1_axi4_wuser = Signal()  # output
+        self.m1_axi4_bid = Signal()  # input
+        self.m1_axi4_bresp = Signal()  # input
+        self.m1_axi4_bvalid = Signal(N_PORTS)  # input
+        self.m1_axi4_buser = Signal()  # input
+        self.m1_axi4_bready = Signal(N_PORTS)  # output
+        self.m1_axi4_arid = Signal()  # output
+        self.m1_axi4_araddr = Signal()  # output
+        self.m1_axi4_arvalid = Signal(N_PORTS)  # output
+        self.m1_axi4_arready = Signal(N_PORTS)  # input
+        self.m1_axi4_arlen = Signal()  # output
+        self.m1_axi4_arsize = Signal()  # output
+        self.m1_axi4_arburst = Signal()  # output
+        self.m1_axi4_arlock = Signal(N_PORTS)  # output
+        self.m1_axi4_arprot = Signal()  # output
+        self.m1_axi4_arcache = Signal()  # output
+        self.m1_axi4_aruser = Signal()  # output
+        self.m1_axi4_rid = Signal()  # input
+        self.m1_axi4_rdata = Signal()  # input
+        self.m1_axi4_rresp = Signal()  # input
+        self.m1_axi4_rvalid = Signal(N_PORTS)  # input
+        self.m1_axi4_rready = Signal(N_PORTS)  # output
+        self.m1_axi4_rlast = Signal(N_PORTS)  # input
+        self.m1_axi4_ruser = Signal()  # input
+        self.s_axi4lite_awaddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
+        self.s_axi4lite_awvalid = Signal()  # input
+        self.s_axi4lite_awready = Signal()  # output
+        self.s_axi4lite_wdata = Signal(AXI_LITE_DATA_WIDTH)  # input
+        self.s_axi4lite_wvalid = Signal()  # input
+        self.s_axi4lite_wready = Signal()  # output
+        self.s_axi4lite_wstrb = Signal(1+ERROR p_expression_25)  # input
+        self.s_axi4lite_bresp = Signal(2)  # output
+        self.s_axi4lite_bvalid = Signal()  # output
+        self.s_axi4lite_bready = Signal()  # input
+        self.s_axi4lite_araddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
+        self.s_axi4lite_arvalid = Signal()  # input
+        self.s_axi4lite_arready = Signal()  # output
+        self.s_axi4lite_rdata = Signal(AXI_LITE_DATA_WIDTH)  # output
+        self.s_axi4lite_rresp = Signal(2)  # output
+        self.s_axi4lite_rvalid = Signal()  # output
+        self.s_axi4lite_rready = Signal()  # input
+        self.int_miss = Signal(N_PORTS)  # output
+        self.int_multi = Signal(N_PORTS)  # output
+        self.int_prot = Signal(N_PORTS)  # output
+        self.int_mhf_full = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# // --=========================================================================--
+# //
+# //  █████╗ ██╗  ██╗██╗    ██████╗  █████╗ ██████╗     ████████╗ ██████╗ ██████╗
+# // ██╔══██╗╚██╗██╔╝██║    ██╔══██╗██╔══██╗██╔══██╗    ╚══██╔══╝██╔═══██╗██╔══██╗
+# // ███████║ ╚███╔╝ ██║    ██████╔╝███████║██████╔╝       ██║   ██║   ██║██████╔╝
+# // ██╔══██║ ██╔██╗ ██║    ██╔══██╗██╔══██║██╔══██╗       ██║   ██║   ██║██╔═══╝
+# // ██║  ██║██╔╝ ██╗██║    ██║  ██║██║  ██║██████╔╝       ██║   ╚██████╔╝██║
+# // ╚═╝  ╚═╝╚═╝  ╚═╝╚═╝    ╚═╝  ╚═╝╚═╝  ╚═╝╚═════╝        ╚═╝    ╚═════╝ ╚═╝
+# //
+# // --=========================================================================--
+# /*
+# * axi_rab_top
+# *
+# * The remapping address block (RAB) performs address translation for AXI
+# * transactions arriving at the input port and forwards them to different
+# * downstream AXI ports.
+# *
+# * The five axi channels are each buffered on the input side using a FIFO,
+# * described in axi4_XX_buffer. The RAB lookup result is merged into the
+# * AXI transaction via the axi4_XX_sender instances, which manages upstream
+# * error signaling for failed lookups.
+# *
+# * Address translation is performed based on data stored in up to two
+# * translation lookaside buffers (TLBs), which are private per RAB port (each
+# * of which having two AXI master ports and one AXI slave port). These TLBs
+# * are managed in software through the AXI-Lite interface.
+# *
+# * If ACP is enabled, the `cache_coherent` flag in the TLBs is used to
+# * multiplex between the two ports. If ACP is disabled, only the first master
+# * port is used. In this case, the `cache_coherent` flag is used to set the
+# * AxCACHE signals of the AXI bus accordingly.
+# *
+# * Authors:
+# * Antonio Pullini <pullinia@iis.ee.ethz.ch>
+# * Conrad Burchert <bconrad@ethz.ch>
+# * Maheshwara Sharma <msharma@student.ethz.ch>
+# * Andreas Kurth <akurth@iis.ee.ethz.ch>
+# * Johannes Weinbuch <jweinbuch@student.ethz.ch>
+# * Pirmin Vogel <vogelpi@iis.ee.ethz.ch>
+# */
+#
+# //`include "pulp_soc_defines.sv"
+#
+# ////import CfMath::log2;
+#
+# module axi_rab_top
+#
+#  // Parameters {{{
+#  #(
+#    parameter N_PORTS             =  2,
+#    parameter N_L2_SETS           = 32,
+#    parameter N_L2_SET_ENTRIES    = 32,
+#    parameter AXI_DATA_WIDTH      = 64,
+#    parameter AXI_S_ADDR_WIDTH    = 32,
+#    parameter AXI_M_ADDR_WIDTH    = 40,
+#    parameter AXI_LITE_DATA_WIDTH = 64,
+#    parameter AXI_LITE_ADDR_WIDTH = 32,
+#    parameter AXI_ID_WIDTH        = 10,
+#    parameter AXI_USER_WIDTH      =  6,
+#    parameter MH_FIFO_DEPTH       = 16
+#  )
+#  // }}}
+#
+#  // Ports {{{
+#  (
+#
+#    input logic                                            Clk_CI,  // This clock may be gated.
+#    input logic                                            NonGatedClk_CI,
+#    input logic                                            Rst_RBI,
+#
+#    // For every slave port there are two master ports. The master
+#    // port to use can be set using the master_select flag of the protection
+#    // bits of a slice
+#
+#    // AXI4 Slave {{{
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_awid,
+#    input  logic    [N_PORTS-1:0]   [AXI_S_ADDR_WIDTH-1:0] s_axi4_awaddr,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_awvalid,
+#    output logic    [N_PORTS-1:0]                          s_axi4_awready,
+#    input  logic    [N_PORTS-1:0]                    [7:0] s_axi4_awlen,
+#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_awsize,
+#    input  logic    [N_PORTS-1:0]                    [1:0] s_axi4_awburst,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_awlock,
+#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_awprot,
+#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awcache,
+#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awregion,
+#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_awqos,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_awuser,
+#
+#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] s_axi4_wdata,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_wvalid,
+#    output logic    [N_PORTS-1:0]                          s_axi4_wready,
+#    input  logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] s_axi4_wstrb,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_wlast,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_wuser,
+#
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_bid,
+#    output logic    [N_PORTS-1:0]                    [1:0] s_axi4_bresp,
+#    output logic    [N_PORTS-1:0]                          s_axi4_bvalid,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_buser,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_bready,
+#
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_arid,
+#    input  logic    [N_PORTS-1:0]   [AXI_S_ADDR_WIDTH-1:0] s_axi4_araddr,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_arvalid,
+#    output logic    [N_PORTS-1:0]                          s_axi4_arready,
+#    input  logic    [N_PORTS-1:0]                    [7:0] s_axi4_arlen,
+#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_arsize,
+#    input  logic    [N_PORTS-1:0]                    [1:0] s_axi4_arburst,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_arlock,
+#    input  logic    [N_PORTS-1:0]                    [2:0] s_axi4_arprot,
+#    input  logic    [N_PORTS-1:0]                    [3:0] s_axi4_arcache,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_aruser,
+#
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] s_axi4_rid,
+#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] s_axi4_rdata,
+#    output logic    [N_PORTS-1:0]                    [1:0] s_axi4_rresp,
+#    output logic    [N_PORTS-1:0]                          s_axi4_rvalid,
+#    input  logic    [N_PORTS-1:0]                          s_axi4_rready,
+#    output logic    [N_PORTS-1:0]                          s_axi4_rlast,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] s_axi4_ruser,
+#    // }}}
+#
+#    // AXI4 Master 0 {{{
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_awid,
+#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m0_axi4_awaddr,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_awvalid,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_awready,
+#    output logic    [N_PORTS-1:0]                    [7:0] m0_axi4_awlen,
+#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_awsize,
+#    output logic    [N_PORTS-1:0]                    [1:0] m0_axi4_awburst,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_awlock,
+#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_awprot,
+#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awcache,
+#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awregion,
+#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_awqos,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_awuser,
+#
+#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m0_axi4_wdata,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_wvalid,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_wready,
+#    output logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] m0_axi4_wstrb,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_wlast,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_wuser,
+#
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_bid,
+#    input  logic    [N_PORTS-1:0]                    [1:0] m0_axi4_bresp,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_bvalid,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_buser,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_bready,
+#
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_arid,
+#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m0_axi4_araddr,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_arvalid,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_arready,
+#    output logic    [N_PORTS-1:0]                    [7:0] m0_axi4_arlen,
+#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_arsize,
+#    output logic    [N_PORTS-1:0]                    [1:0] m0_axi4_arburst,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_arlock,
+#    output logic    [N_PORTS-1:0]                    [2:0] m0_axi4_arprot,
+#    output logic    [N_PORTS-1:0]                    [3:0] m0_axi4_arcache,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_aruser,
+#
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m0_axi4_rid,
+#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m0_axi4_rdata,
+#    input  logic    [N_PORTS-1:0]                    [1:0] m0_axi4_rresp,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_rvalid,
+#    output logic    [N_PORTS-1:0]                          m0_axi4_rready,
+#    input  logic    [N_PORTS-1:0]                          m0_axi4_rlast,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m0_axi4_ruser,
+#    // }}}
+#
+#    // AXI4 Master 1 {{{
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_awid,
+#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m1_axi4_awaddr,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_awvalid,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_awready,
+#    output logic    [N_PORTS-1:0]                    [7:0] m1_axi4_awlen,
+#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_awsize,
+#    output logic    [N_PORTS-1:0]                    [1:0] m1_axi4_awburst,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_awlock,
+#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_awprot,
+#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awcache,
+#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awregion,
+#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_awqos,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_awuser,
+#
+#    output logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m1_axi4_wdata,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_wvalid,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_wready,
+#    output logic    [N_PORTS-1:0]   [AXI_DATA_WIDTH/8-1:0] m1_axi4_wstrb,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_wlast,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_wuser,
+#
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_bid,
+#    input  logic    [N_PORTS-1:0]                    [1:0] m1_axi4_bresp,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_bvalid,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_buser,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_bready,
+#
+#    output logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_arid,
+#    output logic    [N_PORTS-1:0]   [AXI_M_ADDR_WIDTH-1:0] m1_axi4_araddr,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_arvalid,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_arready,
+#    output logic    [N_PORTS-1:0]                    [7:0] m1_axi4_arlen,
+#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_arsize,
+#    output logic    [N_PORTS-1:0]                    [1:0] m1_axi4_arburst,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_arlock,
+#    output logic    [N_PORTS-1:0]                    [2:0] m1_axi4_arprot,
+#    output logic    [N_PORTS-1:0]                    [3:0] m1_axi4_arcache,
+#    output logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_aruser,
+#
+#    input  logic    [N_PORTS-1:0]       [AXI_ID_WIDTH-1:0] m1_axi4_rid,
+#    input  logic    [N_PORTS-1:0]     [AXI_DATA_WIDTH-1:0] m1_axi4_rdata,
+#    input  logic    [N_PORTS-1:0]                    [1:0] m1_axi4_rresp,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_rvalid,
+#    output logic    [N_PORTS-1:0]                          m1_axi4_rready,
+#    input  logic    [N_PORTS-1:0]                          m1_axi4_rlast,
+#    input  logic    [N_PORTS-1:0]     [AXI_USER_WIDTH-1:0] m1_axi4_ruser,
+#    // }}}
+#
+#    // AXI 4 Lite Slave (Configuration Interface) {{{
+#    // AXI4-Lite port to setup the rab slices
+#    // use this to program the configuration registers
+#    input  logic                 [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_awaddr,
+#    input  logic                                           s_axi4lite_awvalid,
+#    output logic                                           s_axi4lite_awready,
+#
+#    input  logic                 [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_wdata,
+#    input  logic                                           s_axi4lite_wvalid,
+#    output logic                                           s_axi4lite_wready,
+#    input  logic               [AXI_LITE_DATA_WIDTH/8-1:0] s_axi4lite_wstrb,
+#
+#    output logic                                     [1:0] s_axi4lite_bresp,
+#    output logic                                           s_axi4lite_bvalid,
+#    input  logic                                           s_axi4lite_bready,
+#
+#    input  logic                 [AXI_LITE_ADDR_WIDTH-1:0] s_axi4lite_araddr,
+#    input  logic                                           s_axi4lite_arvalid,
+#    output logic                                           s_axi4lite_arready,
+#
+#    output logic                 [AXI_LITE_DATA_WIDTH-1:0] s_axi4lite_rdata,
+#    output logic                                     [1:0] s_axi4lite_rresp,
+#    output logic                                           s_axi4lite_rvalid,
+#    input  logic                                           s_axi4lite_rready,
+#    // }}}
+#
+#    // BRAMs {{{
+# //`ifdef RAB_AX_LOG_EN
+# //    BramPort.Slave                                         ArBram_PS,
+# //    BramPort.Slave                                         AwBram_PS,
+# //`endif
+#    // }}}
+#
+#    // Logger Control {{{
+# //`ifdef RAB_AX_LOG_EN
+# //   input  logic                                           LogEn_SI,
+# //   input  logic                                           ArLogClr_SI,
+# //   input  logic                                           AwLogClr_SI,
+#  //  output logic                                           ArLogRdy_SO,
+#  //  output logic                                           AwLogRdy_SO,
+# //`endif
+#    // }}}
+#
+#    // Interrupt Outputs {{{
+#    // Interrupt lines to handle misses, collisions of slices/multiple hits,
+#    // protection faults and overflow of the miss handling fifo
+# //`ifdef RAB_AX_LOG_EN
+# //   output logic                                           int_ar_log_full,
+# //   output logic                                           int_aw_log_full,
+# //`endif
+#    output logic                             [N_PORTS-1:0] int_miss,
+#    output logic                             [N_PORTS-1:0] int_multi,
+#    output logic                             [N_PORTS-1:0] int_prot,
+#    output logic                                           int_mhf_full
+#    // }}}
+#
+#  );
+#
+"""#docstring_begin
+
+  // }}}
+
+  // Signals {{{
+  // ███████╗██╗ ██████╗ ███╗   ██╗ █████╗ ██╗     ███████╗
+  // ██╔════╝██║██╔════╝ ████╗  ██║██╔══██╗██║     ██╔════╝
+  // ███████╗██║██║  ███╗██╔██╗ ██║███████║██║     ███████╗
+  // ╚════██║██║██║   ██║██║╚██╗██║██╔══██║██║     ╚════██║
+  // ███████║██║╚██████╔╝██║ ╚████║██║  ██║███████╗███████║
+  // ╚══════╝╚═╝ ╚═════╝ ╚═╝  ╚═══╝╚═╝  ╚═╝╚══════╝╚══════╝
+  //
+
+  // Internal AXI4 lines, these connect buffers on the slave side to the rab core and
+  // multiplexers which switch between the two master outputs
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_awid;
+  logic [N_PORTS-1:0]  [AXI_S_ADDR_WIDTH-1:0] int_awaddr;
+  logic [N_PORTS-1:0]                         int_awvalid;
+  logic [N_PORTS-1:0]                         int_awready;
+  logic [N_PORTS-1:0]                   [7:0] int_awlen;
+  logic [N_PORTS-1:0]                   [2:0] int_awsize;
+  logic [N_PORTS-1:0]                   [1:0] int_awburst;
+  logic [N_PORTS-1:0]                         int_awlock;
+  logic [N_PORTS-1:0]                   [2:0] int_awprot;
+  logic [N_PORTS-1:0]                   [3:0] int_awcache;
+  logic [N_PORTS-1:0]                   [3:0] int_awregion;
+  logic [N_PORTS-1:0]                   [3:0] int_awqos;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_awuser;
+
+  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_wdata;
+  logic [N_PORTS-1:0]                         int_wvalid;
+  logic [N_PORTS-1:0]                         int_wready;
+  logic [N_PORTS-1:0]  [AXI_DATA_WIDTH/8-1:0] int_wstrb;
+  logic [N_PORTS-1:0]                         int_wlast;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_wuser;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_bid;
+  logic [N_PORTS-1:0]                   [1:0] int_bresp;
+  logic [N_PORTS-1:0]                         int_bvalid;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_buser;
+  logic [N_PORTS-1:0]                         int_bready;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_arid;
+  logic [N_PORTS-1:0]  [AXI_S_ADDR_WIDTH-1:0] int_araddr;
+  logic [N_PORTS-1:0]                         int_arvalid;
+  logic [N_PORTS-1:0]                         int_arready;
+  logic [N_PORTS-1:0]                   [7:0] int_arlen;
+  logic [N_PORTS-1:0]                   [2:0] int_arsize;
+  logic [N_PORTS-1:0]                   [1:0] int_arburst;
+  logic [N_PORTS-1:0]                         int_arlock;
+  logic [N_PORTS-1:0]                   [2:0] int_arprot;
+  logic [N_PORTS-1:0]                   [3:0] int_arcache;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_aruser;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_rid;
+  logic [N_PORTS-1:0]                   [1:0] int_rresp;
+  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_rdata;
+  logic [N_PORTS-1:0]                         int_rlast;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_ruser;
+  logic [N_PORTS-1:0]                         int_rvalid;
+  logic [N_PORTS-1:0]                         int_rready;
+
+  // rab_core outputs
+  logic [N_PORTS-1:0]  [AXI_M_ADDR_WIDTH-1:0] int_wtrans_addr;
+  logic [N_PORTS-1:0]                         int_wtrans_accept;
+  logic [N_PORTS-1:0]                         int_wtrans_drop;
+  logic [N_PORTS-1:0]                         int_wtrans_miss;
+  logic [N_PORTS-1:0]                         int_wtrans_sent;
+  logic [N_PORTS-1:0]                         int_wtrans_cache_coherent;
+  logic [N_PORTS-1:0]                         int_wmaster_select;
+
+  logic [N_PORTS-1:0]  [AXI_M_ADDR_WIDTH-1:0] int_rtrans_addr;
+  logic [N_PORTS-1:0]                         int_rtrans_accept;
+  logic [N_PORTS-1:0]                         int_rtrans_drop;
+  logic [N_PORTS-1:0]                         int_rtrans_miss;
+  logic [N_PORTS-1:0]                         int_rtrans_sent;
+  logic [N_PORTS-1:0]                         int_rtrans_cache_coherent;
+  logic [N_PORTS-1:0]                         int_rmaster_select;
+
+  logic [N_PORTS-1:0]                         w_master_select;
+
+  // Internal master0 AXI4 lines. These connect the first master port to the
+  // multiplexers
+  // For channels read address, write address and write data the other lines
+  // are ignored if valid is not set, therefore we only need to multiplex those
+  logic [N_PORTS-1:0]                         int_m0_awvalid;
+  logic [N_PORTS-1:0]                         int_m0_awready;
+
+  logic [N_PORTS-1:0]                         int_m0_wvalid;
+  logic [N_PORTS-1:0]                         int_m0_wready;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m0_bid;
+  logic [N_PORTS-1:0]                   [1:0] int_m0_bresp;
+  logic [N_PORTS-1:0]                         int_m0_bvalid;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m0_buser;
+  logic [N_PORTS-1:0]                         int_m0_bready;
+
+  logic [N_PORTS-1:0]                         int_m0_arvalid;
+  logic [N_PORTS-1:0]                         int_m0_arready;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m0_rid;
+  logic [N_PORTS-1:0]                   [1:0] int_m0_rresp;
+  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_m0_rdata;
+  logic [N_PORTS-1:0]                         int_m0_rlast;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m0_ruser;
+  logic [N_PORTS-1:0]                         int_m0_rready;
+  logic [N_PORTS-1:0]                         int_m0_rvalid;
+
+  logic [N_PORTS-1:0]                         l1_m0_ar_accept;
+  logic [N_PORTS-1:0]                         l1_m0_ar_drop;
+  logic [N_PORTS-1:0]                         l1_m0_ar_save;
+  logic [N_PORTS-1:0]                         l1_m0_ar_done;
+  logic [N_PORTS-1:0]                         l2_m0_ar_accept;
+  logic [N_PORTS-1:0]                         l2_m0_ar_drop;
+  logic [N_PORTS-1:0]                         l2_m0_ar_done;
+  logic [N_PORTS-1:0]                         l2_m0_ar_sending;
+
+  logic [N_PORTS-1:0]                         l1_m0_aw_accept;
+  logic [N_PORTS-1:0]                         l1_m0_aw_drop;
+  logic [N_PORTS-1:0]                         l1_m0_aw_save;
+  logic [N_PORTS-1:0]                         l1_m0_aw_done;
+  logic [N_PORTS-1:0]                         l2_m0_aw_accept;
+  logic [N_PORTS-1:0]                         l2_m0_aw_drop;
+  logic [N_PORTS-1:0]                         l2_m0_aw_done;
+  logic [N_PORTS-1:0]                         l2_m0_aw_sending;
+
+  // Internal master1 AXI4 lines. These connect the second master port to the
+  // multiplexers
+  // For channels read address, write address and write data the other lines
+  // are ignored if valid is not set, therefore we only need to multiplex those
+  logic [N_PORTS-1:0]                         int_m1_awvalid;
+  logic [N_PORTS-1:0]                         int_m1_awready;
+
+  logic [N_PORTS-1:0]                         int_m1_wvalid;
+  logic [N_PORTS-1:0]                         int_m1_wready;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m1_bid;
+  logic [N_PORTS-1:0]                   [1:0] int_m1_bresp;
+  logic [N_PORTS-1:0]                         int_m1_bvalid;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m1_buser;
+  logic [N_PORTS-1:0]                         int_m1_bready;
+
+  logic [N_PORTS-1:0]                         int_m1_arvalid;
+  logic [N_PORTS-1:0]                         int_m1_arready;
+
+  logic [N_PORTS-1:0]      [AXI_ID_WIDTH-1:0] int_m1_rid;
+  logic [N_PORTS-1:0]                   [1:0] int_m1_rresp;
+  logic [N_PORTS-1:0]    [AXI_DATA_WIDTH-1:0] int_m1_rdata;
+  logic [N_PORTS-1:0]                         int_m1_rlast;
+  logic [N_PORTS-1:0]    [AXI_USER_WIDTH-1:0] int_m1_ruser;
+  logic [N_PORTS-1:0]                         int_m1_rvalid;
+  logic [N_PORTS-1:0]                         int_m1_rready;
+
+  logic [N_PORTS-1:0]                         l1_m1_ar_accept;
+  logic [N_PORTS-1:0]                         l1_m1_ar_drop;
+  logic [N_PORTS-1:0]                         l1_m1_ar_save;
+  logic [N_PORTS-1:0]                         l1_m1_ar_done;
+  logic [N_PORTS-1:0]                         l2_m1_ar_accept;
+  logic [N_PORTS-1:0]                         l2_m1_ar_drop;
+  logic [N_PORTS-1:0]                         l2_m1_ar_done;
+
+  logic [N_PORTS-1:0]                         l1_m1_aw_accept;
+  logic [N_PORTS-1:0]                         l1_m1_aw_drop;
+  logic [N_PORTS-1:0]                         l1_m1_aw_save;
+  logic [N_PORTS-1:0]                         l1_m1_aw_done;
+  logic [N_PORTS-1:0]                         l2_m1_aw_accept;
+  logic [N_PORTS-1:0]                         l2_m1_aw_drop;
+  logic [N_PORTS-1:0]                         l2_m1_aw_done;
+
+  // L1 outputs
+  logic [N_PORTS-1:0]                         rab_miss; // L1 RAB miss
+  logic [N_PORTS-1:0]                         rab_prot;
+  logic [N_PORTS-1:0]                         rab_multi;
+  logic [N_PORTS-1:0]                         rab_prefetch;
+
+  //
+  // Signals used to support L2 TLB
+  //
+  // L2 RAM configuration signals
+  logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] L2CfgWData_D;
+  logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] L2CfgWAddr_D;
+  logic [N_PORTS-1:0]                           L2CfgWE_S;
+
+  // L1 output and drop Buffer
+  logic [N_PORTS-1:0]                           L1OutRwType_D, L1DropRwType_DP;
+  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L1OutUser_D, L1DropUser_DP;
+  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L1OutId_D, L1DropId_DP;
+  logic [N_PORTS-1:0]                     [7:0] L1OutLen_D, L1DropLen_DP;
+  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L1OutAddr_D, L1DropAddr_DP;
+  logic [N_PORTS-1:0]                           L1OutProt_D, L1DropProt_DP;
+  logic [N_PORTS-1:0]                           L1OutMulti_D, L1DropMulti_DP;
+  logic [N_PORTS-1:0]                           L1DropEn_S;
+  logic [N_PORTS-1:0]                           L1DropPrefetch_S;
+
+  logic [N_PORTS-1:0]                           L1DropValid_SN, L1DropValid_SP;
+
+  // L2 input Buffer
+  logic [N_PORTS-1:0]                           L2InRwType_DP;
+  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L2InUser_DP;
+  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L2InId_DP;
+  logic [N_PORTS-1:0]                     [7:0] L2InLen_DP;
+  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L2InAddr_DP;
+  logic [N_PORTS-1:0]                           L2InEn_S;
+
+  // L2 output Buffer
+  logic [N_PORTS-1:0]                           L2OutRwType_DP;
+  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] L2OutUser_DP;
+  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] L2OutId_DP;
+  logic [N_PORTS-1:0]                     [7:0] L2OutLen_DP;
+  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] L2OutInAddr_DP;
+
+  logic [N_PORTS-1:0]                           L2OutHit_SN, L2OutHit_SP;
+  logic [N_PORTS-1:0]                           L2OutMiss_SN, L2OutMiss_SP;
+  logic [N_PORTS-1:0]                           L2OutProt_SN, L2OutProt_SP;
+  logic [N_PORTS-1:0]                           L2OutMulti_SN, L2OutMulti_SP;
+  logic [N_PORTS-1:0]                           L2OutCC_SN, L2OutCC_SP;
+  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] L2OutAddr_DN, L2OutAddr_DP;
+
+  logic [N_PORTS-1:0]                           L2OutValid_SN, L2OutValid_SP;
+  logic [N_PORTS-1:0]                           L2OutPrefetch_S;
+  logic [N_PORTS-1:0]                           L2OutReady_S;
+  logic [N_PORTS-1:0]                           L2OutEn_S;
+
+   // L2 outputs
+  logic [N_PORTS-1:0]                           L2Busy_S;
+  logic [N_PORTS-1:0]                           L2OutValid_S;
+
+  logic [N_PORTS-1:0]                           L2Miss_S;
+
+  // Signals for interfacing the AXI modules
+  logic [N_PORTS-1:0]                           l1_ar_accept;
+  logic [N_PORTS-1:0]                           l1_aw_accept;
+  logic [N_PORTS-1:0]                           l1_w_accept;
+  logic [N_PORTS-1:0]                           l1_xw_accept;
+
+  logic [N_PORTS-1:0]                           l1_ar_drop;
+  logic [N_PORTS-1:0]                           l1_aw_drop;
+  logic [N_PORTS-1:0]                           l1_w_drop;
+  logic [N_PORTS-1:0]                           l1_xw_drop;
+
+  logic [N_PORTS-1:0]                           l1_ar_save;
+  logic [N_PORTS-1:0]                           l1_aw_save;
+  logic [N_PORTS-1:0]                           l1_w_save;
+  logic [N_PORTS-1:0]                           l1_xw_save;
+
+  logic [N_PORTS-1:0]                           l1_ar_done;
+  logic [N_PORTS-1:0]                           l1_r_done;
+  logic [N_PORTS-1:0]                           l1_r_drop;
+  logic [N_PORTS-1:0]                           lx_r_drop;
+  logic [N_PORTS-1:0]                           lx_r_done;
+
+  logic [N_PORTS-1:0]                           l1_aw_done;
+  logic [N_PORTS-1:0]                           l1_w_done;
+  logic [N_PORTS-1:0]                           l1_xw_done;
+  logic [N_PORTS-1:0]                           l1_aw_done_SP;
+  logic [N_PORTS-1:0]                           l1_w_done_SP;
+
+  logic [N_PORTS-1:0]                           l2_ar_accept;
+  logic [N_PORTS-1:0]                           l2_aw_accept;
+  logic [N_PORTS-1:0]                           l2_w_accept;
+  logic [N_PORTS-1:0]                           l2_xw_accept;
+
+  logic [N_PORTS-1:0]                           l2_ar_drop;
+  logic [N_PORTS-1:0]                           l2_r_drop;
+  logic [N_PORTS-1:0]                           l2_xr_drop;
+  logic [N_PORTS-1:0]                           l2_aw_drop;
+  logic [N_PORTS-1:0]                           l2_w_drop;
+  logic [N_PORTS-1:0]                           l2_xw_drop;
+
+  logic [N_PORTS-1:0]                           l2_aw_done;
+  logic [N_PORTS-1:0]                           l2_w_done;
+  logic [N_PORTS-1:0]                           l2_xw_done;
+  logic [N_PORTS-1:0]                           l2_aw_done_SP;
+  logic [N_PORTS-1:0]                           l2_w_done_SP;
+
+  logic [N_PORTS-1:0]                           l2_ar_done;
+  logic [N_PORTS-1:0]                           l2_r_done;
+  logic [N_PORTS-1:0]                           l2_xr_done;
+  logic [N_PORTS-1:0]                           l2_ar_done_SP;
+  logic [N_PORTS-1:0]                           l2_r_done_SP;
+
+  logic [N_PORTS-1:0]                           l1_mx_aw_done;
+  logic [N_PORTS-1:0]                           l1_mx_ar_done;
+  logic [N_PORTS-1:0]                           l1_m0_aw_done_SP;
+  logic [N_PORTS-1:0]                           l1_m0_ar_done_SP;
+  logic [N_PORTS-1:0]                           l1_m1_aw_done_SP;
+  logic [N_PORTS-1:0]                           l1_m1_ar_done_SP;
+
+  logic [N_PORTS-1:0]                           l2_mx_aw_done;
+  logic [N_PORTS-1:0]                           l2_mx_ar_done;
+  logic [N_PORTS-1:0]                           l2_m0_aw_done_SP;
+  logic [N_PORTS-1:0]                           l2_m0_ar_done_SP;
+  logic [N_PORTS-1:0]                           l2_m1_aw_done_SP;
+  logic [N_PORTS-1:0]                           l2_m1_ar_done_SP;
+
+  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] l1_id_drop, lx_id_drop, b_id_drop;
+  logic [N_PORTS-1:0]                     [7:0] l1_len_drop, lx_len_drop;
+  logic [N_PORTS-1:0]                           l1_prefetch_drop, lx_prefetch_drop, b_prefetch_drop;
+  logic [N_PORTS-1:0]                           l1_hit_drop, lx_hit_drop, b_hit_drop;
+
+  logic [N_PORTS-1:0]                           b_drop;
+  logic [N_PORTS-1:0]                           b_done;
+
+  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] l2_aw_addr;
+  logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] l2_ar_addr;
+
+  logic [N_PORTS-1:0]                           l2_cache_coherent;
+  logic [N_PORTS-1:0]                           l2_master_select;
+
+  logic [N_PORTS-1:0]                           aw_in_stall;
+  logic [N_PORTS-1:0]                           aw_out_stall;
+
+  genvar                                        i;
+
+  // RRESP FSM
+  typedef enum logic                    {IDLE, BUSY} r_resp_mux_ctrl_state_t;
+  r_resp_mux_ctrl_state_t [N_PORTS-1:0] RRespMuxCtrl_SN, RRespMuxCtrl_SP;
+  logic                   [N_PORTS-1:0] RRespSel_SN, RRespSel_SP;
+  logic                   [N_PORTS-1:0] RRespBurst_S;
+  logic                   [N_PORTS-1:0] RRespSelIm_S;
+
+  // }}}
+
+  // Local parameters {{{
+
+  // Enable L2 for select ports
+  localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
+
+  // L2TLB parameters
+  localparam integer HUM_BUFFER_DEPTH = (N_L2_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS)+13;
+
+  // }}}
+
+  // Derive `master_select` from cache coherency flag. {{{
+  `ifdef EN_ACP
+    assign int_wmaster_select = int_wtrans_cache_coherent;
+    assign int_rmaster_select = int_rtrans_cache_coherent;
+    assign l2_master_select   = l2_cache_coherent;
+  `else
+    assign int_wmaster_select = '0;
+    assign int_rmaster_select = '0;
+    assign l2_master_select   = '0;
+  `endif
+  // }}}
+
+  // Buf and Send {{{
+  // ██████╗ ██╗   ██╗███████╗       ██╗       ███████╗███████╗███╗   ██╗██████╗
+  // ██╔══██╗██║   ██║██╔════╝       ██║       ██╔════╝██╔════╝████╗  ██║██╔══██╗
+  // ██████╔╝██║   ██║█████╗      ████████╗    ███████╗█████╗  ██╔██╗ ██║██║  ██║
+  // ██╔══██╗██║   ██║██╔══╝      ██╔═██╔═╝    ╚════██║██╔══╝  ██║╚██╗██║██║  ██║
+  // ██████╔╝╚██████╔╝██║         ██████║      ███████║███████╗██║ ╚████║██████╔╝
+  // ╚═════╝  ╚═════╝ ╚═╝         ╚═════╝      ╚══════╝╚══════╝╚═╝  ╚═══╝╚═════╝
+  //
+  logic[N_PORTS-1:0] m0_write_is_burst, m0_read_is_burst;
+  logic[N_PORTS-1:0] m1_write_is_burst, m1_read_is_burst;
+
+  generate for (i = 0; i < N_PORTS; i++) begin : BUF_AND_SEND
+
+  // Write Address channel (aw) {{{
+  /*
+   * write address channel (aw)
+   *
+   * ██╗    ██╗██████╗ ██╗████████╗███████╗     █████╗ ██████╗ ██████╗ ██████╗
+   * ██║    ██║██╔══██╗██║╚══██╔══╝██╔════╝    ██╔══██╗██╔══██╗██╔══██╗██╔══██╗
+   * ██║ █╗ ██║██████╔╝██║   ██║   █████╗      ███████║██║  ██║██║  ██║██████╔╝
+   * ██║███╗██║██╔══██╗██║   ██║   ██╔══╝      ██╔══██║██║  ██║██║  ██║██╔══██╗
+   * ╚███╔███╔╝██║  ██║██║   ██║   ███████╗    ██║  ██║██████╔╝██████╔╝██║  ██║
+   *  ╚══╝╚══╝ ╚═╝  ╚═╝╚═╝   ╚═╝   ╚══════╝    ╚═╝  ╚═╝╚═════╝ ╚═════╝ ╚═╝  ╚═╝
+   *
+   */
+
+  axi4_aw_buffer
+    #(
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_aw_buffer
+    (
+      .axi4_aclk       ( Clk_CI             ),
+      .axi4_arstn      ( Rst_RBI            ),
+      .s_axi4_awid     ( s_axi4_awid[i]     ),
+      .s_axi4_awaddr   ( s_axi4_awaddr[i]   ),
+      .s_axi4_awvalid  ( s_axi4_awvalid[i]  ),
+      .s_axi4_awready  ( s_axi4_awready[i]  ),
+      .s_axi4_awlen    ( s_axi4_awlen[i]    ),
+      .s_axi4_awsize   ( s_axi4_awsize[i]   ),
+      .s_axi4_awburst  ( s_axi4_awburst[i]  ),
+      .s_axi4_awlock   ( s_axi4_awlock[i]   ),
+      .s_axi4_awprot   ( s_axi4_awprot[i]   ),
+      .s_axi4_awcache  ( s_axi4_awcache[i]  ),
+      .s_axi4_awregion ( s_axi4_awregion[i] ),
+      .s_axi4_awqos    ( s_axi4_awqos[i]    ),
+      .s_axi4_awuser   ( s_axi4_awuser[i]   ),
+      .m_axi4_awid     ( int_awid[i]        ),
+      .m_axi4_awaddr   ( int_awaddr[i]      ),
+      .m_axi4_awvalid  ( int_awvalid[i]     ),
+      .m_axi4_awready  ( int_awready[i]     ),
+      .m_axi4_awlen    ( int_awlen[i]       ),
+      .m_axi4_awsize   ( int_awsize[i]      ),
+      .m_axi4_awburst  ( int_awburst[i]     ),
+      .m_axi4_awlock   ( int_awlock[i]      ),
+      .m_axi4_awprot   ( int_awprot[i]      ),
+      .m_axi4_awcache  ( int_awcache[i]     ),
+      .m_axi4_awregion ( int_awregion[i]    ),
+      .m_axi4_awqos    ( int_awqos[i]       ),
+      .m_axi4_awuser   ( int_awuser[i]      )
+    );
+
+  axi4_aw_sender
+    #(
+      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
+      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
+      )
+    u_aw_sender_m0
+    (
+      .axi4_aclk       ( Clk_CI                ),
+      .axi4_arstn      ( Rst_RBI               ),
+      .l1_done_o       ( l1_m0_aw_done[i]      ),
+      .l1_accept_i     ( l1_m0_aw_accept[i]    ),
+      .l1_drop_i       ( l1_m0_aw_drop[i]      ),
+      .l1_save_i       ( l1_m0_aw_save[i]      ),
+      .l2_done_o       ( l2_m0_aw_done[i]      ),
+      .l2_accept_i     ( l2_m0_aw_accept[i]    ),
+      .l2_drop_i       ( l2_m0_aw_drop[i]      ),
+      .l2_sending_o    ( l2_m0_aw_sending[i]   ),
+      .l1_awaddr_i     ( int_wtrans_addr[i]    ),
+      .l2_awaddr_i     ( l2_aw_addr[i]         ),
+      .s_axi4_awid     ( int_awid[i]           ),
+      .s_axi4_awvalid  ( int_m0_awvalid[i]     ),
+      .s_axi4_awready  ( int_m0_awready[i]     ),
+      .s_axi4_awlen    ( int_awlen[i]          ),
+      .s_axi4_awsize   ( int_awsize[i]         ),
+      .s_axi4_awburst  ( int_awburst[i]        ),
+      .s_axi4_awlock   ( int_awlock[i]         ),
+      .s_axi4_awprot   ( int_awprot[i]         ),
+      .s_axi4_awcache  ( int_awcache[i]        ),
+      .s_axi4_awregion ( int_awregion[i]       ),
+      .s_axi4_awqos    ( int_awqos[i]          ),
+      .s_axi4_awuser   ( int_awuser[i]         ),
+      .m_axi4_awid     ( m0_axi4_awid[i]       ),
+      .m_axi4_awaddr   ( m0_axi4_awaddr[i]     ),
+      .m_axi4_awvalid  ( m0_axi4_awvalid[i]    ),
+      .m_axi4_awready  ( m0_axi4_awready[i]    ),
+      .m_axi4_awlen    ( m0_axi4_awlen[i]      ),
+      .m_axi4_awsize   ( m0_axi4_awsize[i]     ),
+      .m_axi4_awburst  ( m0_axi4_awburst[i]    ),
+      .m_axi4_awlock   ( m0_axi4_awlock[i]     ),
+      .m_axi4_awprot   ( m0_axi4_awprot[i]     ),
+      .m_axi4_awcache  (                       ),
+      .m_axi4_awregion ( m0_axi4_awregion[i]   ),
+      .m_axi4_awqos    ( m0_axi4_awqos[i]      ),
+      .m_axi4_awuser   ( m0_axi4_awuser[i]     )
+    );
+
+  // The AXCACHE signals are set according to burstiness and cache coherence or statically
+  // when not connected to ACP on Zynq (implemented below).
+    assign m0_write_is_burst[i] = (m0_axi4_awlen[i] != {8{1'b0}}) && (m0_axi4_awburst[i] != 2'b00);
+  `ifndef EN_ACP
+    always_comb begin
+      if ( (l2_m0_aw_sending[i] & l2_cache_coherent[i]) | int_wtrans_cache_coherent[i]) begin
+        if (m0_write_is_burst[i]) begin
+          m0_axi4_awcache[i]  = 4'b0111;
+        end else begin
+          m0_axi4_awcache[i]  = 4'b1111;
+        end
+      end else begin
+        m0_axi4_awcache[i]    = 4'b0011;
+      end
+    end
+  `else
+    assign m0_axi4_awcache[i] = 4'b0011;
+  `endif
+
+  axi4_aw_sender
+    #(
+      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
+      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
+      )
+    u_aw_sender_m1
+    (
+      .axi4_aclk       ( Clk_CI                ),
+      .axi4_arstn      ( Rst_RBI               ),
+      .l1_accept_i     ( l1_m1_aw_accept[i]    ),
+      .l1_drop_i       ( l1_m1_aw_drop[i]      ),
+      .l1_save_i       ( l1_m1_aw_save[i]      ),
+      .l1_done_o       ( l1_m1_aw_done[i]      ),
+      .l2_accept_i     ( l2_m1_aw_accept[i]    ),
+      .l2_drop_i       ( l2_m1_aw_drop[i]      ),
+      .l2_done_o       ( l2_m1_aw_done[i]      ),
+      .l2_sending_o    (                       ), // just helps to set axcache
+      .l1_awaddr_i     ( int_wtrans_addr[i]    ),
+      .l2_awaddr_i     ( l2_aw_addr[i]         ),
+      .s_axi4_awid     ( int_awid[i]           ),
+      .s_axi4_awvalid  ( int_m1_awvalid[i]     ),
+      .s_axi4_awready  ( int_m1_awready[i]     ),
+      .s_axi4_awlen    ( int_awlen[i]          ),
+      .s_axi4_awsize   ( int_awsize[i]         ),
+      .s_axi4_awburst  ( int_awburst[i]        ),
+      .s_axi4_awlock   ( int_awlock[i]         ),
+      .s_axi4_awprot   ( int_awprot[i]         ),
+      .s_axi4_awcache  ( int_awcache[i]        ),
+      .s_axi4_awregion ( int_awregion[i]       ),
+      .s_axi4_awqos    ( int_awqos[i]          ),
+      .s_axi4_awuser   ( int_awuser[i]         ),
+      .m_axi4_awid     ( m1_axi4_awid[i]       ),
+      .m_axi4_awaddr   ( m1_axi4_awaddr[i]     ),
+      .m_axi4_awvalid  ( m1_axi4_awvalid[i]    ),
+      .m_axi4_awready  ( m1_axi4_awready[i]    ),
+      .m_axi4_awlen    ( m1_axi4_awlen[i]      ),
+      .m_axi4_awsize   ( m1_axi4_awsize[i]     ),
+      .m_axi4_awburst  ( m1_axi4_awburst[i]    ),
+      .m_axi4_awlock   ( m1_axi4_awlock[i]     ),
+      .m_axi4_awprot   ( m1_axi4_awprot[i]     ),
+      .m_axi4_awcache  (                       ),
+      .m_axi4_awregion ( m1_axi4_awregion[i]   ),
+      .m_axi4_awqos    ( m1_axi4_awqos[i]      ),
+      .m_axi4_awuser   ( m1_axi4_awuser[i]     )
+    );
+
+    // The AXCACHE signals are set according to burstiness and cache coherence or statically
+    // when not connected to ACP on Zynq (implemented below).
+      assign m1_write_is_burst[i] = (m1_axi4_awlen[i] != {8{1'b0}}) && (m1_axi4_awburst[i] != 2'b00);
+    `ifdef EN_ACP
+      always_comb begin
+        if (m1_write_is_burst[i]) begin
+          m1_axi4_awcache[i]    = 4'b1011;
+        end else begin
+          m1_axi4_awcache[i]    = 4'b1111;
+        end
+      end
+    `else
+      assign m1_axi4_awcache[i] = 4'b0011;
+    `endif
+
+  // }}}
+
+  // Write Data channel (w) {{{
+  /*
+   * write data channel (w)
+   *
+   * ██╗    ██╗██████╗ ██╗████████╗███████╗    ██████╗  █████╗ ████████╗ █████╗
+   * ██║    ██║██╔══██╗██║╚══██╔══╝██╔════╝    ██╔══██╗██╔══██╗╚══██╔══╝██╔══██╗
+   * ██║ █╗ ██║██████╔╝██║   ██║   █████╗      ██║  ██║███████║   ██║   ███████║
+   * ██║███╗██║██╔══██╗██║   ██║   ██╔══╝      ██║  ██║██╔══██║   ██║   ██╔══██║
+   * ╚███╔███╔╝██║  ██║██║   ██║   ███████╗    ██████╔╝██║  ██║   ██║   ██║  ██║
+   *  ╚══╝╚══╝ ╚═╝  ╚═╝╚═╝   ╚═╝   ╚══════╝    ╚═════╝ ╚═╝  ╚═╝   ╚═╝   ╚═╝  ╚═╝
+   *
+   */
+  axi4_w_buffer
+    #(
+      .AXI_DATA_WIDTH   ( AXI_DATA_WIDTH   ),
+      .AXI_ID_WIDTH     ( AXI_ID_WIDTH     ),
+      .AXI_USER_WIDTH   ( AXI_USER_WIDTH   ),
+      .ENABLE_L2TLB     ( ENABLE_L2TLB[i]  ),
+      .HUM_BUFFER_DEPTH ( HUM_BUFFER_DEPTH )
+      )
+    u_w_buffer
+    (
+      .axi4_aclk       ( Clk_CI                ),
+      .axi4_arstn      ( Rst_RBI               ),
+
+      // L1 interface
+      .l1_done_o       ( l1_w_done[i]          ),
+      .l1_accept_i     ( l1_w_accept[i]        ),
+      .l1_save_i       ( l1_w_save[i]          ),
+      .l1_drop_i       ( l1_w_drop[i]          ),
+      .l1_master_i     ( int_wmaster_select[i] ),
+      .l1_id_i         ( l1_id_drop[i]         ),
+      .l1_len_i        ( l1_len_drop[i]        ),
+      .l1_prefetch_i   ( l1_prefetch_drop[i]   ),
+      .l1_hit_i        ( l1_hit_drop[i]        ),
+
+      // L2 interface
+      .l2_done_o       ( l2_w_done[i]          ),
+      .l2_accept_i     ( l2_w_accept[i]        ),
+      .l2_drop_i       ( l2_w_drop[i]          ),
+      .l2_master_i     ( l2_master_select[i]   ),
+      .l2_id_i         ( lx_id_drop[i]         ),
+      .l2_len_i        ( lx_len_drop[i]        ),
+      .l2_prefetch_i   ( lx_prefetch_drop[i]   ),
+      .l2_hit_i        ( lx_hit_drop[i]        ),
+
+      // Top-level control outputs
+      .master_select_o ( w_master_select[i]    ),
+      .input_stall_o   ( aw_in_stall[i]        ), // stall L1 AW input if request buffers full
+      .output_stall_o  ( aw_out_stall[i]       ), // stall L1 AW hit forwarding if bypass not possible
+
+      // B sender interface
+      .b_drop_o        ( b_drop[i]             ),
+      .b_done_i        ( b_done[i]             ),
+      .id_o            ( b_id_drop[i]          ),
+      .prefetch_o      ( b_prefetch_drop[i]    ),
+      .hit_o           ( b_hit_drop[i]         ),
+
+      // AXI W channel interfaces
+      .s_axi4_wdata    ( s_axi4_wdata[i]       ),
+      .s_axi4_wvalid   ( s_axi4_wvalid[i]      ),
+      .s_axi4_wready   ( s_axi4_wready[i]      ),
+      .s_axi4_wstrb    ( s_axi4_wstrb[i]       ),
+      .s_axi4_wlast    ( s_axi4_wlast[i]       ),
+      .s_axi4_wuser    ( s_axi4_wuser[i]       ),
+      .m_axi4_wdata    ( int_wdata[i]          ),
+      .m_axi4_wvalid   ( int_wvalid[i]         ),
+      .m_axi4_wready   ( int_wready[i]         ),
+      .m_axi4_wstrb    ( int_wstrb[i]          ),
+      .m_axi4_wlast    ( int_wlast[i]          ),
+      .m_axi4_wuser    ( int_wuser[i]          )
+    );
+
+  axi4_w_sender
+    #(
+      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_w_sender_m0
+    (
+      .axi4_aclk       ( Clk_CI            ),
+      .axi4_arstn      ( Rst_RBI           ),
+      .s_axi4_wdata    ( int_wdata[i]      ),
+      .s_axi4_wvalid   ( int_m0_wvalid[i]  ),
+      .s_axi4_wready   ( int_m0_wready[i]  ),
+      .s_axi4_wstrb    ( int_wstrb[i]      ),
+      .s_axi4_wlast    ( int_wlast[i]      ),
+      .s_axi4_wuser    ( int_wuser[i]      ),
+      .m_axi4_wdata    ( m0_axi4_wdata[i]  ),
+      .m_axi4_wvalid   ( m0_axi4_wvalid[i] ),
+      .m_axi4_wready   ( m0_axi4_wready[i] ),
+      .m_axi4_wstrb    ( m0_axi4_wstrb[i]  ),
+      .m_axi4_wlast    ( m0_axi4_wlast[i]  ),
+      .m_axi4_wuser    ( m0_axi4_wuser[i]  )
+    );
+
+  axi4_w_sender
+    #(
+      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+
+      )
+    u_w_sender_m1
+    (
+      .axi4_aclk       ( Clk_CI            ),
+      .axi4_arstn      ( Rst_RBI           ),
+      .s_axi4_wdata    ( int_wdata[i]      ),
+      .s_axi4_wvalid   ( int_m1_wvalid[i]  ),
+      .s_axi4_wready   ( int_m1_wready[i]  ),
+      .s_axi4_wstrb    ( int_wstrb[i]      ),
+      .s_axi4_wlast    ( int_wlast[i]      ),
+      .s_axi4_wuser    ( int_wuser[i]      ),
+      .m_axi4_wdata    ( m1_axi4_wdata[i]  ),
+      .m_axi4_wvalid   ( m1_axi4_wvalid[i] ),
+      .m_axi4_wready   ( m1_axi4_wready[i] ),
+      .m_axi4_wstrb    ( m1_axi4_wstrb[i]  ),
+      .m_axi4_wlast    ( m1_axi4_wlast[i]  ),
+      .m_axi4_wuser    ( m1_axi4_wuser[i]  )
+    );
+
+  /*
+   * Multiplexer to switch between the two output master ports on the write data (w) channel
+   */
+  always_comb begin
+    /* Only one output can be selected at any time */
+    if (w_master_select[i] == 1'b0) begin
+      int_m0_wvalid[i] = int_wvalid[i];
+      int_m1_wvalid[i] = 1'b0;
+      int_wready[i]    = int_m0_wready[i];
+    end else begin
+      int_m0_wvalid[i] = 1'b0;
+      int_m1_wvalid[i] = int_wvalid[i];
+      int_wready[i]    = int_m1_wready[i];
+    end
+  end
+
+  // }}}
+
+  // Write Response channel (b) {{{
+  /*
+   * write response channel (b)
+   *
+   * ██╗    ██╗██████╗ ██╗████████╗███████╗    ██████╗ ███████╗███████╗██████╗
+   * ██║    ██║██╔══██╗██║╚══██╔══╝██╔════╝    ██╔══██╗██╔════╝██╔════╝██╔══██╗
+   * ██║ █╗ ██║██████╔╝██║   ██║   █████╗      ██████╔╝█████╗  ███████╗██████╔╝
+   * ██║███╗██║██╔══██╗██║   ██║   ██╔══╝      ██╔══██╗██╔══╝  ╚════██║██╔═══╝
+   * ╚███╔███╔╝██║  ██║██║   ██║   ███████╗    ██║  ██║███████╗███████║██║
+   *  ╚══╝╚══╝ ╚═╝  ╚═╝╚═╝   ╚═╝   ╚══════╝    ╚═╝  ╚═╝╚══════╝╚══════╝╚═╝
+   *
+   */
+  axi4_b_buffer
+    #(
+        .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+        .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_b_buffer_m0
+    (
+      .axi4_aclk     ( Clk_CI            ),
+      .axi4_arstn    ( Rst_RBI           ),
+      .s_axi4_bid    ( int_m0_bid[i]     ),
+      .s_axi4_bresp  ( int_m0_bresp[i]   ),
+      .s_axi4_bvalid ( int_m0_bvalid[i]  ),
+      .s_axi4_buser  ( int_m0_buser[i]   ),
+      .s_axi4_bready ( int_m0_bready[i]  ),
+      .m_axi4_bid    ( m0_axi4_bid[i]    ),
+      .m_axi4_bresp  ( m0_axi4_bresp[i]  ),
+      .m_axi4_bvalid ( m0_axi4_bvalid[i] ),
+      .m_axi4_buser  ( m0_axi4_buser[i]  ),
+      .m_axi4_bready ( m0_axi4_bready[i] )
+    );
+
+  axi4_b_buffer
+    #(
+        .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+        .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_b_buffer_m1
+    (
+      .axi4_aclk      ( Clk_CI            ),
+      .axi4_arstn     ( Rst_RBI           ),
+      .s_axi4_bid     ( int_m1_bid[i]     ),
+      .s_axi4_bresp   ( int_m1_bresp[i]   ),
+      .s_axi4_bvalid  ( int_m1_bvalid[i]  ),
+      .s_axi4_buser   ( int_m1_buser[i]   ),
+      .s_axi4_bready  ( int_m1_bready[i]  ),
+      .m_axi4_bid     ( m1_axi4_bid[i]    ),
+      .m_axi4_bresp   ( m1_axi4_bresp[i]  ),
+      .m_axi4_bvalid  ( m1_axi4_bvalid[i] ),
+      .m_axi4_buser   ( m1_axi4_buser[i]  ),
+      .m_axi4_bready  ( m1_axi4_bready[i] )
+    );
+
+  axi4_b_sender
+    #(
+        .AXI_ID_WIDTH   ( AXI_ID_WIDTH    ),
+        .AXI_USER_WIDTH ( AXI_USER_WIDTH  )
+      )
+    u_b_sender
+    (
+      .axi4_aclk      ( Clk_CI             ),
+      .axi4_arstn     ( Rst_RBI            ),
+      .drop_i         ( b_drop[i]          ),
+      .done_o         ( b_done[i]          ),
+      .id_i           ( b_id_drop[i]       ),
+      .prefetch_i     ( b_prefetch_drop[i] ),
+      .hit_i          ( b_hit_drop[i]      ),
+      .s_axi4_bid     ( s_axi4_bid[i]      ),
+      .s_axi4_bresp   ( s_axi4_bresp[i]    ),
+      .s_axi4_bvalid  ( s_axi4_bvalid[i]   ),
+      .s_axi4_buser   ( s_axi4_buser[i]    ),
+      .s_axi4_bready  ( s_axi4_bready[i]   ),
+      .m_axi4_bid     ( int_bid[i]         ),
+      .m_axi4_bresp   ( int_bresp[i]       ),
+      .m_axi4_bvalid  ( int_bvalid[i]      ),
+      .m_axi4_buser   ( int_buser[i]       ),
+      .m_axi4_bready  ( int_bready[i]      )
+    );
+
+  /*
+   * Multiplexer to switch between the two output master ports on the write response (b) channel
+   */
+  always_comb begin
+     /* Output 1 always gets priority, so if it has something to send connect
+      it and let output 0 wait using rready = 0 */
+    if (int_m1_bvalid[i] == 1'b1) begin
+      int_m0_bready[i] = 1'b0;
+      int_m1_bready[i] = int_bready[i];
+
+      int_bid[i]       = int_m1_bid[i];
+      int_bresp[i]     = int_m1_bresp[i];
+      int_buser[i]     = int_m1_buser[i];
+      int_bvalid[i]    = int_m1_bvalid[i];
+    end else begin
+      int_m0_bready[i] = int_bready[i];
+      int_m1_bready[i] = 1'b0;
+
+      int_bid[i]       = int_m0_bid[i];
+      int_bresp[i]     = int_m0_bresp[i];
+      int_buser[i]     = int_m0_buser[i];
+      int_bvalid[i]    = int_m0_bvalid[i];
+    end
+  end
+
+  // }}}
+
+  // Read Address channel (ar) {{{
+  /*
+   * read address channel (ar)
+   *
+   * ██████╗ ███████╗ █████╗ ██████╗      █████╗ ██████╗ ██████╗ ██████╗
+   * ██╔══██╗██╔════╝██╔══██╗██╔══██╗    ██╔══██╗██╔══██╗██╔══██╗██╔══██╗
+   * ██████╔╝█████╗  ███████║██║  ██║    ███████║██║  ██║██║  ██║██████╔╝
+   * ██╔══██╗██╔══╝  ██╔══██║██║  ██║    ██╔══██║██║  ██║██║  ██║██╔══██╗
+   * ██║  ██║███████╗██║  ██║██████╔╝    ██║  ██║██████╔╝██████╔╝██║  ██║
+   * ╚═╝  ╚═╝╚══════╝╚═╝  ╚═╝╚═════╝     ╚═╝  ╚═╝╚═════╝ ╚═════╝ ╚═╝  ╚═╝
+   *
+   */
+  axi4_ar_buffer
+    #(
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_ar_buffer
+    (
+      .axi4_aclk      ( Clk_CI            ),
+      .axi4_arstn     ( Rst_RBI           ),
+      .s_axi4_arid    ( s_axi4_arid[i]    ),
+      .s_axi4_araddr  ( s_axi4_araddr[i]  ),
+      .s_axi4_arvalid ( s_axi4_arvalid[i] ),
+      .s_axi4_arready ( s_axi4_arready[i] ),
+      .s_axi4_arlen   ( s_axi4_arlen[i]   ),
+      .s_axi4_arsize  ( s_axi4_arsize[i]  ),
+      .s_axi4_arburst ( s_axi4_arburst[i] ),
+      .s_axi4_arlock  ( s_axi4_arlock[i]  ),
+      .s_axi4_arprot  ( s_axi4_arprot[i]  ),
+      .s_axi4_arcache ( s_axi4_arcache[i] ),
+      .s_axi4_aruser  ( s_axi4_aruser[i]  ),
+      .m_axi4_arid    ( int_arid[i]       ),
+      .m_axi4_araddr  ( int_araddr[i]     ),
+      .m_axi4_arvalid ( int_arvalid[i]    ),
+      .m_axi4_arready ( int_arready[i]    ),
+      .m_axi4_arlen   ( int_arlen[i]      ),
+      .m_axi4_arsize  ( int_arsize[i]     ),
+      .m_axi4_arburst ( int_arburst[i]    ),
+      .m_axi4_arlock  ( int_arlock[i]     ),
+      .m_axi4_arprot  ( int_arprot[i]     ),
+      .m_axi4_arcache ( int_arcache[i]    ),
+      .m_axi4_aruser  ( int_aruser[i]     )
+    );
+
+  axi4_ar_sender
+    #(
+      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
+      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
+      )
+    u_ar_sender_m0
+    (
+      .axi4_aclk       ( Clk_CI                ),
+      .axi4_arstn      ( Rst_RBI               ),
+      .l1_done_o       ( l1_m0_ar_done[i]      ),
+      .l1_accept_i     ( l1_m0_ar_accept[i]    ),
+      .l1_drop_i       ( l1_m0_ar_drop[i]      ),
+      .l1_save_i       ( l1_m0_ar_save[i]      ),
+      .l2_done_o       ( l2_m0_ar_done[i]      ),
+      .l2_accept_i     ( l2_m0_ar_accept[i]    ),
+      .l2_drop_i       ( l2_m0_ar_drop[i]      ),
+      .l2_sending_o    ( l2_m0_ar_sending[i]   ),
+      .l1_araddr_i     ( int_rtrans_addr[i]    ),
+      .l2_araddr_i     ( l2_ar_addr[i]         ),
+      .s_axi4_arid     ( int_arid[i]           ),
+      .s_axi4_arvalid  ( int_m0_arvalid[i]     ),
+      .s_axi4_arready  ( int_m0_arready[i]     ),
+      .s_axi4_arlen    ( int_arlen[i]          ),
+      .s_axi4_arsize   ( int_arsize[i]         ),
+      .s_axi4_arburst  ( int_arburst[i]        ),
+      .s_axi4_arlock   ( int_arlock[i]         ),
+      .s_axi4_arprot   ( int_arprot[i]         ),
+      .s_axi4_arcache  ( int_arcache[i]        ),
+      .s_axi4_aruser   ( int_aruser[i]         ),
+      .m_axi4_arid     ( m0_axi4_arid[i]       ),
+      .m_axi4_araddr   ( m0_axi4_araddr[i]     ),
+      .m_axi4_arvalid  ( m0_axi4_arvalid[i]    ),
+      .m_axi4_arready  ( m0_axi4_arready[i]    ),
+      .m_axi4_arlen    ( m0_axi4_arlen[i]      ),
+      .m_axi4_arsize   ( m0_axi4_arsize[i]     ),
+      .m_axi4_arburst  ( m0_axi4_arburst[i]    ),
+      .m_axi4_arlock   ( m0_axi4_arlock[i]     ),
+      .m_axi4_arprot   ( m0_axi4_arprot[i]     ),
+      .m_axi4_arcache  (                       ),
+      .m_axi4_aruser   ( m0_axi4_aruser[i]     )
+    );
+
+    // The AXCACHE signals are set according to burstiness and cache coherence or statically
+    // when not connected to ACP on Zynq (implemented below).
+      assign m0_read_is_burst[i] = (m0_axi4_arlen[i] != {8{1'b0}}) && (m0_axi4_arburst[i] != 2'b00);
+    `ifndef EN_ACP
+      always_comb begin
+        if ( (l2_m0_ar_sending[i] & l2_cache_coherent[i]) | int_rtrans_cache_coherent[i]) begin
+          if (m0_read_is_burst[i]) begin
+            m0_axi4_arcache[i]  = 4'b1011;
+          end else begin
+            m0_axi4_arcache[i]  = 4'b1111;
+          end
+        end else begin
+          m0_axi4_arcache[i]    = 4'b0011;
+        end
+      end
+    `else
+      assign m0_axi4_arcache[i] = 4'b0011;
+    `endif
+
+  axi4_ar_sender
+    #(
+      .AXI_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH     ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH   ),
+      .ENABLE_L2TLB   ( ENABLE_L2TLB[i]  )
+      )
+    u_ar_sender_m1
+    (
+      .axi4_aclk       ( Clk_CI                ),
+      .axi4_arstn      ( Rst_RBI               ),
+      .l1_done_o       ( l1_m1_ar_done[i]      ),
+      .l1_accept_i     ( l1_m1_ar_accept[i]    ),
+      .l1_drop_i       ( l1_m1_ar_drop[i]      ),
+      .l1_save_i       ( l1_m1_ar_save[i]      ),
+      .l2_done_o       ( l2_m1_ar_done[i]      ),
+      .l2_accept_i     ( l2_m1_ar_accept[i]    ),
+      .l2_drop_i       ( l2_m1_ar_drop[i]      ),
+      .l2_sending_o    (                       ), // just helps to set axcache
+      .l1_araddr_i     ( int_rtrans_addr[i]    ),
+      .l2_araddr_i     ( l2_ar_addr[i]         ),
+      .s_axi4_arid     ( int_arid[i]           ),
+      .s_axi4_arvalid  ( int_m1_arvalid[i]     ),
+      .s_axi4_arready  ( int_m1_arready[i]     ),
+      .s_axi4_arlen    ( int_arlen[i]          ),
+      .s_axi4_arsize   ( int_arsize[i]         ),
+      .s_axi4_arburst  ( int_arburst[i]        ),
+      .s_axi4_arlock   ( int_arlock[i]         ),
+      .s_axi4_arprot   ( int_arprot[i]         ),
+      .s_axi4_arcache  ( int_arcache[i]        ),
+      .s_axi4_aruser   ( int_aruser[i]         ),
+      .m_axi4_arid     ( m1_axi4_arid[i]       ),
+      .m_axi4_araddr   ( m1_axi4_araddr[i]     ),
+      .m_axi4_arvalid  ( m1_axi4_arvalid[i]    ),
+      .m_axi4_arready  ( m1_axi4_arready[i]    ),
+      .m_axi4_arlen    ( m1_axi4_arlen[i]      ),
+      .m_axi4_arsize   ( m1_axi4_arsize[i]     ),
+      .m_axi4_arburst  ( m1_axi4_arburst[i]    ),
+      .m_axi4_arlock   ( m1_axi4_arlock[i]     ),
+      .m_axi4_arprot   ( m1_axi4_arprot[i]     ),
+      .m_axi4_arcache  (                       ),
+      .m_axi4_aruser   ( m1_axi4_aruser[i]     )
+    );
+
+    // The AXCACHE signals are set according to burstiness and cache coherence or statically
+    // when not connected to ACP on Zynq (implemented below).
+      assign m1_read_is_burst[i] = (m1_axi4_arlen[i] != {8{1'b0}}) && (m1_axi4_arburst[i] != 2'b00);
+    `ifdef EN_ACP
+      always_comb begin
+        if (m1_read_is_burst[i]) begin
+          m1_axi4_arcache[i]    = 4'b1011;
+        end else begin
+          m1_axi4_arcache[i]    = 4'b1111;
+        end
+      end
+    `else
+      assign m1_axi4_arcache[i] = 4'b0011;
+    `endif
+
+  // }}}
+
+  // Read Response channel (r) {{{
+  /*
+   * read response channel (r)
+   *
+   * ██████╗ ███████╗ █████╗ ██████╗     ██████╗ ███████╗███████╗██████╗
+   * ██╔══██╗██╔════╝██╔══██╗██╔══██╗    ██╔══██╗██╔════╝██╔════╝██╔══██╗
+   * ██████╔╝█████╗  ███████║██║  ██║    ██████╔╝█████╗  ███████╗██████╔╝
+   * ██╔══██╗██╔══╝  ██╔══██║██║  ██║    ██╔══██╗██╔══╝  ╚════██║██╔═══╝
+   * ██║  ██║███████╗██║  ██║██████╔╝    ██║  ██║███████╗███████║██║
+   * ╚═╝  ╚═╝╚══════╝╚═╝  ╚═╝╚═════╝     ╚═╝  ╚═╝╚══════╝╚══════╝╚═╝
+   *
+   */
+  axi4_r_buffer
+    #(
+      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_r_buffer_m0
+    (
+      .axi4_aclk     ( Clk_CI            ),
+      .axi4_arstn    ( Rst_RBI           ),
+      .s_axi4_rid    ( int_m0_rid[i]     ),
+      .s_axi4_rresp  ( int_m0_rresp[i]   ),
+      .s_axi4_rdata  ( int_m0_rdata[i]   ),
+      .s_axi4_rlast  ( int_m0_rlast[i]   ),
+      .s_axi4_rvalid ( int_m0_rvalid[i]  ),
+      .s_axi4_ruser  ( int_m0_ruser[i]   ),
+      .s_axi4_rready ( int_m0_rready[i]  ),
+      .m_axi4_rid    ( m0_axi4_rid[i]    ),
+      .m_axi4_rresp  ( m0_axi4_rresp[i]  ),
+      .m_axi4_rdata  ( m0_axi4_rdata[i]  ),
+      .m_axi4_rlast  ( m0_axi4_rlast[i]  ),
+      .m_axi4_rvalid ( m0_axi4_rvalid[i] ),
+      .m_axi4_ruser  ( m0_axi4_ruser[i]  ),
+      .m_axi4_rready ( m0_axi4_rready[i] )
+    );
+
+  axi4_r_buffer
+    #(
+      .AXI_DATA_WIDTH ( AXI_DATA_WIDTH ),
+      .AXI_ID_WIDTH   ( AXI_ID_WIDTH   ),
+      .AXI_USER_WIDTH ( AXI_USER_WIDTH )
+      )
+    u_r_buffer_m1
+    (
+      .axi4_aclk     ( Clk_CI            ),
+      .axi4_arstn    ( Rst_RBI           ),
+      .s_axi4_rid    ( int_m1_rid[i]     ),
+      .s_axi4_rresp  ( int_m1_rresp[i]   ),
+      .s_axi4_rdata  ( int_m1_rdata[i]   ),
+      .s_axi4_rlast  ( int_m1_rlast[i]   ),
+      .s_axi4_rvalid ( int_m1_rvalid[i]  ),
+      .s_axi4_ruser  ( int_m1_ruser[i]   ),
+      .s_axi4_rready ( int_m1_rready[i]  ),
+      .m_axi4_rid    ( m1_axi4_rid[i]    ),
+      .m_axi4_rresp  ( m1_axi4_rresp[i]  ),
+      .m_axi4_rdata  ( m1_axi4_rdata[i]  ),
+      .m_axi4_rlast  ( m1_axi4_rlast[i]  ),
+      .m_axi4_rvalid ( m1_axi4_rvalid[i] ),
+      .m_axi4_ruser  ( m1_axi4_ruser[i]  ),
+      .m_axi4_rready ( m1_axi4_rready[i] )
+    );
+
+  axi4_r_sender
+    #(
+      .AXI_DATA_WIDTH  ( AXI_DATA_WIDTH ),
+      .AXI_ID_WIDTH    ( AXI_ID_WIDTH   ),
+      .AXI_USER_WIDTH  ( AXI_USER_WIDTH )
+      )
+    u_r_sender
+    (
+      .axi4_aclk     ( Clk_CI              ),
+      .axi4_arstn    ( Rst_RBI             ),
+      .drop_i        ( lx_r_drop[i]        ),
+      .drop_len_i    ( lx_len_drop[i]      ),
+      .done_o        ( lx_r_done[i]        ),
+      .id_i          ( lx_id_drop[i]       ),
+      .prefetch_i    ( lx_prefetch_drop[i] ),
+      .hit_i         ( lx_hit_drop[i]      ),
+      .s_axi4_rid    ( s_axi4_rid[i]       ),
+      .s_axi4_rresp  ( s_axi4_rresp[i]     ),
+      .s_axi4_rdata  ( s_axi4_rdata[i]     ),
+      .s_axi4_rlast  ( s_axi4_rlast[i]     ),
+      .s_axi4_rvalid ( s_axi4_rvalid[i]    ),
+      .s_axi4_ruser  ( s_axi4_ruser[i]     ),
+      .s_axi4_rready ( s_axi4_rready[i]    ),
+      .m_axi4_rid    ( int_rid[i]          ),
+      .m_axi4_rresp  ( int_rresp[i]        ),
+      .m_axi4_rdata  ( int_rdata[i]        ),
+      .m_axi4_rlast  ( int_rlast[i]        ),
+      .m_axi4_rvalid ( int_rvalid[i]       ),
+      .m_axi4_ruser  ( int_ruser[i]        ),
+      .m_axi4_rready ( int_rready[i]       )
+    );
+
+  /*
+   * Multiplexer to switch between the two output master ports on the read response(r) channel
+   *
+   * Do not perform read burst interleaving as the DMA does not support it. This means we can only
+   * switch between the two masters upon sending rlast or when idle.
+   *
+   * However, if the downstream already performs burst interleaving, this cannot be undone here.
+   * Also, the downstream may interleave a burst reponse with a single-beat transaction. In this
+   * case, the FSM below falls out of the burst mode. To avoid it performing burst interleaving
+   * after such an event, it gives priority to the master which received the last burst in case
+   * both have a have a burst ready (rvalid).
+   *
+   * Order of priority:
+   * 1. Ongoing burst transaction
+   * 2. Single-beat transaction on Master 1.
+   * 3. Single-beat transaction on Master 0.
+   * 4. Burst transaction on master that received the last burst.
+   */
+  // Select signal
+  always_ff @(posedge Clk_CI) begin
+    if (Rst_RBI == 0) begin
+      RRespSel_SP[i] <= 1'b0;
+    end else begin
+      RRespSel_SP[i] <= RRespSel_SN[i];
+    end
+  end
+
+  // FSM
+  always_comb begin : RRespMuxFsm
+    RRespMuxCtrl_SN[i] = RRespMuxCtrl_SP[i];
+    RRespSel_SN[i]     = RRespSel_SP[i];
+
+    RRespBurst_S[i]    = 1'b0;
+    RRespSelIm_S[i]    = 1'b0;
+
+    unique case (RRespMuxCtrl_SP[i])
+
+      IDLE: begin
+        // immediately forward single-beat transactions
+        if      (int_m1_rvalid[i] && int_m1_rlast[i])
+          RRespSelIm_S[i] = 1'b1;
+        else if (int_m0_rvalid[i] && int_m0_rlast[i])
+          RRespSelIm_S[i] = 1'b0;
+
+        // bursts - they also start immediately
+        else if (int_m1_rvalid[i] || int_m0_rvalid[i]) begin
+          RRespMuxCtrl_SN[i] = BUSY;
+
+          // in case both are ready, continue with the master that had the last burst
+          if    (int_m1_rvalid[i] && int_m0_rvalid[i]) begin
+            RRespSel_SN[i]  = RRespSel_SP[i];
+            RRespSelIm_S[i] = RRespSel_SP[i];
+          end else if (int_m1_rvalid[i]) begin
+            RRespSel_SN[i]  = 1'b1;
+            RRespSelIm_S[i] = 1'b1;
+          end else begin
+            RRespSel_SN[i]  = 1'b0;
+            RRespSelIm_S[i] = 1'b0;
+          end
+        end
+      end
+
+      BUSY: begin
+        RRespBurst_S[i] = 1'b1;
+        // detect last handshake of currently ongoing transfer
+        if (int_rvalid[i] && int_rready[i] && int_rlast[i])
+          RRespMuxCtrl_SN[i] = IDLE;
+      end
+
+      default: begin
+        RRespMuxCtrl_SN[i] = IDLE;
+      end
+
+    endcase
+  end
+
+  // FSM state
+  always_ff @(posedge Clk_CI) begin
+    if (Rst_RBI == 0) begin
+      RRespMuxCtrl_SP[i] <= IDLE;
+    end else begin
+      RRespMuxCtrl_SP[i] <= RRespMuxCtrl_SN[i];
+    end
+  end
+
+  // Actual multiplexer
+  always_comb begin
+    if ( (RRespBurst_S[i] && RRespSel_SP[i]) || (!RRespBurst_S[i] && RRespSelIm_S[i]) ) begin
+      int_m0_rready[i] = 1'b0;
+      int_m1_rready[i] = int_rready[i];
+
+      int_rid[i]    = int_m1_rid[i];
+      int_rresp[i]  = int_m1_rresp[i];
+      int_rdata[i]  = int_m1_rdata[i];
+      int_rlast[i]  = int_m1_rlast[i];
+      int_ruser[i]  = int_m1_ruser[i];
+      int_rvalid[i] = int_m1_rvalid[i];
+    end else begin
+      int_m0_rready[i] = int_rready[i];
+      int_m1_rready[i] = 1'b0;
+
+      int_rid[i]    = int_m0_rid[i];
+      int_rresp[i]  = int_m0_rresp[i];
+      int_rdata[i]  = int_m0_rdata[i];
+      int_rlast[i]  = int_m0_rlast[i];
+      int_ruser[i]  = int_m0_ruser[i];
+      int_rvalid[i] = int_m0_rvalid[i];
+    end
+  end
+
+  end // BUF & SEND
+
+  // }}}
+
+  endgenerate // BUF & SEND }}}
+
+  // Log {{{
+
+`ifdef RAB_AX_LOG_EN
+  AxiBramLogger
+    #(
+      .AXI_ID_BITW     ( AXI_ID_WIDTH        ),
+      .AXI_ADDR_BITW   ( AXI_S_ADDR_WIDTH    ),
+      .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
+    )
+    u_aw_logger
+    (
+      .Clk_CI          ( NonGatedClk_CI    ),
+      .TimestampClk_CI ( Clk_CI            ),
+      .Rst_RBI         ( Rst_RBI           ),
+      .AxiValid_SI     ( s_axi4_awvalid[1] ),
+      .AxiReady_SI     ( s_axi4_awready[1] ),
+      .AxiId_DI        ( s_axi4_awid[1]    ),
+      .AxiAddr_DI      ( s_axi4_awaddr[1]  ),
+      .AxiLen_DI       ( s_axi4_awlen[1]   ),
+      .Clear_SI        ( AwLogClr_SI       ),
+      .LogEn_SI        ( LogEn_SI          ),
+      .Full_SO         ( int_aw_log_full   ),
+      .Ready_SO        ( AwLogRdy_SO       ),
+      .Bram_PS         ( AwBram_PS         )
+    );
+
+  AxiBramLogger
+    #(
+      .AXI_ID_BITW     ( AXI_ID_WIDTH        ),
+      .AXI_ADDR_BITW   ( AXI_S_ADDR_WIDTH    ),
+      .NUM_LOG_ENTRIES ( `RAB_AX_LOG_ENTRIES )
+    )
+    u_ar_logger
+    (
+      .Clk_CI          ( NonGatedClk_CI    ),
+      .TimestampClk_CI ( Clk_CI            ),
+      .Rst_RBI         ( Rst_RBI           ),
+      .AxiValid_SI     ( s_axi4_arvalid[1] ),
+      .AxiReady_SI     ( s_axi4_arready[1] ),
+      .AxiId_DI        ( s_axi4_arid[1]    ),
+      .AxiAddr_DI      ( s_axi4_araddr[1]  ),
+      .AxiLen_DI       ( s_axi4_arlen[1]   ),
+      .Clear_SI        ( ArLogClr_SI       ),
+      .LogEn_SI        ( LogEn_SI          ),
+      .Full_SO         ( int_ar_log_full   ),
+      .Ready_SO        ( ArLogRdy_SO       ),
+      .Bram_PS         ( ArBram_PS         )
+    );
+`endif
+
+  // }}}
+
+  // RAB Core {{{
+  // ██████╗  █████╗ ██████╗      ██████╗ ██████╗ ██████╗ ███████╗
+  // ██╔══██╗██╔══██╗██╔══██╗    ██╔════╝██╔═══██╗██╔══██╗██╔════╝
+  // ██████╔╝███████║██████╔╝    ██║     ██║   ██║██████╔╝█████╗
+  // ██╔══██╗██╔══██║██╔══██╗    ██║     ██║   ██║██╔══██╗██╔══╝
+  // ██║  ██║██║  ██║██████╔╝    ╚██████╗╚██████╔╝██║  ██║███████╗
+  // ╚═╝  ╚═╝╚═╝  ╚═╝╚═════╝      ╚═════╝ ╚═════╝ ╚═╝  ╚═╝╚══════╝
+  //
+  /*
+   * rab_core
+   *
+   * The rab core translates addresses. It has two ports, which can be used
+   * independently, however they will compete for time internally, as lookups
+   * are serialized.
+   *
+   * type is the read(0) or write(1) used to check the protection flags. If they
+   * don't match an interrupt is created on the int_prot line.
+   */
+
+  rab_core
+    #(
+      .N_PORTS             ( N_PORTS             ),
+      .N_L2_SETS           ( N_L2_SETS           ),
+      .N_L2_SET_ENTRIES    ( N_L2_SET_ENTRIES    ),
+      .AXI_DATA_WIDTH      ( AXI_DATA_WIDTH      ),
+      .AXI_S_ADDR_WIDTH    ( AXI_S_ADDR_WIDTH    ),
+      .AXI_M_ADDR_WIDTH    ( AXI_M_ADDR_WIDTH    ),
+      .AXI_LITE_DATA_WIDTH ( AXI_LITE_DATA_WIDTH ),
+      .AXI_LITE_ADDR_WIDTH ( AXI_LITE_ADDR_WIDTH ),
+      .AXI_ID_WIDTH        ( AXI_ID_WIDTH        ),
+      .AXI_USER_WIDTH      ( AXI_USER_WIDTH      ),
+      .MH_FIFO_DEPTH       ( MH_FIFO_DEPTH       )
+    )
+    u_rab_core
+    (
+      .Clk_CI               ( Clk_CI                     ),
+      .Rst_RBI              ( Rst_RBI                    ),
+
+      // Config IF
+      .s_axi_awaddr         ( s_axi4lite_awaddr          ),
+      .s_axi_awvalid        ( s_axi4lite_awvalid         ),
+      .s_axi_awready        ( s_axi4lite_awready         ),
+      .s_axi_wdata          ( s_axi4lite_wdata           ),
+      .s_axi_wstrb          ( s_axi4lite_wstrb           ),
+      .s_axi_wvalid         ( s_axi4lite_wvalid          ),
+      .s_axi_wready         ( s_axi4lite_wready          ),
+      .s_axi_bresp          ( s_axi4lite_bresp           ),
+      .s_axi_bvalid         ( s_axi4lite_bvalid          ),
+      .s_axi_bready         ( s_axi4lite_bready          ),
+      .s_axi_araddr         ( s_axi4lite_araddr          ),
+      .s_axi_arvalid        ( s_axi4lite_arvalid         ),
+      .s_axi_arready        ( s_axi4lite_arready         ),
+      .s_axi_rready         ( s_axi4lite_rready          ),
+      .s_axi_rdata          ( s_axi4lite_rdata           ),
+      .s_axi_rresp          ( s_axi4lite_rresp           ),
+      .s_axi_rvalid         ( s_axi4lite_rvalid          ),
+
+      // L1 miss info outputs -> L2 TLB arbitration
+      .int_miss             ( rab_miss                   ),
+      .int_multi            ( rab_multi                  ),
+      .int_prot             ( rab_prot                   ),
+      .int_prefetch         ( rab_prefetch               ),
+      .int_mhf_full         ( int_mhf_full               ),
+
+      // L1 transaction info outputs -> L2 TLB arbitration
+      .int_axaddr_o         ( L1OutAddr_D                ),
+      .int_axid_o           ( L1OutId_D                  ),
+      .int_axlen_o          ( L1OutLen_D                 ),
+      .int_axuser_o         ( L1OutUser_D                ),
+
+      // Write Req IF
+      .port1_addr           ( int_awaddr                 ),
+      .port1_id             ( int_awid                   ),
+      .port1_len            ( int_awlen                  ),
+      .port1_size           ( int_awsize                 ),
+      .port1_addr_valid     ( int_awvalid & ~aw_in_stall ), // avoid the FSM accepting new AW requests
+      .port1_type           ( {N_PORTS{1'b1}}            ),
+      .port1_user           ( int_awuser                 ),
+      .port1_sent           ( int_wtrans_sent            ), // signal done to L1 FSM
+      .port1_out_addr       ( int_wtrans_addr            ),
+      .port1_cache_coherent ( int_wtrans_cache_coherent  ),
+      .port1_accept         ( int_wtrans_accept          ),
+      .port1_drop           ( int_wtrans_drop            ),
+      .port1_miss           ( int_wtrans_miss            ),
+
+      // Read Req IF
+      .port2_addr           ( int_araddr                 ),
+      .port2_id             ( int_arid                   ),
+      .port2_len            ( int_arlen                  ),
+      .port2_size           ( int_arsize                 ),
+      .port2_addr_valid     ( int_arvalid                ),
+      .port2_type           ( {N_PORTS{1'b0}}            ),
+      .port2_user           ( int_aruser                 ),
+      .port2_sent           ( int_rtrans_sent            ), // signal done to L1 FSM
+      .port2_out_addr       ( int_rtrans_addr            ),
+      .port2_cache_coherent ( int_rtrans_cache_coherent  ),
+      .port2_accept         ( int_rtrans_accept          ),
+      .port2_drop           ( int_rtrans_drop            ),
+      .port2_miss           ( int_rtrans_miss            ),
+
+      // L2 miss info inputs -> axi_rab_cfg
+      .miss_l2_i            ( L2Miss_S                   ),
+      .miss_l2_addr_i       ( L2OutInAddr_DP             ),
+      .miss_l2_id_i         ( L2OutId_DP                 ),
+      .miss_l2_user_i       ( L2OutUser_DP               ),
+
+      // L2 config outputs
+      .wdata_l2_o           ( L2CfgWData_D               ),
+      .waddr_l2_o           ( L2CfgWAddr_D               ),
+      .wren_l2_o            ( L2CfgWE_S                  )
+    );
+
+  // }}}
+
+  // AX SPLITS {{{
+  //  █████╗ ██╗  ██╗    ███████╗██████╗ ██╗     ██╗████████╗
+  // ██╔══██╗╚██╗██╔╝    ██╔════╝██╔══██╗██║     ██║╚══██╔══╝
+  // ███████║ ╚███╔╝     ███████╗██████╔╝██║     ██║   ██║
+  // ██╔══██║ ██╔██╗     ╚════██║██╔═══╝ ██║     ██║   ██║
+  // ██║  ██║██╔╝ ██╗    ███████║██║     ███████╗██║   ██║
+  // ╚═╝  ╚═╝╚═╝  ╚═╝    ╚══════╝╚═╝     ╚══════╝╚═╝   ╚═╝
+  //
+  /**
+   * Multiplex the two output master ports of the Read Address and Write Address (AR/AW) channels.
+   *
+   * Use the `int_xmaster_select` signal to route the signals to either Master 0 (to memory) or
+   * Master 1 (to ACP). In case of an L1 miss: Route the signals to both masters. They shall be
+   * saved until the L2 outputs are available.
+   */
+  generate for (i = 0; i < N_PORTS; i++) begin : AX_SPLIT
+
+    /*
+     * When accepting L1 transactions, we must just do so on the selected master. Drop requests must
+     * be performed on any one of the two masters. Save requests must be performed by both masters.
+     */
+    always_comb begin : AW_L1_SPLIT
+
+      // TLB handshake
+      l1_m0_aw_accept[i] = 1'b0;
+      l1_m1_aw_accept[i] = 1'b0;
+      l1_m0_aw_drop[i]   = 1'b0;
+      l1_m1_aw_drop[i]   = 1'b0;
+      l1_m0_aw_save[i]   = 1'b0;
+      l1_m1_aw_save[i]   = 1'b0;
+
+      l1_mx_aw_done[i]   = 1'b0;
+
+      // AXI sender input handshake
+      int_m0_awvalid[i]  = 1'b0;
+      int_m1_awvalid[i]  = 1'b0;
+      int_awready[i]     = 1'b0;
+
+      // accept on selected master only
+      if (l1_aw_accept[i]) begin
+        if (int_wmaster_select[i]) begin
+          l1_m1_aw_accept[i] = 1'b1;
+          l1_mx_aw_done[i]   = l1_m1_aw_done[i];
+
+          int_m1_awvalid[i]  = int_awvalid[i];
+          int_awready[i]     = int_m1_awready[i];
+
+        end else begin
+          l1_m0_aw_accept[i] = 1'b1;
+          l1_mx_aw_done[i]   = l1_m0_aw_done[i];
+
+          int_m0_awvalid[i]  = int_awvalid[i];
+          int_awready[i]     = int_m0_awready[i];
+        end
+
+      // drop on Master 0 only
+      end else if (l1_aw_drop[i]) begin
+        l1_m0_aw_drop[i]     = 1'b1;
+        l1_mx_aw_done[i]     = l1_m0_aw_done[i];
+
+        int_m0_awvalid[i]    = int_awvalid[i];
+        int_awready[i]       = l1_m0_aw_done[i];
+
+      // save on both masters
+      end else if (l1_aw_save[i]) begin
+        // split save
+        l1_m0_aw_save[i]     = ~l1_m0_aw_done_SP[i];
+        l1_m1_aw_save[i]     = ~l1_m1_aw_done_SP[i];
+
+        // combine done
+        l1_mx_aw_done[i]     = l1_m0_aw_done_SP[i] & l1_m1_aw_done_SP[i];
+
+        int_m0_awvalid[i]    = int_awvalid[i];
+        int_m1_awvalid[i]    = int_awvalid[i];
+        int_awready[i]       = l1_mx_aw_done[i];
+      end
+    end
+
+    // signal back to handshake splitter
+    assign l1_aw_done[i]     = l1_mx_aw_done[i];
+
+    always_ff @(posedge Clk_CI) begin : L1_MX_AW_DONE_REG
+      if (Rst_RBI == 0) begin
+        l1_m0_aw_done_SP[i] <= 1'b0;
+        l1_m1_aw_done_SP[i] <= 1'b0;
+      end else if (l1_mx_aw_done[i]) begin
+        l1_m0_aw_done_SP[i] <= 1'b0;
+        l1_m1_aw_done_SP[i] <= 1'b0;
+      end else begin
+        l1_m0_aw_done_SP[i] <= l1_m0_aw_done_SP[i] | l1_m0_aw_done[i];
+        l1_m1_aw_done_SP[i] <= l1_m1_aw_done_SP[i] | l1_m1_aw_done[i];
+      end
+    end
+
+    /*
+     * When accepting L2 transactions, we must drop the corresponding transaction from the other
+     * master to make it available again for save requests from L1_DROP_SAVE.
+     */
+    always_comb begin : AW_L2_SPLIT
+
+      l2_m0_aw_accept[i] = 1'b0;
+      l2_m1_aw_accept[i] = 1'b0;
+      l2_m0_aw_drop[i]   = 1'b0;
+      l2_m1_aw_drop[i]   = 1'b0;
+
+      // de-assert request signals individually upon handshakes
+      if (l2_aw_accept[i]) begin
+        if (l2_master_select[i]) begin
+          l2_m1_aw_accept[i] = ~l2_m1_aw_done_SP[i];
+          l2_m0_aw_drop[i]   = ~l2_m0_aw_done_SP[i];
+
+        end else begin
+          l2_m0_aw_accept[i] = ~l2_m0_aw_done_SP[i];
+          l2_m1_aw_drop[i]   = ~l2_m1_aw_done_SP[i];
+
+        end
+      end else begin
+        l2_m0_aw_drop[i]     = ~l2_m0_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
+        l2_m1_aw_drop[i]     = ~l2_m1_aw_done_SP[i] ? l2_aw_drop[i] : 1'b0;
+
+      end
+
+      // combine done
+      l2_mx_aw_done[i] = l2_m0_aw_done_SP[i] & l2_m1_aw_done_SP[i];
+
+      l2_aw_done[i]    = l2_mx_aw_done[i];
+    end
+
+    always_ff @(posedge Clk_CI) begin : L2_MX_AW_DONE_REG
+      if (Rst_RBI == 0) begin
+        l2_m0_aw_done_SP[i] <= 1'b0;
+        l2_m1_aw_done_SP[i] <= 1'b0;
+      end else if (l2_mx_aw_done[i]) begin
+        l2_m0_aw_done_SP[i] <= 1'b0;
+        l2_m1_aw_done_SP[i] <= 1'b0;
+      end else begin
+        l2_m0_aw_done_SP[i] <= l2_m0_aw_done_SP[i] | l2_m0_aw_done[i];
+        l2_m1_aw_done_SP[i] <= l2_m1_aw_done_SP[i] | l2_m1_aw_done[i];
+      end
+    end
+
+    /*
+     * When accepting L1 transactions, we must just do so on the selected master. Drop requests must
+     * be performed on any one of the two masters. Save requests must be performed by both masters.
+     */
+    always_comb begin : AR_L1_SPLIT
+
+      // TLB handshake
+      l1_m0_ar_accept[i] = 1'b0;
+      l1_m1_ar_accept[i] = 1'b0;
+      l1_m0_ar_drop[i]   = 1'b0;
+      l1_m1_ar_drop[i]   = 1'b0;
+      l1_m0_ar_save[i]   = 1'b0;
+      l1_m1_ar_save[i]   = 1'b0;
+
+      l1_mx_ar_done[i]   = 1'b0;
+
+      // AXI sender input handshake
+      int_m0_arvalid[i]  = 1'b0;
+      int_m1_arvalid[i]  = 1'b0;
+      int_arready[i]     = 1'b0;
+
+      // accept on selected master only
+      if (l1_ar_accept[i]) begin
+        if (int_rmaster_select[i]) begin
+          l1_m1_ar_accept[i] = 1'b1;
+          l1_mx_ar_done[i]   = l1_m1_ar_done[i];
+
+          int_m1_arvalid[i]  = int_arvalid[i];
+          int_arready[i]     = int_m1_arready[i];
+
+        end else begin
+          l1_m0_ar_accept[i] = 1'b1;
+          l1_mx_ar_done[i]   = l1_m0_ar_done[i];
+
+          int_m0_arvalid[i]  = int_arvalid[i];
+          int_arready[i]     = int_m0_arready[i];
+        end
+
+      // drop on Master 0 only
+      end else if (l1_ar_drop[i]) begin
+        l1_m0_ar_drop[i]     = 1'b1;
+        l1_mx_ar_done[i]     = l1_m0_ar_done[i];
+
+        int_m0_arvalid[i]    = int_arvalid[i];
+        int_arready[i]       = l1_m0_ar_done[i];
+
+      // save on both masters
+      end else if (l1_ar_save[i]) begin
+        // split save
+        l1_m0_ar_save[i]     = ~l1_m0_ar_done_SP[i];
+        l1_m1_ar_save[i]     = ~l1_m1_ar_done_SP[i];
+
+        // combine done
+        l1_mx_ar_done[i]     = l1_m0_ar_done_SP[i] & l1_m1_ar_done_SP[i];
+
+        int_m0_arvalid[i]    = int_arvalid[i];
+        int_m1_arvalid[i]    = int_arvalid[i];
+        int_arready[i]       = l1_mx_ar_done[i];
+      end
+    end
+
+    // signal back to handshake splitter
+    assign l1_ar_done[i]     = l1_mx_ar_done[i];
+
+    always_ff @(posedge Clk_CI) begin : L1_MX_AR_DONE_REG
+      if (Rst_RBI == 0) begin
+        l1_m0_ar_done_SP[i] <= 1'b0;
+        l1_m1_ar_done_SP[i] <= 1'b0;
+      end else if (l1_mx_ar_done[i]) begin
+        l1_m0_ar_done_SP[i] <= 1'b0;
+        l1_m1_ar_done_SP[i] <= 1'b0;
+      end else begin
+        l1_m0_ar_done_SP[i] <= l1_m0_ar_done_SP[i] | l1_m0_ar_done[i];
+        l1_m1_ar_done_SP[i] <= l1_m1_ar_done_SP[i] | l1_m1_ar_done[i];
+      end
+    end
+
+    /*
+     * When accepting L2 transactions, we must drop the corresponding transaction from the other
+     * master to make it available again for save requests from L1_DROP_SAVE.
+     */
+    always_comb begin : AR_L2_SPLIT
+
+      l2_m0_ar_accept[i] = 1'b0;
+      l2_m1_ar_accept[i] = 1'b0;
+      l2_m0_ar_drop[i]   = 1'b0;
+      l2_m1_ar_drop[i]   = 1'b0;
+
+      // de-assert request signals individually upon handshakes
+      if (l2_ar_accept[i]) begin
+        if (l2_master_select[i]) begin
+          l2_m1_ar_accept[i] = ~l2_m1_ar_done_SP[i];
+          l2_m0_ar_drop[i]   = ~l2_m0_ar_done_SP[i];
+
+        end else begin
+          l2_m0_ar_accept[i] = ~l2_m0_ar_done_SP[i];
+          l2_m1_ar_drop[i]   = ~l2_m1_ar_done_SP[i];
+
+        end
+      end else if (l2_ar_drop[i]) begin
+        l2_m0_ar_drop[i]     = ~l2_m0_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
+        l2_m1_ar_drop[i]     = ~l2_m1_ar_done_SP[i] ? l2_ar_drop[i] : 1'b0;
+
+      end
+
+      // combine done
+      l2_mx_ar_done[i] = l2_m0_ar_done_SP[i] & l2_m1_ar_done_SP[i];
+
+      l2_ar_done[i]    = l2_mx_ar_done[i];
+    end
+
+    always_ff @(posedge Clk_CI) begin : L2_MX_AR_DONE_REG
+      if (Rst_RBI == 0) begin
+        l2_m0_ar_done_SP[i] <= 1'b0;
+        l2_m1_ar_done_SP[i] <= 1'b0;
+      end else if (l2_mx_ar_done[i]) begin
+        l2_m0_ar_done_SP[i] <= 1'b0;
+        l2_m1_ar_done_SP[i] <= 1'b0;
+      end else begin
+        l2_m0_ar_done_SP[i] <= l2_m0_ar_done_SP[i] | l2_m0_ar_done[i];
+        l2_m1_ar_done_SP[i] <= l2_m1_ar_done_SP[i] | l2_m1_ar_done[i];
+      end
+    end
+
+  end // AX_SPLIT
+  endgenerate // AX_SPLIT
+
+  // }}}
+
+  // HANDSHAKE SPLITS {{{
+  // ██╗  ██╗███████╗    ███████╗██████╗ ██╗     ██╗████████╗
+  // ██║  ██║██╔════╝    ██╔════╝██╔══██╗██║     ██║╚══██╔══╝
+  // ███████║███████╗    ███████╗██████╔╝██║     ██║   ██║
+  // ██╔══██║╚════██║    ╚════██║██╔═══╝ ██║     ██║   ██║
+  // ██║  ██║███████║    ███████║██║     ███████╗██║   ██║
+  // ╚═╝  ╚═╝╚══════╝    ╚══════╝╚═╝     ╚══════╝╚═╝   ╚═╝
+  //
+  /*
+   * We need to perform combined handshakes with multiple AXI modules
+   * upon transactions drops, accepts, saves etc. from two TLBs.
+   */
+  generate for (i = 0; i < N_PORTS; i++) begin : HANDSHAKE_SPLIT
+
+    assign l1_xw_accept[i]    = int_wtrans_accept[i] & ~aw_out_stall[i];
+    assign int_wtrans_sent[i] = l1_xw_done[i];
+
+    assign l1_ar_accept[i]    = int_rtrans_accept[i];
+    assign int_rtrans_sent[i] = l1_ar_done[i];
+
+    /*
+     * L1 AW sender + W buffer handshake split
+     */
+    // forward
+    assign l1_aw_accept[i] = l1_xw_accept[i] & ~l1_aw_done_SP[i];
+    assign l1_w_accept[i]  = l1_xw_accept[i] & ~l1_w_done_SP[i];
+
+    assign l1_aw_save[i]   = l1_xw_save[i]   & ~l1_aw_done_SP[i];
+    assign l1_w_save[i]    = l1_xw_save[i]   & ~l1_w_done_SP[i];
+
+    assign l1_aw_drop[i]   = l1_xw_drop[i]   & ~l1_aw_done_SP[i];
+    assign l1_w_drop[i]    = l1_xw_drop[i]   & ~l1_w_done_SP[i];
+
+    // backward
+    assign l1_xw_done[i]   = l1_aw_done_SP[i] & l1_w_done_SP[i];
+
+    always_ff @(posedge Clk_CI) begin : L1_XW_HS_SPLIT
+      if (Rst_RBI == 0) begin
+        l1_aw_done_SP[i] <= 1'b0;
+        l1_w_done_SP[i]  <= 1'b0;
+      end else if (l1_xw_done[i]) begin
+        l1_aw_done_SP[i] <= 1'b0;
+        l1_w_done_SP[i]  <= 1'b0;
+      end else begin
+        l1_aw_done_SP[i] <= l1_aw_done_SP[i] | l1_aw_done[i];
+        l1_w_done_SP[i]  <= l1_w_done_SP[i]  | l1_w_done[i];
+      end
+    end
+
+    if (ENABLE_L2TLB[i] == 1) begin : L2_HS_SPLIT
+
+      /*
+       * L1 AR sender + R sender handshake split
+       *
+       * AR and R do not need to be strictly in sync. We thus use separate handshakes.
+       * But the handshake signals for the R sender are multiplexed with the those for
+       * the L2. However, L2_ACCEPT_DROP_SAVE has always higher priority.
+       */
+      assign lx_r_drop[i] = l2_r_drop[i] | l1_r_drop[i];
+      assign l1_r_done[i] = l2_r_drop[i] ? 1'b0         : lx_r_done[i];
+      assign l2_r_done[i] = l2_r_drop[i] ? lx_r_done[i] : 1'b0;
+
+      /*
+       * L2 AW sender + W buffer handshake split
+       */
+      // forward
+      assign l2_aw_accept[i] = l2_xw_accept[i] & ~l2_aw_done_SP[i];
+      assign l2_w_accept[i]  = l2_xw_accept[i] & ~l2_w_done_SP[i];
+
+      assign l2_aw_drop[i]   = l2_xw_drop[i]   & ~l2_aw_done_SP[i];
+      assign l2_w_drop[i]    = l2_xw_drop[i]   & ~l2_w_done_SP[i];
+
+      // backward
+      assign l2_xw_done[i]   = l2_aw_done_SP[i] & l2_w_done_SP[i];
+
+      always_ff @(posedge Clk_CI) begin : L2_XW_HS_SPLIT
+        if (Rst_RBI == 0) begin
+          l2_aw_done_SP[i] <= 1'b0;
+          l2_w_done_SP[i]  <= 1'b0;
+        end else if (l2_xw_done[i]) begin
+          l2_aw_done_SP[i] <= 1'b0;
+          l2_w_done_SP[i]  <= 1'b0;
+        end else begin
+          l2_aw_done_SP[i] <= l2_aw_done_SP[i] | l2_aw_done[i];
+          l2_w_done_SP[i]  <= l2_w_done_SP[i]  | l2_w_done[i];
+        end
+      end
+
+      /*
+       * L2 AR + R sender handshake split
+       */
+      // forward
+      assign l2_ar_drop[i]   = l2_xr_drop[i]   & ~l2_ar_done_SP[i];
+      assign l2_r_drop[i]    = l2_xr_drop[i]   & ~l2_r_done_SP[i];
+
+      // backward - make sure to always clear L2_XR_HS_SPLIT
+      always_comb begin
+        if (l2_xr_drop[i]) begin
+          l2_xr_done[i]      = l2_ar_done_SP[i] & l2_r_done_SP[i];
+        end else begin
+          l2_xr_done[i]      = l2_ar_done_SP[i];
+        end
+      end
+
+      always_ff @(posedge Clk_CI) begin : L2_XR_HS_SPLIT
+        if (Rst_RBI == 0) begin
+          l2_ar_done_SP[i] <= 1'b0;
+          l2_r_done_SP[i]  <= 1'b0;
+        end else if (l2_xr_done[i]) begin
+          l2_ar_done_SP[i] <= 1'b0;
+          l2_r_done_SP[i]  <= 1'b0;
+        end else begin
+          l2_ar_done_SP[i] <= l2_ar_done_SP[i] | l2_ar_done[i];
+          l2_r_done_SP[i]  <= l2_r_done_SP[i]  | l2_r_done[i];
+        end
+      end
+
+    end else begin // if (ENABLE_L2TLB[i] == 1)
+
+      assign lx_r_drop[i]     = l1_r_drop[i];
+      assign l1_r_done[i]     = lx_r_done[i];
+
+      assign l2_aw_accept[i]  = 1'b0;
+      assign l2_w_accept[i]   = 1'b0;
+      assign l2_aw_drop[i]    = 1'b0;
+      assign l2_w_drop[i]     = 1'b0;
+      assign l2_xw_done[i]    = 1'b0;
+      assign l2_aw_done_SP[i] = 1'b0;
+      assign l2_w_done_SP[i]  = 1'b0;
+
+      assign l2_ar_accept[i]  = 1'b0;
+      assign l2_ar_drop[i]    = 1'b0;
+      assign l2_r_drop[i]     = 1'b0;
+      assign l2_xr_done[i]    = 1'b0;
+      assign l2_r_done[i]     = 1'b0;
+      assign l2_ar_done_SP[i] = 1'b0;
+      assign l2_r_done_SP[i]  = 1'b0;
+
+    end // if (ENABLE_L2TLB[i] == 1)
+
+  end // HANDSHAKE_SPLIT
+  endgenerate // HANDSHAKE_SPLIT
+
+  // }}}
+
+  // L2 TLB {{{
+  // ██╗     ██████╗     ████████╗██╗     ██████╗
+  // ██║     ╚════██╗    ╚══██╔══╝██║     ██╔══██╗
+  // ██║      █████╔╝       ██║   ██║     ██████╔╝
+  // ██║     ██╔═══╝        ██║   ██║     ██╔══██╗
+  // ███████╗███████╗       ██║   ███████╗██████╔╝
+  // ╚══════╝╚══════╝       ╚═╝   ╚══════╝╚═════╝
+  //
+  /*
+   * l2_tlb
+   *
+   * The L2 TLB translates addresses upon misses in the L1 TLB (rab_core).
+   *
+   * It supports one ongoing translation at a time. If an L1 miss occurs while the L2 is busy,
+   * the L1 is stalled untill the L2 is available again.
+   *
+   */
+  generate for (i = 0; i < N_PORTS; i++) begin : L2_TLB
+    if (ENABLE_L2TLB[i] == 1) begin : L2_TLB
+
+      /*
+       * L1 output selector
+       */
+      assign L1OutRwType_D[i] = int_wtrans_drop[i] ? 1'b1 : 1'b0;
+      assign L1OutProt_D[i]   = rab_prot[i];
+      assign L1OutMulti_D[i]  = rab_multi[i];
+
+      /*
+       * L1 output control + L1_DROP_BUF, L2_IN_BUF management
+       *
+       * Forward the L1 drop request to AR/AW sender modules if
+       * 1. the transactions needs to be dropped (L1 multi, prot, prefetch), or
+       * 2. if a lookup in the L2 TLB is required (L1 miss) and the input buffer is not full.
+       *
+       * The AR/AW senders do not support more than 1 oustanding L1 miss. The push back towards
+       * the upstream is realized by not accepting the save request (saving the L1 transaction)
+       * in the senders as long as the L2 TLB is busy or has valid output. This ultimately
+       * blocks the L1 TLB.
+       *
+       * Together with the AW drop/save, we also perform the W drop/save as AW and W need to
+       * absolutely remain in order. In contrast, the R drop is performed
+       */
+      always_comb begin : L1_DROP_SAVE
+
+        l1_ar_drop[i]       = 1'b0;
+        l1_ar_save[i]       = 1'b0;
+        l1_xw_drop[i]       = 1'b0;
+        l1_xw_save[i]       = 1'b0;
+
+        l1_id_drop[i]       = L1OutId_D[i];
+        l1_len_drop[i]      = L1OutLen_D[i];
+        l1_prefetch_drop[i] = rab_prefetch[i];
+        l1_hit_drop[i]      = 1'b1; // there are no drops for L1 misses
+
+        L1DropEn_S[i]       = 1'b0;
+        L2InEn_S[i]         = 1'b0;
+
+        if ( rab_prot[i] | rab_multi[i] | rab_prefetch[i] ) begin
+          // 1. Drop
+          l1_ar_drop[i] = int_rtrans_drop[i] & ~L1DropValid_SP[i];
+          l1_xw_drop[i] = int_wtrans_drop[i] & ~L1DropValid_SP[i];
+
+          // Store to L1_DROP_BUF upon handshake
+          L1DropEn_S[i] = (l1_ar_drop[i] & l1_ar_done[i]) |
+                          (l1_xw_drop[i] & l1_xw_done[i]);
+
+        end else if ( rab_miss[i] ) begin
+          // 2. Save - Make sure L2 is really available.
+          l1_ar_save[i] = int_rtrans_drop[i] & ~L2Busy_S[i];
+          l1_xw_save[i] = int_wtrans_drop[i] & ~L2Busy_S[i];
+
+          // Store to L2_IN_BUF upon handshake - triggers the L2 TLB
+          L2InEn_S[i]   = (l1_ar_save[i] & l1_ar_done[i]) |
+                          (l1_xw_save[i] & l1_xw_done[i]);
+        end
+      end
+
+      /*
+       * L2 output control + L2_OUT_BUF management + R/B sender control + W buffer control
+       *
+       * Perform L1 R transaction drops unless the L2 output buffer holds valid data. The AXI specs
+       * require the B response to be sent only after consuming/discarding the corresponding data
+       * in the W channel. Thus, we only send L2 drop request to the W buffer here. The drop
+       * request to the B sender is then sent by the W buffer autonomously.
+       *
+       * L1 AW/W drop requests are managed by L1_DROP_SAVE.
+       */
+      always_comb begin : L2_ACCEPT_DROP_SAVE
+
+        l2_ar_addr[i]       =  'b0;
+        l2_aw_addr[i]       =  'b0;
+        l2_ar_accept[i]     = 1'b0;
+        l2_xr_drop[i]       = 1'b0;
+        l2_xw_accept[i]     = 1'b0;
+        l2_xw_drop[i]       = 1'b0;
+
+        l1_r_drop[i]        = 1'b0;
+
+        lx_id_drop[i]       =  'b0;
+        lx_len_drop[i]      =  'b0;
+        lx_prefetch_drop[i] = 1'b0;
+        lx_hit_drop[i]      = 1'b0;
+
+        L1DropValid_SN[i]   = L1DropValid_SP[i] | L1DropEn_S[i];
+        L2OutValid_SN[i]    = L2OutValid_SP[i];
+        L2OutReady_S[i]     = 1'b0;
+        L2OutEn_S[i]        = 1'b0;
+
+        L2Miss_S[i]         = 1'b0;
+        int_multi[i]        = 1'b0;
+        int_prot[i]         = 1'b0;
+
+        if (L2OutValid_SP[i] == 1'b0) begin
+
+          // Drop L1 from R senders
+          if (L1DropValid_SP[i] == 1'b1) begin
+
+            // Only perform the R sender drop here.
+            if (~L1DropRwType_DP[i]) begin
+
+              l1_r_drop[i]        = 1'b1;
+              lx_id_drop[i]       = L1DropId_DP[i];
+              lx_len_drop[i]      = L1DropLen_DP[i];
+              lx_prefetch_drop[i] = L1DropPrefetch_S[i];
+              lx_hit_drop[i]      = 1'b1; // there are no drops for L1 misses
+
+              // Invalidate L1_DROP_BUF upon handshake
+              if ( l1_r_drop[i] & l1_r_done[i] ) begin
+
+                L1DropValid_SN[i] = 1'b0;
+                int_prot[i]       = L1DropProt_DP[i];
+                int_multi[i]      = L1DropMulti_DP[i];
+              end
+
+            end else begin
+              // Invalidate L1_DROP_BUF
+              L1DropValid_SN[i]   = 1'b0;
+              int_prot[i]         = L1DropProt_DP[i];
+              int_multi[i]        = L1DropMulti_DP[i];
+            end
+          end
+
+        end else begin // L2_OUT_BUF has valid data
+
+          if ( L2OutHit_SP[i] & ~(L2OutPrefetch_S[i] | L2OutProt_SP[i] | L2OutMulti_SP[i]) ) begin
+
+            l2_ar_addr[i]       = L2OutAddr_DP[i];
+            l2_aw_addr[i]       = L2OutAddr_DP[i];
+
+            l2_ar_accept[i]     = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
+            l2_xw_accept[i]     = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
+
+            // Invalidate L2_OUT_BUF upon handshake
+            L2OutValid_SN[i] = ~( (l2_ar_accept[i] & l2_ar_done[i]) |
+                                  (l2_xw_accept[i] & l2_xw_done[i]) );
+          end else begin
+
+            lx_id_drop[i]       = L2OutId_DP[i];
+            lx_len_drop[i]      = L2OutLen_DP[i];
+            lx_prefetch_drop[i] = L2OutPrefetch_S[i];
+            lx_hit_drop[i]      = L2OutHit_SP[i];
+
+            // The l2_xr_drop will also perform the handshake with the R sender
+            l2_xr_drop[i]       = L2OutRwType_DP[i] ? 1'b0 : 1'b1;
+            l2_xw_drop[i]       = L2OutRwType_DP[i] ? 1'b1 : 1'b0;
+
+            // Invalidate L1_DROP_BUF upon handshake
+            if ( (l2_xr_drop[i] & l2_xr_done[i]) | (l2_xw_drop[i] & l2_xw_done[i]) ) begin
+
+              L2OutValid_SN[i]  = 1'b0;
+              L2Miss_S[i]       = ~L2OutHit_SP[i];
+              int_prot[i]       = L2OutProt_SP[i];
+              int_multi[i]      = L2OutMulti_SP[i];
+            end
+          end
+        end
+
+        // Only accept new L2 output after ongoing drops have finished.
+        if ( (l2_xr_drop[i] == l2_xr_done[i]) &
+             (l2_xw_drop[i] == l2_xw_done[i]) &
+             (l1_r_drop[i]  == l1_r_done[i] ) ) begin
+          // Store to L2_OUT_BUF upon handshake with L2 TLB module
+          if ( (L2OutValid_SP[i] == 1'b0) && (L2OutValid_S[i] == 1'b1) ) begin
+            L2OutValid_SN[i]   = 1'b1;
+            L2OutReady_S[i]    = 1'b1;
+            L2OutEn_S[i]       = 1'b1;
+          end
+        end
+      end
+
+      /*
+       * L1 drop buffer
+       *
+       * Used in case of multi, prot and prefetch hits in the L1 TLB.
+       */
+      always_ff @(posedge Clk_CI) begin : L1_DROP_BUF
+         if (Rst_RBI == 0) begin
+            L1DropProt_DP[i]   <= 1'b0;
+            L1DropMulti_DP[i]  <= 1'b0;
+            L1DropRwType_DP[i] <= 1'b0;
+            L1DropUser_DP[i]   <=  'b0;
+            L1DropId_DP[i]     <=  'b0;
+            L1DropLen_DP[i]    <=  'b0;
+            L1DropAddr_DP[i]   <=  'b0;
+         end else if (L1DropEn_S[i] == 1'b1) begin
+            L1DropProt_DP[i]   <= L1OutProt_D[i]  ;
+            L1DropMulti_DP[i]  <= L1OutMulti_D[i] ;
+            L1DropRwType_DP[i] <= L1OutRwType_D[i];
+            L1DropUser_DP[i]   <= L1OutUser_D[i]  ;
+            L1DropId_DP[i]     <= L1OutId_D[i]    ;
+            L1DropLen_DP[i]    <= L1OutLen_D[i]   ;
+            L1DropAddr_DP[i]   <= L1OutAddr_D[i]  ;
+         end
+      end // always_ff @ (posedge Clk_CI)
+
+      /*
+       * L2 input buffer
+       *
+       * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
+       */
+      always_ff @(posedge Clk_CI) begin : L2_IN_BUF
+         if (Rst_RBI == 0) begin
+            L2InRwType_DP[i] <= 1'b0;
+            L2InUser_DP[i]   <=  'b0;
+            L2InId_DP[i]     <=  'b0;
+            L2InLen_DP[i]    <=  'b0;
+            L2InAddr_DP[i]   <=  'b0;
+         end else if (L2InEn_S[i] == 1'b1) begin
+            L2InRwType_DP[i] <= L1OutRwType_D[i];
+            L2InUser_DP[i]   <= L1OutUser_D[i]  ;
+            L2InId_DP[i]     <= L1OutId_D[i]    ;
+            L2InLen_DP[i]    <= L1OutLen_D[i]   ;
+            L2InAddr_DP[i]   <= L1OutAddr_D[i]  ;
+         end
+      end // always_ff @ (posedge Clk_CI)
+
+      l2_tlb
+        #(
+          .AXI_S_ADDR_WIDTH       ( AXI_S_ADDR_WIDTH                                    ),
+          .AXI_M_ADDR_WIDTH       ( AXI_M_ADDR_WIDTH                                    ),
+          .AXI_LITE_DATA_WIDTH    ( AXI_LITE_DATA_WIDTH                                 ),
+          .AXI_LITE_ADDR_WIDTH    ( AXI_LITE_ADDR_WIDTH                                 ),
+          .N_SETS                 ( `RAB_L2_N_SETS                                      ),
+          .N_OFFSETS              ( `RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS       ),
+          .N_PAR_VA_RAMS          ( `RAB_L2_N_PAR_VA_RAMS                               ),
+          .HIT_OFFSET_STORE_WIDTH ( log2(`RAB_L2_N_SET_ENTRIES/2/`RAB_L2_N_PAR_VA_RAMS) )
+          )
+      u_l2_tlb
+        (
+          .clk_i              ( Clk_CI           ),
+          .rst_ni             ( Rst_RBI          ),
+
+          // Config inputs
+          .we_i               ( L2CfgWE_S[i]     ),
+          .waddr_i            ( L2CfgWAddr_D[i]  ),
+          .wdata_i            ( L2CfgWData_D[i]  ),
+
+          // Request input
+          .start_i            ( L2InEn_S[i]      ),
+          .busy_o             ( L2Busy_S[i]      ),
+          .rw_type_i          ( L2InRwType_DP[i] ),
+          .in_addr_i          ( L2InAddr_DP[i]   ),
+
+          // Response output
+          .out_ready_i        ( L2OutReady_S[i]  ),
+          .out_valid_o        ( L2OutValid_S[i]  ),
+          .hit_o              ( L2OutHit_SN[i]   ),
+          .miss_o             ( L2OutMiss_SN[i]  ),
+          .prot_o             ( L2OutProt_SN[i]  ),
+          .multi_o            ( L2OutMulti_SN[i] ),
+          .cache_coherent_o   ( L2OutCC_SN[i]    ),
+          .out_addr_o         ( L2OutAddr_DN[i]  )
+        );
+
+      /*
+       * L2 output buffer
+       *
+       * Make sure there are no combinational paths between L1 TLB/inputs and L2 TLB.
+       */
+      always_ff @(posedge Clk_CI) begin : L2_OUT_BUF
+         if (Rst_RBI == 0) begin
+            L2OutRwType_DP[i] <= 1'b0;
+            L2OutUser_DP[i]   <=  'b0;
+            L2OutLen_DP[i]    <=  'b0;
+            L2OutId_DP[i]     <=  'b0;
+            L2OutInAddr_DP[i] <=  'b0;
+
+            L2OutHit_SP[i]    <= 1'b0;
+            L2OutMiss_SP[i]   <= 1'b0;
+            L2OutProt_SP[i]   <= 1'b0;
+            L2OutMulti_SP[i]  <= 1'b0;
+            L2OutCC_SP[i]     <= 1'b0;
+            L2OutAddr_DP[i]   <=  'b0;
+         end else if (L2OutEn_S[i] == 1'b1) begin
+            L2OutRwType_DP[i] <= L2InRwType_DP[i];
+            L2OutUser_DP[i]   <= L2InUser_DP[i]  ;
+            L2OutLen_DP[i]    <= L2InLen_DP[i]   ;
+            L2OutId_DP[i]     <= L2InId_DP[i]    ;
+            L2OutInAddr_DP[i] <= L2InAddr_DP[i]  ;
+
+            L2OutHit_SP[i]    <= L2OutHit_SN[i]  ;
+            L2OutMiss_SP[i]   <= L2OutMiss_SN[i] ;
+            L2OutProt_SP[i]   <= L2OutProt_SN[i] ;
+            L2OutMulti_SP[i]  <= L2OutMulti_SN[i];
+            L2OutCC_SP[i]     <= L2OutCC_SN[i]   ;
+            L2OutAddr_DP[i]   <= L2OutAddr_DN[i] ;
+         end
+      end // always_ff @ (posedge Clk_CI)
+
+      always_ff @(posedge Clk_CI) begin : BUF_VALID
+        if (Rst_RBI == 0) begin
+          L1DropValid_SP[i] = 1'b0;
+          L2OutValid_SP[i]  = 1'b0;
+        end else begin
+          L1DropValid_SP[i] = L1DropValid_SN[i];
+          L2OutValid_SP[i]  = L2OutValid_SN[i];
+        end
+      end
+
+      always_comb begin : BUF_TO_PREFETCH
+        // L1 Drop Buf
+        if (L1DropUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
+          L1DropPrefetch_S[i] = 1'b1;
+        else
+          L1DropPrefetch_S[i] = 1'b0;
+
+        // L2 Out Buf
+        if (L2OutUser_DP[i] == {AXI_USER_WIDTH{1'b1}})
+          L2OutPrefetch_S[i]  = 1'b1;
+        else
+          L2OutPrefetch_S[i]  = 1'b0;
+      end
+
+      assign l2_cache_coherent[i] = L2OutCC_SP[i];
+      assign int_miss[i]          = L2Miss_S[i];
+
+    end else begin : L2_TLB_STUB // if (ENABLE_L2TLB[i] == 1)
+
+      assign l1_ar_drop[i]        = int_rtrans_drop[i];
+      assign l1_r_drop[i]         = int_rtrans_drop[i];
+      assign l1_xw_drop[i]        = int_wtrans_drop[i];
+
+      assign l1_ar_save[i]        = 1'b0;
+      assign l1_xw_save[i]        = 1'b0;
+      assign l2_xw_accept[i]      = 1'b0;
+      assign l2_xr_drop[i]        = 1'b0;
+      assign l2_xw_drop[i]        = 1'b0;
+
+      assign l2_ar_addr[i]        =  'b0;
+      assign l2_aw_addr[i]        =  'b0;
+
+      assign l1_id_drop[i]        = int_wtrans_drop[i] ? int_awid[i] :
+                                    int_rtrans_drop[i] ? int_arid[i] :
+                                    '0;
+      assign l1_len_drop[i]       = int_wtrans_drop[i] ? int_awlen[i] :
+                                    int_rtrans_drop[i] ? int_arlen[i] :
+                                    '0;
+      assign l1_prefetch_drop[i]  = rab_prefetch[i];
+      assign l1_hit_drop[i]       = ~rab_miss[i];
+
+      assign lx_id_drop[i]        = int_wtrans_drop[i] ? int_awid[i] :
+                                    int_rtrans_drop[i] ? int_arid[i] :
+                                    '0;
+      assign lx_len_drop[i]       = int_wtrans_drop[i] ? int_awlen[i] :
+                                    int_rtrans_drop[i] ? int_arlen[i] :
+                                    '0;
+      assign lx_prefetch_drop[i]  = rab_prefetch[i];
+      assign lx_hit_drop[i]       = ~rab_miss[i];
+
+      assign l2_cache_coherent[i] = 1'b0;
+
+      assign int_miss[i]          = rab_miss[i];
+      assign int_prot[i]          = rab_prot[i];
+      assign int_multi[i]         = rab_multi[i];
+
+      // unused signals
+      assign L2Miss_S[i]          = 1'b0;
+
+      assign L1OutRwType_D[i]     = 1'b0;
+      assign L1OutProt_D[i]       = 1'b0;
+      assign L1OutMulti_D[i]      = 1'b0;
+
+      assign L1DropRwType_DP[i]   = 1'b0;
+      assign L1DropUser_DP[i]     =  'b0;
+      assign L1DropId_DP[i]       =  'b0;
+      assign L1DropLen_DP[i]      =  'b0;
+      assign L1DropAddr_DP[i]     =  'b0;
+      assign L1DropProt_DP[i]     = 1'b0;
+      assign L1DropMulti_DP[i]    = 1'b0;
+
+      assign L1DropEn_S[i]        = 1'b0;
+      assign L1DropPrefetch_S[i]  = 1'b0;
+      assign L1DropValid_SN[i]    = 1'b0;
+      assign L1DropValid_SP[i]    = 1'b0;
+
+      assign L2InRwType_DP[i]     = 1'b0;
+      assign L2InUser_DP[i]       =  'b0;
+      assign L2InId_DP[i]         =  'b0;
+      assign L2InLen_DP[i]        =  'b0;
+      assign L2InAddr_DP[i]       =  'b0;
+
+      assign L2InEn_S[i]          = 1'b0;
+
+      assign L2OutHit_SN[i]       = 1'b0;
+      assign L2OutMiss_SN[i]      = 1'b0;
+      assign L2OutProt_SN[i]      = 1'b0;
+      assign L2OutMulti_SN[i]     = 1'b0;
+      assign L2OutCC_SN[i]        = 1'b0;
+      assign L2OutAddr_DN[i]      =  'b0;
+
+      assign L2OutRwType_DP[i]    = 1'b0;
+      assign L2OutUser_DP[i]      =  'b0;
+      assign L2OutId_DP[i]        =  'b0;
+      assign L2OutLen_DP[i]       =  'b0;
+      assign L2OutInAddr_DP[i]    =  'b0;
+      assign L2OutHit_SP[i]       = 1'b0;
+      assign L2OutMiss_SP[i]      = 1'b0;
+      assign L2OutProt_SP[i]      = 1'b0;
+      assign L2OutMulti_SP[i]     = 1'b0;
+      assign L2OutCC_SP[i]        = 1'b0;
+      assign L2OutAddr_DP[i]      =  'b0;
+
+      assign L2OutEn_S[i]         = 1'b0;
+      assign L2OutPrefetch_S[i]   = 1'b0;
+      assign L2Busy_S[i]          = 1'b0;
+      assign L2OutValid_S[i]      = 1'b0;
+      assign L2OutValid_SN[i]     = 1'b0;
+      assign L2OutValid_SP[i]     = 1'b0;
+      assign L2OutReady_S[i]      = 1'b0;
+
+    end // !`ifdef ENABLE_L2TLB
+  end // for (i = 0; i < N_PORTS; i++)
+  endgenerate
+
+// }}}
+"""
+# endmodule
+#
+#
+# // vim: ts=2 sw=2 sts=2 et nosmartindent autoindent foldmethod=marker
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/check_ram.py b/unused_please_ignore_completely/iommu/axi_rab/check_ram.py
new file mode 100644 (file)
index 0000000..31bf32e
--- /dev/null
@@ -0,0 +1,240 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class check_ram(Elaboratable):
+
+    def __init__(self):
+        self.clk_i = Signal()  # input
+        self.rst_ni = Signal()  # input
+        self.in_addr = Signal(ADDR_WIDTH)  # input
+        self.rw_type = Signal()  # input
+        self.ram_we = Signal()  # input
+        self.port0_addr = Signal(1+ERROR p_expression_25)  # input
+        self.port1_addr = Signal(1+ERROR p_expression_25)  # input
+        self.ram_wdata = Signal(RAM_DATA_WIDTH)  # input
+        self.output_sent = Signal()  # input
+        self.output_valid = Signal()  # input
+        self.offset_addr_d = Signal(OFFSET_WIDTH)  # input
+        self.hit_addr = Signal(1+ERROR p_expression_25)  # output
+        self.master = Signal()  # output
+        self.hit = Signal()  # output
+        self.multi_hit = Signal()  # output
+        self.prot = Signal()  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //import CfMath::log2;
+#
+# //`define MULTI_HIT_FULL_SET
+#
+# module check_ram
+#  //#(
+#  //  parameter ADDR_WIDTH     = 32,
+#   // parameter RAM_DATA_WIDTH = 32,
+#   // parameter PAGE_SIZE      = 4096, // 4kB
+#  //  parameter SET_WIDTH      = 5,
+#   // parameter OFFSET_WIDTH   = 4
+#   // )
+#  (
+#   input  logic                                clk_i,
+#   input  logic                                rst_ni,
+#   input  logic [ADDR_WIDTH-1:0]               in_addr,
+#   input  logic                                rw_type, // 1 => write, 0=> read
+#   input  logic                                ram_we,
+#   input  logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr,
+#   input  logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr,
+#   input  logic [RAM_DATA_WIDTH-1:0]           ram_wdata,
+#   input  logic                                output_sent,
+#   input  logic                                output_valid,
+#   input  logic [OFFSET_WIDTH-1:0]             offset_addr_d,
+#   output logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr,
+#   output logic                                master,
+#   output logic                                hit,
+#   output logic                                multi_hit,
+#   output logic                                prot
+#   );
+#
+"""   #docstring_begin
+
+   localparam IGNORE_LSB = log2(PAGE_SIZE); // 12
+
+   logic [RAM_DATA_WIDTH-1:0]           port0_data_o, port1_data_o; // RAM read data outputs
+   logic                                port0_hit, port1_hit; // Ram output matches in_addr
+
+    logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr_saved, port1_addr_saved;
+
+   // Hit FSM Signals
+   typedef enum                         logic {SEARCH, HIT} hit_state_t;
+   hit_state_t                          hit_SP; // Hit FSM state
+   hit_state_t                          hit_SN; // Hit FSM next state
+
+   // Multi Hit FSM signals
+`ifdef MULTI_HIT_FULL_SET
+   typedef enum                         logic[1:0] {NO_HITS, ONE_HIT, MULTI_HIT} multi_state_t;
+   multi_state_t                        multi_SP; // Multi Hit FSM state
+   multi_state_t                        multi_SN; // Multi Hit FSM next state
+
+   logic [SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr_saved;
+   logic                                master_saved;
+`endif
+
+  //// --------------- Block RAM (Dual Port) -------------- ////
+
+  // The outputs of the BRAMs are only valid if in the previous cycle:
+  // 1. the inputs were valid, and
+  // 2. the BRAM was not written to.
+  // Otherwise, the outputs must be ignored which is controlled by the output_valid signal.
+  // This signal is driven by the uppler level L2 TLB module.
+  ram_tp_no_change #(
+      .ADDR_WIDTH( SET_WIDTH+OFFSET_WIDTH+1 ),
+      .DATA_WIDTH( RAM_DATA_WIDTH           )
+    )
+    ram_tp_no_change_0
+    (
+      .clk   ( clk_i         ),
+      .we    ( ram_we        ),
+      .addr0 ( port0_addr    ),
+      .addr1 ( port1_addr    ),
+      .d_i   ( ram_wdata     ),
+      .d0_o  ( port0_data_o  ),
+      .d1_o  ( port1_data_o  )
+    );
+
+   //// Check Ram Outputs
+   assign port0_hit = (port0_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port0_data_o[RAM_DATA_WIDTH-1:4]);
+   assign port1_hit = (port1_data_o[0] == 1'b1) && (in_addr[ADDR_WIDTH-1: IGNORE_LSB] == port1_data_o[RAM_DATA_WIDTH-1:4]);
+   //// ----------------------------------------------------- /////
+
+   //// ------------------- Check if Hit ------------------------ ////
+   // FSM
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         hit_SP <= SEARCH;
+      end else begin
+         hit_SP <= hit_SN;
+      end
+   end
+
+   always_ff @(posedge clk_i, negedge rst_ni) begin
+       if (!rst_ni) begin
+           port0_addr_saved <= '0;
+           port1_addr_saved <= '0;
+       end else begin
+           port0_addr_saved <= port0_addr;
+           port1_addr_saved <= port1_addr;
+       end
+   end
+
+   always_comb begin
+      hit_SN   = hit_SP;
+      hit      = 1'b0;
+      hit_addr = 0;
+      master   = 1'b0;
+      unique case(hit_SP)
+        SEARCH :
+          if (output_valid)
+            if (port0_hit || port1_hit) begin
+               hit_SN   = HIT;
+               hit      = 1'b1;
+               hit_addr = port0_hit ? {port0_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
+                          port1_hit ? {port1_addr_saved[SET_WIDTH+OFFSET_WIDTH:OFFSET_WIDTH], offset_addr_d} :
+                          0;
+               master   = port0_hit ? port0_data_o[3] :
+                          port1_hit ? port1_data_o[3] :
+                          1'b0;
+            end
+
+        HIT : begin
+`ifdef MULTI_HIT_FULL_SET // Since the search continues after the first hit, it needs to be saved to be accessed later.
+           hit      = 1'b1;
+           hit_addr = hit_addr_saved;
+           master   = master_saved;
+`endif
+           if (output_sent)
+             hit_SN = SEARCH;
+        end
+
+        default : begin
+           hit_SN = SEARCH;
+        end
+      endcase // case (hit_SP)
+   end // always_comb begin
+
+   //// ------------------------------------------- ////
+
+   assign prot = output_valid && port0_hit ? ((~port0_data_o[2] && rw_type) || (~port0_data_o[1] && ~rw_type)) :
+                 output_valid && port1_hit ? ((~port1_data_o[2] && rw_type) || (~port1_data_o[1] && ~rw_type)) :
+                 1'b0;
+
+   //// ------------------- Multi ------------------- ////
+`ifdef MULTI_HIT_FULL_SET
+
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         hit_addr_saved <= 0;
+         master_saved   <= 1'b0;
+      end else if (output_valid) begin
+         hit_addr_saved <= hit_addr;
+         master_saved   <= master;
+      end
+   end
+
+   // FSM
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         multi_SP <= NO_HITS;
+      end else begin
+         multi_SP <= multi_SN;
+      end
+   end
+
+   always_comb begin
+      multi_SN  = multi_SP;
+      multi_hit = 1'b0;
+      unique case(multi_SP)
+        NO_HITS :
+          if(output_valid && (port0_hit && port1_hit)) begin
+             multi_SN  = MULTI_HIT;
+             multi_hit = 1'b1;
+          end else if(output_valid && (port0_hit || port1_hit))
+            multi_SN = ONE_HIT;
+
+        ONE_HIT :
+          if(output_valid && (port0_hit || port1_hit)) begin
+             multi_SN  = MULTI_HIT;
+             multi_hit = 1'b1;
+          end else if (output_sent)
+            multi_SN = NO_HITS;
+
+        MULTI_HIT : begin
+          multi_hit = 1'b1;
+           if (output_sent)
+             multi_SN = NO_HITS;
+        end
+
+      endcase // case (multi_SP)
+   end // always_comb begin
+
+`else // !`ifdef MULTI_HIT_FULL_SET
+   assign multi_hit = output_valid && port0_hit && port1_hit;
+`endif // !`ifdef MULTI_HIT_FULL_SET
+   //// ------------------------------------------- ////
+"""
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/coreconfig.py b/unused_please_ignore_completely/iommu/axi_rab/coreconfig.py
new file mode 100644 (file)
index 0000000..247d0ce
--- /dev/null
@@ -0,0 +1,6 @@
+class CoreConfig:
+    def __init__(self):
+        self.N_SLICES = 16
+        self.N_REGS = 4*self.N_SLICES
+        self.ADDR_WIDTH_PHYS = 40
+        self.ADDR_WIDTH_VIRT = 32
diff --git a/unused_please_ignore_completely/iommu/axi_rab/fsm.py b/unused_please_ignore_completely/iommu/axi_rab/fsm.py
new file mode 100644 (file)
index 0000000..d64b1cb
--- /dev/null
@@ -0,0 +1,243 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class fsm(Elaboratable):
+
+    def __init__(self):
+        self.Clk_CI = Signal()  # input
+        self.Rst_RBI = Signal()  # input
+        self.port1_addr_valid_i = Signal()  # input
+        self.port2_addr_valid_i = Signal()  # input
+        self.port1_sent_i = Signal()  # input
+        self.port2_sent_i = Signal()  # input
+        self.select_i = Signal()  # input
+        self.no_hit_i = Signal()  # input
+        self.multi_hit_i = Signal()  # input
+        self.no_prot_i = Signal()  # input
+        self.prefetch_i = Signal()  # input
+        self.out_addr_i = Signal(AXI_M_ADDR_WIDTH)  # input
+        self.cache_coherent_i = Signal()  # input
+        self.port1_accept_o = Signal()  # output
+        self.port1_drop_o = Signal()  # output
+        self.port1_miss_o = Signal()  # output
+        self.port2_accept_o = Signal()  # output
+        self.port2_drop_o = Signal()  # output
+        self.port2_miss_o = Signal()  # output
+        self.out_addr_o = Signal(AXI_M_ADDR_WIDTH)  # output
+        self.cache_coherent_o = Signal()  # output
+        self.miss_o = Signal()  # output
+        self.multi_o = Signal()  # output
+        self.prot_o = Signal()  # output
+        self.prefetch_o = Signal()  # output
+        self.in_addr_i = Signal(AXI_S_ADDR_WIDTH)  # input
+        self.in_id_i = Signal(AXI_ID_WIDTH)  # input
+        self.in_len_i = Signal(8)  # input
+        self.in_user_i = Signal(AXI_USER_WIDTH)  # input
+        self.in_addr_o = Signal(AXI_S_ADDR_WIDTH)  # output
+        self.in_id_o = Signal(AXI_ID_WIDTH)  # output
+        self.in_len_o = Signal(8)  # output
+        self.in_user_o = Signal(AXI_USER_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //`timescale 1ns / 1ps
+#
+# module fsm
+#  #(
+#    parameter AXI_M_ADDR_WIDTH = 40,
+#    parameter AXI_S_ADDR_WIDTH = 32,
+#    parameter AXI_ID_WIDTH     = 8,
+#    parameter AXI_USER_WIDTH   = 6
+#  )
+#  (
+#    input  logic                        Clk_CI,
+#    input  logic                        Rst_RBI,
+#
+#    input  logic                        port1_addr_valid_i,
+#    input  logic                        port2_addr_valid_i,
+#    input  logic                        port1_sent_i,
+#    input  logic                        port2_sent_i,
+#    input  logic                        select_i,
+#    input  logic                        no_hit_i,
+#    input  logic                        multi_hit_i,
+#    input  logic                        no_prot_i,
+#    input  logic                        prefetch_i,
+#    input  logic [AXI_M_ADDR_WIDTH-1:0] out_addr_i,
+#    input  logic                        cache_coherent_i,
+#    output logic                        port1_accept_o,
+#    output logic                        port1_drop_o,
+#    output logic                        port1_miss_o,
+#    output logic                        port2_accept_o,
+#    output logic                        port2_drop_o,
+#    output logic                        port2_miss_o,
+#    output logic [AXI_M_ADDR_WIDTH-1:0] out_addr_o,
+#    output logic                        cache_coherent_o,
+#    output logic                        miss_o,
+#    output logic                        multi_o,
+#    output logic                        prot_o,
+#    output logic                        prefetch_o,
+#    input  logic [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
+#    input  logic     [AXI_ID_WIDTH-1:0] in_id_i,
+#    input  logic                  [7:0] in_len_i,
+#    input  logic   [AXI_USER_WIDTH-1:0] in_user_i,
+#    output logic [AXI_S_ADDR_WIDTH-1:0] in_addr_o,
+#    output logic     [AXI_ID_WIDTH-1:0] in_id_o,
+#    output logic                  [7:0] in_len_o,
+#    output logic   [AXI_USER_WIDTH-1:0] in_user_o
+#  );
+#
+"""  #docstring_begin
+
+  //-------------Internal Signals----------------------
+
+  typedef enum logic           {IDLE, WAIT} state_t;
+  logic                        state_SP; // Present state
+  logic                        state_SN; // Next State
+
+  logic                        port1_accept_SN;
+  logic                        port1_drop_SN;
+  logic                        port1_miss_SN;
+  logic                        port2_accept_SN;
+  logic                        port2_drop_SN;
+  logic                        port2_miss_SN;
+  logic                        miss_SN;
+  logic                        multi_SN;
+  logic                        prot_SN;
+  logic                        prefetch_SN;
+  logic                        cache_coherent_SN;
+  logic [AXI_M_ADDR_WIDTH-1:0] out_addr_DN;
+
+  logic                        out_reg_en_S;
+
+  //----------FSM comb------------------------------
+
+  always_comb begin: FSM_COMBO
+    state_SN          = state_SP;
+
+    port1_accept_SN   = 1'b0;
+    port1_drop_SN     = 1'b0;
+    port1_miss_SN     = 1'b0;
+    port2_accept_SN   = 1'b0;
+    port2_drop_SN     = 1'b0;
+    port2_miss_SN     = 1'b0;
+    miss_SN           = 1'b0;
+    multi_SN          = 1'b0;
+    prot_SN           = 1'b0;
+    prefetch_SN       = 1'b0;
+    cache_coherent_SN = 1'b0;
+    out_addr_DN       =   '0;
+
+    out_reg_en_S      = 1'b0; // by default hold register output
+
+    unique case(state_SP)
+        IDLE :
+          if ( (port1_addr_valid_i & select_i) | (port2_addr_valid_i & ~select_i) ) begin
+            out_reg_en_S = 1'b1;
+            state_SN     = WAIT;
+
+            // Select inputs for output registers
+            if          (port1_addr_valid_i & select_i) begin
+              port1_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+              port1_drop_SN   =  (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+              port1_miss_SN   =   no_hit_i;
+              port2_accept_SN = 1'b0;
+              port2_drop_SN   = 1'b0;
+              port2_miss_SN   = 1'b0;
+            end else if (port2_addr_valid_i & ~select_i) begin
+              port1_accept_SN = 1'b0;
+              port1_drop_SN   = 1'b0;
+              port1_miss_SN   = 1'b0;
+              port2_accept_SN = ~(no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+              port2_drop_SN   =  (no_hit_i | multi_hit_i | ~no_prot_i | prefetch_i);
+              port2_miss_SN   =   no_hit_i;
+            end
+
+            miss_SN           = port1_miss_SN | port2_miss_SN;
+            multi_SN          = multi_hit_i;
+            prot_SN           = ~no_prot_i;
+            prefetch_SN       = ~no_hit_i & prefetch_i;
+
+            cache_coherent_SN = cache_coherent_i;
+            out_addr_DN       = out_addr_i;
+          end
+
+        WAIT :
+          if ( port1_sent_i | port2_sent_i ) begin
+            out_reg_en_S = 1'b1; // "clear" the register
+            state_SN     = IDLE;
+          end
+
+        default : begin
+           state_SN      = IDLE;
+        end
+      endcase
+    end
+
+  //----------FSM seq-------------------------------
+
+  always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: FSM_SEQ
+    if (Rst_RBI == 1'b0)
+      state_SP <= IDLE;
+    else
+      state_SP <= state_SN;
+  end
+
+  //----------Output seq--------------------------
+
+  always_ff @(posedge Clk_CI, negedge Rst_RBI) begin: OUTPUT_SEQ
+    if (Rst_RBI == 1'b0) begin
+      port1_accept_o   = 1'b0;
+      port1_drop_o     = 1'b0;
+      port1_miss_o     = 1'b0;
+      port2_accept_o   = 1'b0;
+      port2_drop_o     = 1'b0;
+      port2_miss_o     = 1'b0;
+      miss_o           = 1'b0;
+      multi_o          = 1'b0;
+      prot_o           = 1'b0;
+      prefetch_o       = 1'b0;
+      cache_coherent_o = 1'b0;
+      out_addr_o       =   '0;
+      in_addr_o        =   '0;
+      in_id_o          =   '0;
+      in_len_o         =   '0;
+      in_user_o        =   '0;
+    end else if (out_reg_en_S == 1'b1) begin
+      port1_accept_o   = port1_accept_SN;
+      port1_drop_o     = port1_drop_SN;
+      port1_miss_o     = port1_miss_SN;
+      port2_accept_o   = port2_accept_SN;
+      port2_drop_o     = port2_drop_SN;
+      port2_miss_o     = port2_miss_SN;
+      miss_o           = miss_SN;
+      multi_o          = multi_SN;
+      prot_o           = prot_SN;
+      prefetch_o       = prefetch_SN;
+      cache_coherent_o = cache_coherent_SN;
+      out_addr_o       = out_addr_DN;
+      in_addr_o        = in_addr_i;
+      in_id_o          = in_id_i;
+      in_len_o         = in_len_i;
+      in_user_o        = in_user_i;
+    end
+  end // block: OUTPUT_SEQ
+"""
+#
+# endmodule
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/l2_tlb.py b/unused_please_ignore_completely/iommu/axi_rab/l2_tlb.py
new file mode 100644 (file)
index 0000000..11983f6
--- /dev/null
@@ -0,0 +1,550 @@
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class l2_tlb(Elaboratable):
+
+    def __init__(self):
+        self.clk_i = Signal()  # input
+        self.rst_ni = Signal()  # input
+        self.we_i = Signal()  # input
+        self.waddr_i = Signal(AXI_LITE_ADDR_WIDTH)  # input
+        self.wdata_i = Signal(AXI_LITE_DATA_WIDTH)  # input
+        self.start_i = Signal()  # input
+        self.busy_o = Signal()  # output
+        self.in_addr_i = Signal(AXI_S_ADDR_WIDTH)  # input
+        self.rw_type_i = Signal()  # input
+        self.out_ready_i = Signal()  # input
+        self.out_valid_o = Signal()  # output
+        self.hit_o = Signal()  # output
+        self.miss_o = Signal()  # output
+        self.prot_o = Signal()  # output
+        self.multi_o = Signal()  # output
+        self.cache_coherent_o = Signal()  # output
+        self.out_addr_o = Signal(AXI_M_ADDR_WIDTH)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        return m
+
+
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# //`include "pulp_soc_defines.sv"
+#
+# ////import CfMath::log2;
+#
+# //`define MULTI_HIT_FULL_SET  // Enable full multi hit detection. Always the entire set is searched.
+# //`define MULTI_HIT_CUR_CYCLE // Enable partial multi hit detection. Only multi hits in the same search cycle are detected.
+#
+# //`ifdef MULTI_HIT_FULL_SET
+# //  `ifndef MULTI_HIT_CUR_CYCLE
+# //    `define MULTI_HIT_CUR_CYCLE
+# //  `endif
+# //`endif
+#
+# module l2_tlb
+#  //#(
+#  //  parameter AXI_S_ADDR_WIDTH       = 32,
+#   // parameter AXI_M_ADDR_WIDTH       = 40,
+#  //  parameter AXI_LITE_DATA_WIDTH    = 64,
+#   // parameter AXI_LITE_ADDR_WIDTH    = 32,
+#   // parameter N_SETS                 = 32,
+#   // parameter N_OFFSETS              = 4, //per port. There are 2 ports.
+#  //  parameter PAGE_SIZE              = 4096, // 4kB
+#  //  parameter N_PAR_VA_RAMS          = 4,
+#  //  parameter HIT_OFFSET_STORE_WIDTH = 2 // Num of bits of VA RAM offset stored. This should not be greater than OFFSET_WIDTH
+#  //  )
+#   (
+#    input  logic                           clk_i,
+#    input  logic                           rst_ni,
+#
+#    input  logic                           we_i,
+#    input  logic [AXI_LITE_ADDR_WIDTH-1:0] waddr_i,
+#    input  logic [AXI_LITE_DATA_WIDTH-1:0] wdata_i,
+#
+#    input  logic                           start_i,
+#    output logic                           busy_o,
+#    input  logic    [AXI_S_ADDR_WIDTH-1:0] in_addr_i,
+#    input  logic                           rw_type_i, //1 => write, 0=> read
+#
+#    input  logic                           out_ready_i,
+#    output logic                           out_valid_o,
+#    output logic                           hit_o,
+#    output logic                           miss_o,
+#    output logic                           prot_o,
+#    output logic                           multi_o,
+#    output logic                           cache_coherent_o,
+#    output logic    [AXI_M_ADDR_WIDTH-1:0] out_addr_o
+#    );
+#
+"""    #docstring_begin
+
+   localparam VA_RAM_DEPTH      = N_SETS * N_OFFSETS * 2;
+   localparam PA_RAM_DEPTH      = VA_RAM_DEPTH * N_PAR_VA_RAMS;
+   localparam VA_RAM_ADDR_WIDTH = log2(VA_RAM_DEPTH);
+   localparam PA_RAM_ADDR_WIDTH = log2(PA_RAM_DEPTH);
+   localparam SET_WIDTH         = log2(N_SETS);
+   localparam OFFSET_WIDTH      = log2(N_OFFSETS);
+   localparam LL_WIDTH          = log2(N_PAR_VA_RAMS);
+   localparam IGNORE_LSB        = log2(PAGE_SIZE);
+
+   localparam VA_RAM_DATA_WIDTH = AXI_S_ADDR_WIDTH - IGNORE_LSB + 4;
+   localparam PA_RAM_DATA_WIDTH = AXI_M_ADDR_WIDTH - IGNORE_LSB;
+
+   logic                               [N_PAR_VA_RAMS-1:0] hit, prot, multi_hit, cache_coherent;
+   logic                               [N_PAR_VA_RAMS-1:0] ram_we;
+   logic                                                   last_search, last_search_next;
+   logic                                                   first_search, first_search_next;
+   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] ram_waddr;
+   logic [N_PAR_VA_RAMS-1:0][SET_WIDTH+OFFSET_WIDTH+1-1:0] hit_addr;
+   logic                                                   pa_ram_we;
+   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr, pa_port0_waddr; // PA RAM read, Write addr;
+   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_raddr_reg_SN, pa_port0_raddr_reg_SP; // registered addresses, needed for WAIT_ON_WRITE;
+   logic                           [PA_RAM_ADDR_WIDTH-1:0] pa_port0_addr; // PA RAM addr
+   logic                           [PA_RAM_DATA_WIDTH-1:0] pa_port0_data, pa_data, pa_port0_data_reg; // PA RAM data
+   logic                                                   pa_ram_store_data_SN, pa_ram_store_data_SP;
+   logic                                                   hit_top, prot_top, multi_hit_top, first_hit_top;
+   logic                                                   output_sent;
+   int                                                     hit_block_num;
+
+   logic                                                   searching, search_done;
+   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] port0_addr, port0_raddr; // VA RAM port0 addr
+   logic                    [SET_WIDTH+OFFSET_WIDTH+1-1:0] port1_addr; // VA RAM port1 addr
+   logic                                [OFFSET_WIDTH-1:0] offset_addr, offset_addr_d;
+   logic                                [OFFSET_WIDTH-1:0] offset_start_addr, offset_end_addr;
+   logic                                   [SET_WIDTH-1:0] set_num;
+
+   logic                                                   va_output_valid;
+   logic                                                   searching_q;
+
+   genvar                                                  z;
+
+   // Search FSM
+   typedef enum logic                                [1:0] {IDLE, SEARCH, DONE} search_state_t;
+   search_state_t                                          search_SP; // Present state
+   search_state_t                                          search_SN; // Next State
+
+   // Output FSM
+   typedef enum logic                                [1:0] {OUT_IDLE, SEND_OUTPUT, WAIT_ON_WRITE} out_state_t;
+   out_state_t                                             out_SP; // Present state
+   out_state_t                                             out_SN; // Next State
+
+   logic                                                   miss_next;
+   logic                                                   hit_next;
+   logic                                                   prot_next;
+   logic                                                   multi_next;
+   logic                                                   cache_coherent_next;
+
+   // Generate the VA Block rams and their surrounding logic
+   generate
+      for (z = 0; z < N_PAR_VA_RAMS; z++) begin : VA_RAMS
+         check_ram
+           #(
+             .ADDR_WIDTH     ( AXI_S_ADDR_WIDTH  ),
+             .RAM_DATA_WIDTH ( VA_RAM_DATA_WIDTH ),
+             .PAGE_SIZE      ( PAGE_SIZE         ),
+             .SET_WIDTH      ( SET_WIDTH         ),
+             .OFFSET_WIDTH   ( OFFSET_WIDTH      )
+             )
+         u_check_ram
+             (
+              .clk_i         ( clk_i                          ),
+              .rst_ni        ( rst_ni                         ),
+              .in_addr       ( in_addr_i                      ),
+              .rw_type       ( rw_type_i                      ),
+              .ram_we        ( ram_we[z]                      ),
+              .port0_addr    ( port0_addr                     ),
+              .port1_addr    ( port1_addr                     ),
+              .ram_wdata     ( wdata_i[VA_RAM_DATA_WIDTH-1:0] ),
+              .output_sent   ( output_sent                    ),
+              .output_valid  ( va_output_valid                ),
+              .offset_addr_d ( offset_addr_d                  ),
+              .hit_addr      ( hit_addr[z]                    ),
+              .master        ( cache_coherent[z]              ),
+              .hit           ( hit[z]                         ),
+              .multi_hit     ( multi_hit[z]                   ),
+              .prot          ( prot[z]                        )
+              );
+      end // for (z = 0; z < N_PORTS; z++)
+   endgenerate
+
+   ////////////////// ---------------- Control and Address --------------- ////////////////////////
+   // FSM
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         search_SP <= IDLE;
+      end else begin
+         search_SP <= search_SN;
+      end
+   end
+
+   always_comb begin : SEARCH_FSM
+      search_SN         = search_SP;
+      busy_o            = 1'b0;
+      searching         = 1'b0;
+      search_done       = 1'b0;
+      last_search_next  = 1'b0;
+      first_search_next = first_search;
+
+      unique case (search_SP)
+        IDLE : begin
+          if (start_i) begin
+            search_SN         = SEARCH;
+            first_search_next = 1'b1;
+          end
+        end
+
+        SEARCH : begin
+          busy_o = 1'b1;
+
+          // detect last search cycle
+          if ( (first_search == 1'b0) && (offset_addr == offset_end_addr) )
+             last_search_next  = 1'b1;
+
+          // pause search during VA RAM reconfigration
+          if (|ram_we) begin
+             searching         = 1'b0;
+          end else begin
+             searching         = 1'b1;
+             first_search_next = 1'b0;
+          end
+
+          if (va_output_valid) begin
+            // stop search
+`ifdef MULTI_HIT_FULL_SET
+            if (last_search | prot_top | multi_hit_top) begin
+`else
+            if (last_search | prot_top | multi_hit_top | hit_top ) begin
+`endif
+              search_SN      = DONE;
+              search_done    = 1'b1;
+            end
+          end
+        end
+
+        DONE : begin
+          busy_o = 1'b1;
+          if (out_valid_o & out_ready_i)
+            search_SN = IDLE;
+        end
+
+        default : begin
+          search_SN = IDLE;
+        end
+      endcase // case (prot_SP)
+   end // always_comb begin
+
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         last_search  <= 1'b0;
+         first_search <= 1'b0;
+      end else begin
+         last_search  <= last_search_next;
+         first_search <= first_search_next;
+      end
+   end
+
+   /*
+    * VA RAM address generation
+    *
+    * The input address and set number, and thus the offset start address, are available in the
+    * cycle after the start signal. The buffered offset_addr becomes available one cycle later.
+    * During the first search cycle, we therefore directly use offset_addr_start for the lookup.
+    */
+   assign set_num = in_addr_i[SET_WIDTH+IGNORE_LSB -1 : IGNORE_LSB];
+
+   assign port0_raddr[OFFSET_WIDTH] = 1'b0;
+   assign port1_addr [OFFSET_WIDTH] = 1'b1;
+
+   assign port0_raddr[OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
+   assign port1_addr [OFFSET_WIDTH-1:0] = first_search ? offset_start_addr : offset_addr;
+
+   assign port0_raddr[SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
+   assign port1_addr [SET_WIDTH+OFFSET_WIDTH : OFFSET_WIDTH+1] = set_num;
+
+   assign port0_addr = ram_we ? ram_waddr : port0_raddr;
+
+   // The outputs of the BRAMs are only valid if in the previous cycle:
+   // 1. the inputs were valid, and
+   // 2. the BRAMs were not written to.
+   // Otherwise, the outputs must be ignored.
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         searching_q <= 1'b0;
+      end else begin
+         searching_q <= searching;
+      end
+   end
+   assign va_output_valid = searching_q;
+
+   // Address offset for looking up the VA RAMs
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         offset_addr   <= 0;
+      end else if (first_search) begin
+         offset_addr <= offset_start_addr + 1'b1;
+      end else if (searching) begin
+         offset_addr <= offset_addr + 1'b1;
+      end
+   end
+
+   // Delayed address offest for looking up the PA RAM upon a hit in the VA RAMs
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         offset_addr_d <= 0;
+      end else if (first_search) begin
+         offset_addr_d <= offset_start_addr;
+      end else if (searching) begin
+         offset_addr_d <= offset_addr_d + 1'b1;
+      end
+   end
+
+   // Store the offset addr for hit to reduce latency for next search.
+   generate
+      if (HIT_OFFSET_STORE_WIDTH > 0) begin : OFFSET_STORE
+`ifndef MULTI_HIT_FULL_SET
+         logic [N_SETS-1:0][HIT_OFFSET_STORE_WIDTH-1:0] hit_offset_addr; // Contains offset addr for previous hit for every SET.
+         logic [SET_WIDTH+OFFSET_WIDTH+1-1:0]           hit_addr_reg;
+
+         assign offset_start_addr = { hit_offset_addr[set_num] , {{OFFSET_WIDTH-HIT_OFFSET_STORE_WIDTH}{1'b0}} };
+         assign offset_end_addr   =   hit_offset_addr[set_num]-1'b1;
+
+         // Register the hit addr
+         always_ff @(posedge clk_i) begin
+            if (rst_ni == 0) begin
+               hit_addr_reg <= 0;
+            end else if (hit_top) begin
+               hit_addr_reg <= hit_addr[hit_block_num];
+            end
+         end
+
+         // Store hit addr for each set. The next search in the same set will start from the saved addr.
+         always_ff @(posedge clk_i) begin
+            if (rst_ni == 0) begin
+               hit_offset_addr <= 0;
+            end else if (hit_o) begin
+               hit_offset_addr[set_num][HIT_OFFSET_STORE_WIDTH-1:0] <= hit_addr_reg[OFFSET_WIDTH-1 : (OFFSET_WIDTH - HIT_OFFSET_STORE_WIDTH)];
+            end
+         end
+`else // No need to store offset if full multi hit detection is enabled because the entire SET is searched.
+         assign offset_start_addr = 0;
+         assign offset_end_addr   = {OFFSET_WIDTH{1'b1}};
+`endif
+      end else begin // if (HIT_OFFSET_STORE_WIDTH > 0)
+         assign offset_start_addr = 0;
+         assign offset_end_addr   = {OFFSET_WIDTH{1'b1}};
+      end
+   endgenerate
+
+   assign prot_top = |prot;
+
+   //////////////////////////////////////////////////////////////////////////////////////
+   // check for hit, multi hit
+   // In case of a multi hit, the hit_block_num indicates the lowest VA RAM with a hit.
+   // In case of a multi hit in the same VA RAM, Port 0 is given priority.
+   always_comb begin : HIT_CHECK
+      hit_top       = |hit;
+      hit_block_num = 0;
+      first_hit_top = 1'b0;
+      multi_hit_top = 1'b0;
+      for (int i=N_PAR_VA_RAMS-1; i>=0; i--) begin
+        if (hit[i] == 1'b1) begin
+`ifdef MULTI_HIT_CUR_CYCLE
+          if (multi_hit[i] | first_hit_top ) begin
+            multi_hit_top = 1'b1;
+          end
+`endif
+          first_hit_top = 1'b1;
+          hit_block_num = i;
+        end
+      end // for (int i=0; i<N_PAR_VA_RAMS; i++)
+   end // always_comb begin
+
+   ///////////////////// ------------- Outputs ------------ //////////////////////////////////
+   //// FSM
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         out_SP                     <= OUT_IDLE;
+         pa_ram_store_data_SP       <= 1'b0;
+         pa_port0_raddr_reg_SP      <=  'b0;
+      end else begin
+         out_SP                     <= out_SN;
+         pa_ram_store_data_SP       <= pa_ram_store_data_SN;
+         pa_port0_raddr_reg_SP      <= pa_port0_raddr_reg_SN;
+      end
+   end
+
+   always_comb begin : OUTPUT_FSM
+      out_SN                   = out_SP;
+
+      miss_next                = miss_o;
+      prot_next                = prot_o;
+      multi_next               = multi_o;
+      hit_next                 = hit_o;
+      cache_coherent_next      = cache_coherent_o;
+      pa_port0_raddr_reg_SN    = pa_port0_raddr_reg_SP;
+
+      pa_port0_raddr           =  'b0;
+      pa_ram_store_data_SN     = 1'b0;
+
+      out_valid_o              = 1'b0;
+      output_sent              = 1'b0;
+
+      unique case (out_SP)
+        OUT_IDLE : begin
+           hit_next            = 1'b0;
+           miss_next           = 1'b0;
+           prot_next           = 1'b0;
+           multi_next          = 1'b0;
+           cache_coherent_next = 1'b0;
+
+          // abort transaction
+          if         ((search_done & ~hit_top) | prot_top | multi_hit_top) begin
+             out_SN = SEND_OUTPUT;
+
+             if (search_done & ~hit_top) begin
+                miss_next  = 1'b1;
+             end
+             if (prot_top) begin
+                prot_next  = 1'b1;
+                hit_next   = 1'b1;
+             end
+             if (multi_hit_top) begin
+                multi_next = 1'b1;
+                hit_next   = 1'b1;
+             end
+
+          // read PA RAM
+          end else if (search_done & hit_top) begin
+             hit_next              = 1'b1;
+             cache_coherent_next   = cache_coherent[hit_block_num];
+             pa_port0_raddr        = (N_PAR_VA_RAMS * hit_addr[hit_block_num]) + hit_block_num;
+             pa_port0_raddr_reg_SN = pa_port0_raddr;
+
+             // read PA RAM now
+             if (~pa_ram_we) begin
+                out_SN               = SEND_OUTPUT;
+                pa_ram_store_data_SN = 1'b1;
+
+             // read PA RAM after PA RAM reconfiguration
+             end else begin // pa_ram_we
+                out_SN               = WAIT_ON_WRITE;
+
+             end
+          end
+        end
+
+        WAIT_ON_WRITE : begin
+          if ( ~pa_ram_we ) begin
+             out_SN               = SEND_OUTPUT;
+             pa_port0_raddr       = pa_port0_raddr_reg_SP;
+             pa_ram_store_data_SN = 1'b1;
+          end
+        end
+
+        SEND_OUTPUT : begin
+           out_valid_o  = 1'b1;
+           if (out_ready_i) begin
+              out_SN      = OUT_IDLE;
+              output_sent = 1'b1;
+           end
+        end
+
+        default : begin
+           out_SN = OUT_IDLE;
+        end
+
+      endcase // case (out_SP)
+   end // always_comb begin
+
+   //// Output signals
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         miss_o           <= 1'b0;
+         prot_o           <= 1'b0;
+         multi_o          <= 1'b0;
+         hit_o            <= 1'b0;
+         cache_coherent_o <= 1'b0;
+      end else begin
+         miss_o           <= miss_next;
+         prot_o           <= prot_next;
+         multi_o          <= multi_next;
+         hit_o            <= hit_next;
+         cache_coherent_o <= cache_coherent_next;
+      end
+   end
+
+   ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+  ///////////////////// --------------- Physical Address -------------- ////////////////////////////
+
+  /// PA Block RAM
+  ram_tp_no_change #(
+        .ADDR_WIDTH( PA_RAM_ADDR_WIDTH ),
+        .DATA_WIDTH( PA_RAM_DATA_WIDTH )
+        )
+  pa_ram
+    (
+      .clk   ( clk_i                          ),
+      .we    ( pa_ram_we                      ),
+      .addr0 ( pa_port0_addr                  ),
+      .addr1 ( '0                             ),
+      .d_i   ( wdata_i[PA_RAM_DATA_WIDTH-1:0] ),
+      .d0_o  ( pa_port0_data                  ),
+      .d1_o  (                                )
+    );
+
+   assign out_addr_o[IGNORE_LSB-1:0]                = in_addr_i[IGNORE_LSB-1:0];
+   assign out_addr_o[AXI_M_ADDR_WIDTH-1:IGNORE_LSB] = pa_data;
+
+   always_ff @(posedge clk_i) begin
+      if (rst_ni == 0) begin
+         pa_port0_data_reg <= 0;
+      end else if (pa_ram_store_data_SP) begin
+         pa_port0_data_reg <= pa_port0_data;
+      end
+   end
+
+   assign pa_data = pa_ram_store_data_SP ? pa_port0_data : pa_port0_data_reg;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+///// Write enable for all block rams
+generate if (LL_WIDTH != 0) begin
+   always_comb begin
+      var reg[LL_WIDTH:0] para;
+      var int             para_int;
+      for (para = 0; para < N_PAR_VA_RAMS; para=para+1'b1) begin
+        para_int         = int'(para);
+        ram_we[para_int] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0) && (waddr_i[LL_WIDTH-1:0] == para);
+      end
+   end
+end else begin
+   assign ram_we[0] = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b0);
+end
+
+endgenerate
+
+// Addresses are word, not byte addresses
+assign pa_ram_we      = we_i && (waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] == 1'b1); //waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH] will be 0 for all VA writes and 1 for all PA writes
+assign ram_waddr      = waddr_i[LL_WIDTH+VA_RAM_ADDR_WIDTH-1:LL_WIDTH];
+assign pa_port0_waddr = waddr_i[PA_RAM_ADDR_WIDTH-1:0];
+assign pa_port0_addr  = pa_ram_we ? pa_port0_waddr : pa_port0_raddr;
+
+"""
+# endmodule
+#
+# // vim: ts=3 sw=3 sts=3 et nosmartindent autoindent foldmethod=marker tw=100
+#
+#
diff --git a/unused_please_ignore_completely/iommu/axi_rab/rab_core.py b/unused_please_ignore_completely/iommu/axi_rab/rab_core.py
new file mode 100644 (file)
index 0000000..7d7494a
--- /dev/null
@@ -0,0 +1,539 @@
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+
+# this file has been generated by sv2nmigen
+
+#
+# //`include "pulp_soc_defines.sv"
+#
+# ////import CfMath::log2;
+#
+# //`define MY_ARRAY_SUM(MY_ARRAY,ARRAY_SIZE) ( (ARRAY_SIZE==1) ? MY_ARRAY[0] : (ARRAY_SIZE==2) ? MY_ARRAY[0] + MY_ARRAY[1] : (ARRAY_SIZE==3) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] : (ARRAY_SIZE==4) ? MY_ARRAY[0] + MY_ARRAY[1] + MY_ARRAY[2] + MY_ARRAY[3] : 0 )
+#
+
+# module rab_core
+#  #(
+#    parameter N_PORTS             =  3,
+#    parameter N_L2_SETS           = 32,
+#    parameter N_L2_SET_ENTRIES    = 32,
+#    parameter AXI_DATA_WIDTH      = 64,
+#    parameter AXI_S_ADDR_WIDTH    = 32,
+#    parameter AXI_M_ADDR_WIDTH    = 40,
+#    parameter AXI_LITE_DATA_WIDTH = 64,
+#    parameter AXI_LITE_ADDR_WIDTH = 32,
+#    parameter AXI_ID_WIDTH        =  8,
+#    parameter AXI_USER_WIDTH      =  6,
+#    parameter MH_FIFO_DEPTH       = 16
+#    )
+#   (
+#    input  logic                                         Clk_CI,
+#    input  logic                                         Rst_RBI,
+#
+#    input  logic               [AXI_LITE_ADDR_WIDTH-1:0] s_axi_awaddr,
+#    input  logic                                         s_axi_awvalid,
+#    output logic                                         s_axi_awready,
+#
+#    input  logic               [AXI_LITE_DATA_WIDTH-1:0] s_axi_wdata,
+#    input  logic             [AXI_LITE_DATA_WIDTH/8-1:0] s_axi_wstrb,
+#    input  logic                                         s_axi_wvalid,
+#    output logic                                         s_axi_wready,
+#
+#    input  logic               [AXI_LITE_ADDR_WIDTH-1:0] s_axi_araddr,
+#    input  logic                                         s_axi_arvalid,
+#    output logic                                         s_axi_arready,
+#
+#    input  logic                                         s_axi_rready,
+#    output logic               [AXI_LITE_DATA_WIDTH-1:0] s_axi_rdata,
+#    output logic                                   [1:0] s_axi_rresp,
+#    output logic                                         s_axi_rvalid,
+#
+#    output logic                                   [1:0] s_axi_bresp,
+#    output logic                                         s_axi_bvalid,
+#    input  logic                                         s_axi_bready,
+#
+#    output logic [N_PORTS-1:0]                           int_miss,
+#    output logic [N_PORTS-1:0]                           int_prot,
+#    output logic [N_PORTS-1:0]                           int_multi,
+#    output logic [N_PORTS-1:0]                           int_prefetch,
+#    output logic                                         int_mhf_full,
+#
+#    output logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] int_axaddr_o,
+#    output logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] int_axid_o,
+#    output logic [N_PORTS-1:0]                     [7:0] int_axlen_o,
+#    output logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] int_axuser_o,
+#
+#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] port1_addr,
+#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] port1_id,
+#    input  logic [N_PORTS-1:0]                     [7:0] port1_len,
+#    input  logic [N_PORTS-1:0]                     [2:0] port1_size,
+#    input  logic [N_PORTS-1:0]                           port1_addr_valid,
+#    input  logic [N_PORTS-1:0]                           port1_type,
+#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] port1_user,
+#    input  logic [N_PORTS-1:0]                           port1_sent,
+#    output logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] port1_out_addr,
+#    output logic [N_PORTS-1:0]                           port1_cache_coherent,
+#    output logic [N_PORTS-1:0]                           port1_accept,
+#    output logic [N_PORTS-1:0]                           port1_drop,
+#    output logic [N_PORTS-1:0]                           port1_miss,
+#
+#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] port2_addr,
+#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] port2_id,
+#    input  logic [N_PORTS-1:0]                     [7:0] port2_len,
+#    input  logic [N_PORTS-1:0]                     [2:0] port2_size,
+#    input  logic [N_PORTS-1:0]                           port2_addr_valid,
+#    input  logic [N_PORTS-1:0]                           port2_type,
+#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] port2_user,
+#    input  logic [N_PORTS-1:0]                           port2_sent,
+#    output logic [N_PORTS-1:0]    [AXI_M_ADDR_WIDTH-1:0] port2_out_addr,
+#    output logic [N_PORTS-1:0]                           port2_cache_coherent,
+#    output logic [N_PORTS-1:0]                           port2_accept,
+#    output logic [N_PORTS-1:0]                           port2_drop,
+#    output logic [N_PORTS-1:0]                           port2_miss,
+#
+#    input  logic [N_PORTS-1:0]                           miss_l2_i,
+#    input  logic [N_PORTS-1:0]    [AXI_S_ADDR_WIDTH-1:0] miss_l2_addr_i,
+#    input  logic [N_PORTS-1:0]        [AXI_ID_WIDTH-1:0] miss_l2_id_i,
+#    input  logic [N_PORTS-1:0]      [AXI_USER_WIDTH-1:0] miss_l2_user_i,
+#
+#    output logic [N_PORTS-1:0] [AXI_LITE_DATA_WIDTH-1:0] wdata_l2_o,
+#    output logic [N_PORTS-1:0] [AXI_LITE_ADDR_WIDTH-1:0] waddr_l2_o,
+#    output logic [N_PORTS-1:0]                           wren_l2_o
+#    );
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class rab_core(Elaboratable):
+
+    def __init__(self):
+        self.s_axi_awaddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
+        self.s_axi_awvalid = Signal()  # input
+        self.s_axi_awready = Signal()  # output
+        self.s_axi_wdata = Signal(AXI_LITE_DATA_WIDTH)  # input
+        self.s_axi_wstrb = Signal(FIXME)  # input
+        self.s_axi_wvalid = Signal()  # input
+        self.s_axi_wready = Signal()  # output
+        self.s_axi_araddr = Signal(AXI_LITE_ADDR_WIDTH)  # input
+        self.s_axi_arvalid = Signal()  # input
+        self.s_axi_arready = Signal()  # output
+        self.s_axi_rready = Signal()  # input
+        self.s_axi_rdata = Signal(AXI_LITE_DATA_WIDTH)  # output
+        self.s_axi_rresp = Signal(2)  # output
+        self.s_axi_rvalid = Signal()  # output
+        self.s_axi_bresp = Signal(2)  # output
+        self.s_axi_bvalid = Signal()  # output
+        self.s_axi_bready = Signal()  # input
+        self.int_miss = Signal(N_PORTS)  # output
+        self.int_prot = Signal(N_PORTS)  # output
+        self.int_multi = Signal(N_PORTS)  # output
+        self.int_prefetch = Signal(N_PORTS)  # output
+        self.int_mhf_full = Signal()  # output
+        self.int_axaddr_o = Signal()  # output
+        self.int_axid_o = Signal()  # output
+        self.int_axlen_o = Signal()  # output
+        self.int_axuser_o = Signal()  # output
+        self.port1_addr = Signal()  # input
+        self.port1_id = Signal()  # input
+        self.port1_len = Signal()  # input
+        self.port1_size = Signal()  # input
+        self.port1_addr_valid = Signal(N_PORTS)  # input
+        self.port1_type = Signal(N_PORTS)  # input
+        self.port1_user = Signal()  # input
+        self.port1_sent = Signal(N_PORTS)  # input
+        self.port1_out_addr = Signal()  # output
+        self.port1_cache_coherent = Signal(N_PORTS)  # output
+        self.port1_accept = Signal(N_PORTS)  # output
+        self.port1_drop = Signal(N_PORTS)  # output
+        self.port1_miss = Signal(N_PORTS)  # output
+        self.port2_addr = Signal()  # input
+        self.port2_id = Signal()  # input
+        self.port2_len = Signal()  # input
+        self.port2_size = Signal()  # input
+        self.port2_addr_valid = Signal(N_PORTS)  # input
+        self.port2_type = Signal(N_PORTS)  # input
+        self.port2_user = Signal()  # input
+        self.port2_sent = Signal(N_PORTS)  # input
+        self.port2_out_addr = Signal()  # output
+        self.port2_cache_coherent = Signal(N_PORTS)  # output
+        self.port2_accept = Signal(N_PORTS)  # output
+        self.port2_drop = Signal(N_PORTS)  # output
+        self.port2_miss = Signal(N_PORTS)  # output
+        self.miss_l2_i = Signal(N_PORTS)  # input
+        self.miss_l2_addr_i = Signal()  # input
+        self.miss_l2_id_i = Signal()  # input
+        self.miss_l2_user_i = Signal()  # input
+        self.wdata_l2_o = Signal()  # output
+        self.waddr_l2_o = Signal()  # output
+        self.wren_l2_o = Signal(N_PORTS)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        return m
+
+
+""" 
+
+
+    // ███████╗██╗ ██████╗ ███╗   ██╗ █████╗ ██╗     ███████╗
+    // ██╔════╝██║██╔════╝ ████╗  ██║██╔══██╗██║     ██╔════╝
+    // ███████╗██║██║  ███╗██╔██╗ ██║███████║██║     ███████╗
+    // ╚════██║██║██║   ██║██║╚██╗██║██╔══██║██║     ╚════██║
+    // ███████║██║╚██████╔╝██║ ╚████║██║  ██║███████╗███████║
+    // ╚══════╝╚═╝ ╚═════╝ ╚═╝  ╚═══╝╚═╝  ╚═╝╚══════╝╚══════╝
+    // signals
+
+  localparam integer ENABLE_L2TLB[N_PORTS-1:0] = `EN_L2TLB_ARRAY;
+
+  localparam integer N_SLICES[N_PORTS-1:0]     = `N_SLICES_ARRAY;
+  localparam         N_SLICES_TOT              = `MY_ARRAY_SUM(N_SLICES,N_PORTS);
+  localparam         N_SLICES_MAX              = `N_SLICES_MAX;
+
+  localparam N_REGS                            = 4*N_SLICES_TOT + 4;
+  localparam AXI_SIZE_WIDTH                    = log2(AXI_DATA_WIDTH/8);
+
+  localparam PORT_ID_WIDTH                     = (N_PORTS < 2) ? 1 : log2(N_PORTS);
+  localparam MISS_META_WIDTH                   = PORT_ID_WIDTH + AXI_USER_WIDTH + AXI_ID_WIDTH;
+
+  logic [N_PORTS-1:0]                      [15:0] p1_burst_size;
+  logic [N_PORTS-1:0]                      [15:0] p2_burst_size;
+
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p1_align_addr;
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p2_align_addr;
+
+  logic [N_PORTS-1:0]        [AXI_SIZE_WIDTH-1:0] p1_mask;
+  logic [N_PORTS-1:0]        [AXI_SIZE_WIDTH-1:0] p2_mask;
+
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p1_max_addr;
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] p2_max_addr;
+
+  logic [N_PORTS-1:0]                             p1_prefetch;
+  logic [N_PORTS-1:0]                             p2_prefetch;
+
+  logic [N_PORTS-1:0]                             int_rw;
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] int_addr_min;
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] int_addr_max;
+  logic [N_PORTS-1:0]          [AXI_ID_WIDTH-1:0] int_id;
+  logic [N_PORTS-1:0]                       [7:0] int_len;
+  logic [N_PORTS-1:0]        [AXI_USER_WIDTH-1:0] int_user;
+
+  logic [N_PORTS-1:0]                             hit;
+  logic [N_PORTS-1:0]                             prot;
+  logic [N_PORTS-1:0]                             prefetch;
+
+  logic [N_PORTS-1:0]                             no_hit;
+  logic [N_PORTS-1:0]                             no_prot;
+
+  logic [N_PORTS-1:0]          [N_SLICES_MAX-1:0] hit_slices;
+  logic [N_PORTS-1:0]          [N_SLICES_MAX-1:0] prot_slices;
+
+  logic [N_PORTS-1:0]      [AXI_M_ADDR_WIDTH-1:0] out_addr;
+  logic [N_PORTS-1:0]      [AXI_M_ADDR_WIDTH-1:0] out_addr_reg;
+
+  logic [N_PORTS-1:0]                             cache_coherent;
+  logic [N_PORTS-1:0]                             cache_coherent_reg;
+
+  logic [N_PORTS-1:0]                             select;
+  reg   [N_PORTS-1:0]                             curr_priority;
+
+  reg   [N_PORTS-1:0]                             multi_hit;
+
+  logic [N_PORTS-1:0]                             miss_valid_mhf;
+  logic [N_PORTS-1:0]      [AXI_S_ADDR_WIDTH-1:0] miss_addr_mhf;
+  logic [N_PORTS-1:0]       [MISS_META_WIDTH-1:0] miss_meta_mhf;
+
+  logic [N_REGS-1:0]                       [63:0] int_cfg_regs;
+  logic [N_PORTS-1:0] [4*N_SLICES_MAX-1:0] [63:0] int_cfg_regs_slices;
+
+  logic                                           L1AllowMultiHit_S;
+
+  genvar z;
+
+  //  █████╗ ███████╗███████╗██╗ ██████╗ ███╗   ██╗███╗   ███╗███████╗███╗   ██╗████████╗███████╗
+  // ██╔══██╗██╔════╝██╔════╝██║██╔════╝ ████╗  ██║████╗ ████║██╔════╝████╗  ██║╚══██╔══╝██╔════╝
+  // ███████║███████╗███████╗██║██║  ███╗██╔██╗ ██║██╔████╔██║█████╗  ██╔██╗ ██║   ██║   ███████╗
+  // ██╔══██║╚════██║╚════██║██║██║   ██║██║╚██╗██║██║╚██╔╝██║██╔══╝  ██║╚██╗██║   ██║   ╚════██║
+  // ██║  ██║███████║███████║██║╚██████╔╝██║ ╚████║██║ ╚═╝ ██║███████╗██║ ╚████║   ██║   ███████║
+  // ╚═╝  ╚═╝╚══════╝╚══════╝╚═╝ ╚═════╝ ╚═╝  ╚═══╝╚═╝     ╚═╝╚══════╝╚═╝  ╚═══╝   ╚═╝   ╚══════╝
+  // assignments
+
+  always_comb
+    begin : PORT_SELECT
+      var integer idx;
+
+      for (idx=0; idx<N_PORTS; idx++) begin
+
+        // select = 1 -> port1 active
+        // select = 0 -> port2 active
+        select[idx] = (curr_priority[idx] & port1_addr_valid[idx]) | ~port2_addr_valid[idx];
+
+        p1_burst_size[idx] = (port1_len[idx] + 1) << port1_size[idx];
+        p2_burst_size[idx] = (port2_len[idx] + 1) << port2_size[idx];
+
+        // align min addr for max addr computation to allow for smart AXI bursts around the 4k boundary
+        if      (port1_size[idx] == 3'b001)
+          p1_mask[idx] = 3'b110;
+        else if (port1_size[idx] == 3'b010)
+          p1_mask[idx] = 3'b100;
+        else if (port1_size[idx] == 3'b011)
+          p1_mask[idx] = 3'b000;
+        else
+          p1_mask[idx] = 3'b111;
+
+        p1_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port1_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
+        p1_align_addr[idx][AXI_SIZE_WIDTH-1:0]                = port1_addr[idx][AXI_SIZE_WIDTH-1:0] & p1_mask[idx];
+
+        if      (port2_size[idx] == 3'b001)
+          p2_mask[idx] = 3'b110;
+        else if (port2_size[idx] == 3'b010)
+          p2_mask[idx] = 3'b100;
+        else if (port2_size[idx] == 3'b011)
+          p2_mask[idx] = 3'b000;
+        else
+          p2_mask[idx] = 3'b111;
+
+        if (port1_user[idx] == {AXI_USER_WIDTH{1'b1}})
+          p1_prefetch[idx] = 1'b1;
+        else
+          p1_prefetch[idx] = 1'b0;
+
+        if (port2_user[idx] == {AXI_USER_WIDTH{1'b1}})
+          p2_prefetch[idx] = 1'b1;
+        else
+          p2_prefetch[idx] = 1'b0;
+
+        p2_align_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH] = port2_addr[idx][AXI_S_ADDR_WIDTH-1:AXI_SIZE_WIDTH];
+        p2_align_addr[idx][AXI_SIZE_WIDTH-1:0]                = port2_addr[idx][AXI_SIZE_WIDTH-1:0] & p2_mask[idx];
+
+        p1_max_addr[idx]  = p1_align_addr[idx] + p1_burst_size[idx] - 1;
+        p2_max_addr[idx]  = p2_align_addr[idx] + p2_burst_size[idx] - 1;
+
+        int_addr_min[idx] = select[idx] ? port1_addr[idx]  : port2_addr[idx];
+        int_addr_max[idx] = select[idx] ? p1_max_addr[idx] : p2_max_addr[idx];
+        int_rw[idx]       = select[idx] ? port1_type[idx]  : port2_type[idx];
+        int_id[idx]       = select[idx] ? port1_id[idx]    : port2_id[idx];
+        int_len[idx]      = select[idx] ? port1_len[idx]   : port2_len[idx];
+        int_user[idx]     = select[idx] ? port1_user[idx]  : port2_user[idx];
+        prefetch[idx]     = select[idx] ? p1_prefetch[idx] : p2_prefetch[idx];
+
+        hit [idx]    = | hit_slices [idx];
+        prot[idx]    = | prot_slices[idx];
+
+        no_hit [idx] = ~hit [idx];
+        no_prot[idx] = ~prot[idx];
+
+        port1_out_addr[idx] = out_addr_reg[idx];
+        port2_out_addr[idx] = out_addr_reg[idx];
+
+        port1_cache_coherent[idx] = cache_coherent_reg[idx];
+        port2_cache_coherent[idx] = cache_coherent_reg[idx];
+      end
+    end
+
+  always_comb
+    begin
+      var integer idx_port, idx_slice;
+      var integer reg_num;
+      reg_num=0;
+      for ( idx_port = 0; idx_port < N_PORTS; idx_port++ ) begin
+        for ( idx_slice = 0; idx_slice < 4*N_SLICES[idx_port]; idx_slice++ ) begin
+          int_cfg_regs_slices[idx_port][idx_slice] = int_cfg_regs[4+reg_num];
+          reg_num++;
+        end
+        // int_cfg_regs_slices[idx_port][N_SLICES_MAX:N_SLICES[idx_port]] will be dangling
+        // Fix to zero. Synthesis will remove these signals.
+        // int_cfg_regs_slices[idx_port][4*N_SLICES_MAX-1:4*N_SLICES[idx_port]] = 0;
+      end
+  end
+
+  always @(posedge Clk_CI or negedge Rst_RBI)
+    begin : PORT_PRIORITY
+      var integer idx;
+      if (Rst_RBI == 1'b0)
+        curr_priority = 'h0;
+      else begin
+        for (idx=0; idx<N_PORTS; idx++) begin
+          if (port1_accept[idx] || port1_drop[idx])
+            curr_priority[idx] = 1'b1;
+          else if (port2_accept[idx] || port2_drop[idx])
+            curr_priority[idx] = 1'b0;
+        end
+      end
+    end
+
+  // find port that misses
+  logic [PORT_ID_WIDTH-1:0] PortIdx_D; // index of the first missing port
+  var integer               idx_miss;
+  always_comb begin : MHF_PORT_SELECT
+    PortIdx_D = 'b0;
+    for (idx_miss = 0; idx_miss < N_PORTS; idx_miss++) begin
+      if (miss_valid_mhf[idx_miss] == 1'b1) begin
+        PortIdx_D = idx_miss;
+        break;
+      end
+    end
+  end // always_comb begin
+
+  //  █████╗ ██╗  ██╗██╗    ██████╗  █████╗ ██████╗      ██████╗███████╗ ██████╗
+  // ██╔══██╗╚██╗██╔╝██║    ██╔══██╗██╔══██╗██╔══██╗    ██╔════╝██╔════╝██╔════╝
+  // ███████║ ╚███╔╝ ██║    ██████╔╝███████║██████╔╝    ██║     █████╗  ██║  ███╗
+  // ██╔══██║ ██╔██╗ ██║    ██╔══██╗██╔══██║██╔══██╗    ██║     ██╔══╝  ██║   ██║
+  // ██║  ██║██╔╝ ██╗██║    ██║  ██║██║  ██║██████╔╝    ╚██████╗██║     ╚██████╔╝
+  // ╚═╝  ╚═╝╚═╝  ╚═╝╚═╝    ╚═╝  ╚═╝╚═╝  ╚═╝╚═════╝      ╚═════╝╚═╝      ╚═════╝
+  axi_rab_cfg
+    #(
+      .N_PORTS         ( N_PORTS             ),
+      .N_REGS          ( N_REGS              ),
+      .N_L2_SETS       ( N_L2_SETS           ),
+      .N_L2_SET_ENTRIES( N_L2_SET_ENTRIES    ),
+      .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH    ),
+      .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH    ),
+      .N_FLAGS         ( 4                   ),
+      .AXI_DATA_WIDTH  ( AXI_LITE_DATA_WIDTH ),
+      .AXI_ADDR_WIDTH  ( AXI_LITE_ADDR_WIDTH ),
+      .MISS_META_WIDTH ( MISS_META_WIDTH     ),
+      .MH_FIFO_DEPTH   ( MH_FIFO_DEPTH       )
+    )
+    u_axi_rab_cfg
+    (
+      .Clk_CI             ( Clk_CI                    ),
+      .Rst_RBI            ( Rst_RBI                   ),
+      .s_axi_awaddr       ( s_axi_awaddr              ),
+      .s_axi_awvalid      ( s_axi_awvalid             ),
+      .s_axi_wdata        ( s_axi_wdata               ),
+      .s_axi_wstrb        ( s_axi_wstrb               ),
+      .s_axi_wvalid       ( s_axi_wvalid              ),
+      .s_axi_bready       ( s_axi_bready              ),
+      .s_axi_araddr       ( s_axi_araddr              ),
+      .s_axi_arvalid      ( s_axi_arvalid             ),
+      .s_axi_rready       ( s_axi_rready              ),
+      .s_axi_arready      ( s_axi_arready             ),
+      .s_axi_rdata        ( s_axi_rdata               ),
+      .s_axi_rresp        ( s_axi_rresp               ),
+      .s_axi_rvalid       ( s_axi_rvalid              ),
+      .s_axi_wready       ( s_axi_wready              ),
+      .s_axi_bresp        ( s_axi_bresp               ),
+      .s_axi_bvalid       ( s_axi_bvalid              ),
+      .s_axi_awready      ( s_axi_awready             ),
+      .L1Cfg_DO           ( int_cfg_regs              ),
+      .L1AllowMultiHit_SO ( L1AllowMultiHit_S         ),
+      .MissAddr_DI        ( miss_addr_mhf[PortIdx_D]  ),
+      .MissMeta_DI        ( miss_meta_mhf[PortIdx_D]  ),
+      .Miss_SI            ( miss_valid_mhf[PortIdx_D] ),
+      .MhFifoFull_SO      ( int_mhf_full              ),
+      .wdata_l2           ( wdata_l2_o                ),
+      .waddr_l2           ( waddr_l2_o                ),
+      .wren_l2            ( wren_l2_o                 )
+    );
+
+  generate for (z = 0; z < N_PORTS; z++) begin : MHF_TLB_SELECT
+    if (ENABLE_L2TLB[z] == 1) begin // L2 TLB is enabled
+      assign miss_valid_mhf[z] = miss_l2_i[z];
+      assign miss_addr_mhf[z]  = miss_l2_addr_i[z];
+      assign miss_meta_mhf[z]  = {miss_l2_user_i[z], PortIdx_D, miss_l2_id_i[z]};
+    end else begin// L2 TLB is disabled
+      assign miss_valid_mhf[z] = int_miss[z];
+      assign miss_addr_mhf[z]  = int_addr_min[z];
+      assign miss_meta_mhf[z]  = {int_user[z], PortIdx_D, int_id[z]};
+    end
+  end
+  endgenerate
+
+  // ███████╗██╗     ██╗ ██████╗███████╗    ████████╗ ██████╗ ██████╗
+  // ██╔════╝██║     ██║██╔════╝██╔════╝    ╚══██╔══╝██╔═══██╗██╔══██╗
+  // ███████╗██║     ██║██║     █████╗         ██║   ██║   ██║██████╔╝
+  // ╚════██║██║     ██║██║     ██╔══╝         ██║   ██║   ██║██╔═══╝
+  // ███████║███████╗██║╚██████╗███████╗       ██║   ╚██████╔╝██║
+  // ╚══════╝╚══════╝╚═╝ ╚═════╝╚══════╝       ╚═╝    ╚═════╝ ╚═╝
+  generate for (z = 0; z < N_PORTS; z++) begin : SLICE_TOP_GEN
+    slice_top
+      #(
+        .N_SLICES        ( N_SLICES[z]      ),
+        .N_REGS          ( 4*N_SLICES[z]    ),
+        .ADDR_WIDTH_PHYS ( AXI_M_ADDR_WIDTH ),
+        .ADDR_WIDTH_VIRT ( AXI_S_ADDR_WIDTH )
+      )
+      u_slice_top
+      (
+        .int_cfg_regs    ( int_cfg_regs_slices[z][4*N_SLICES[z]-1:0] ),
+        .int_rw          ( int_rw[z]                                 ),
+        .int_addr_min    ( int_addr_min[z]                           ),
+        .int_addr_max    ( int_addr_max[z]                           ),
+        .multi_hit_allow ( L1AllowMultiHit_S                         ),
+        .multi_hit       ( multi_hit[z]                              ),
+        .prot            ( prot_slices[z][N_SLICES[z]-1:0]           ),
+        .hit             ( hit_slices [z][N_SLICES[z]-1:0]           ),
+        .cache_coherent  ( cache_coherent[z]                         ),
+        .out_addr        ( out_addr[z]                               )
+      );
+    // hit_slices [N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
+    // prot_slices[N_SLICES_MAX-1:N_SLICES_MAX-N_SLICES[z]] will be dangling
+    // Fix to zero. Synthesis will remove these signals.
+    if ( N_SLICES[z] < N_SLICES_MAX ) begin
+      assign hit_slices [z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
+      assign prot_slices[z][N_SLICES_MAX-1:N_SLICES[z]] = 0;
+    end
+  end // for (z = 0; z < N_PORTS; z++)
+  endgenerate
+
+  // ███████╗███████╗███╗   ███╗
+  // ██╔════╝██╔════╝████╗ ████║
+  // █████╗  ███████╗██╔████╔██║
+  // ██╔══╝  ╚════██║██║╚██╔╝██║
+  // ██║     ███████║██║ ╚═╝ ██║
+  // ╚═╝     ╚══════╝╚═╝     ╚═╝
+  //
+  generate for (z = 0; z < N_PORTS; z++) begin : FSM_GEN
+    fsm
+      #(
+        .AXI_M_ADDR_WIDTH ( AXI_M_ADDR_WIDTH ),
+        .AXI_S_ADDR_WIDTH ( AXI_S_ADDR_WIDTH ),
+        .AXI_ID_WIDTH     ( AXI_ID_WIDTH     ),
+        .AXI_USER_WIDTH   ( AXI_USER_WIDTH   )
+      )
+      u_fsm
+      (
+        .Clk_CI             ( Clk_CI                ),
+        .Rst_RBI            ( Rst_RBI               ),
+        .port1_addr_valid_i ( port1_addr_valid[z]   ),
+        .port2_addr_valid_i ( port2_addr_valid[z]   ),
+        .port1_sent_i       ( port1_sent[z]         ),
+        .port2_sent_i       ( port2_sent[z]         ),
+        .select_i           ( select[z]             ),
+        .no_hit_i           ( no_hit[z]             ),
+        .multi_hit_i        ( multi_hit[z]          ),
+        .no_prot_i          ( no_prot[z]            ),
+        .prefetch_i         ( prefetch[z]           ),
+        .out_addr_i         ( out_addr[z]           ),
+        .cache_coherent_i   ( cache_coherent[z]     ),
+        .port1_accept_o     ( port1_accept[z]       ),
+        .port1_drop_o       ( port1_drop[z]         ),
+        .port1_miss_o       ( port1_miss[z]         ),
+        .port2_accept_o     ( port2_accept[z]       ),
+        .port2_drop_o       ( port2_drop[z]         ),
+        .port2_miss_o       ( port2_miss[z]         ),
+        .out_addr_o         ( out_addr_reg[z]       ),
+        .cache_coherent_o   ( cache_coherent_reg[z] ),
+        .miss_o             ( int_miss[z]           ),
+        .multi_o            ( int_multi[z]          ),
+        .prot_o             ( int_prot[z]           ),
+        .prefetch_o         ( int_prefetch[z]       ),
+        .in_addr_i          ( int_addr_min[z]       ),
+        .in_id_i            ( int_id[z]             ),
+        .in_len_i           ( int_len[z]            ),
+        .in_user_i          ( int_user[z]           ),
+        .in_addr_o          ( int_axaddr_o[z]       ),
+        .in_id_o            ( int_axid_o[z]         ),
+        .in_len_o           ( int_axlen_o[z]        ),
+        .in_user_o          ( int_axuser_o[z]       )
+      );
+  end
+  endgenerate
+  
+"""
diff --git a/unused_please_ignore_completely/iommu/axi_rab/rab_slice.py b/unused_please_ignore_completely/iommu/axi_rab/rab_slice.py
new file mode 100644 (file)
index 0000000..59f84e3
--- /dev/null
@@ -0,0 +1,76 @@
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# module rab_slice
+# #(
+#    parameter ADDR_WIDTH_PHYS = 40,
+#    parameter ADDR_WIDTH_VIRT = 32
+#    )
+#   (
+#    input  logic [ADDR_WIDTH_VIRT-1:0] cfg_min,
+#    input  logic [ADDR_WIDTH_VIRT-1:0] cfg_max,
+#    input  logic [ADDR_WIDTH_PHYS-1:0] cfg_offset,
+#    input  logic                       cfg_wen,
+#    input  logic                       cfg_ren,
+#    input  logic                       cfg_en,
+#    input  logic                       in_trans_type,
+#    input  logic [ADDR_WIDTH_VIRT-1:0] in_addr_min,
+#    input  logic [ADDR_WIDTH_VIRT-1:0] in_addr_max,
+#    output logic                       out_hit,
+#    output logic                       out_prot,
+#    output logic [ADDR_WIDTH_PHYS-1:0] out_addr
+#  );
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+
+
+class rab_slice(Elaboratable):
+
+    def __init__(self, params):  # pass config object
+        # TODO parameters
+        self.params = params
+        self.cfg_min = Signal(params.ADDR_WIDTH_VIRT)  # input
+        self.cfg_max = Signal(params.ADDR_WIDTH_VIRT)  # input
+        self.cfg_offset = Signal(params.ADDR_WIDTH_PHYS)  # input
+        self.cfg_wen = Signal()  # input
+        self.cfg_ren = Signal()  # input
+        self.cfg_en = Signal()  # input
+        self.in_trans_type = Signal()  # input
+        self.in_addr_min = Signal(params.ADDR_WIDTH_VIRT)  # input
+        self.in_addr_max = Signal(params.ADDR_WIDTH_VIRT)  # input
+        self.out_hit = Signal()  # output
+        self.out_prot = Signal()  # output
+        self.out_addr = Signal(params.ADDR_WIDTH_PHYS)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+        min_above_min = Signal()
+        min_below_max = Signal()
+        max_below_max = Signal()
+
+        #  assign min_above_min = (in_addr_min >= cfg_min) ? 1'b1 : 1'b0;
+        #  assign min_below_max = (in_addr_min <= cfg_max) ? 1'b1 : 1'b0;
+        #  assign max_below_max = (in_addr_max <= cfg_max) ? 1'b1 : 1'b0;
+        #  assign out_hit  = cfg_en & min_above_min & min_below_max & max_below_max;
+        #  assign out_prot = out_hit & ((in_trans_type & ~cfg_wen) | (~in_trans_type & ~cfg_ren));
+        #  assign out_addr = in_addr_min - cfg_min + cfg_offset;
+        m.d.comb += [
+            min_above_min.eq(self.in_addr_min >= self.cfg_min),
+            min_below_max.eq(self.in_addr_min <= self.cfg_max),
+            max_below_max.eq(self.in_addr_max <= self.cfg_max),
+            self.out_hit.eq(self.cfg_en & min_above_min &
+                            min_below_max & max_below_max),
+            self.out_prot.eq(self.out_hit & (
+                (self.in_trans_type & ~self.cfg_wen) | (~self.in_trans_type & ~self.cfg_ren))),
+            self.out_addr.eq(self.in_addr_min - self.cfg_min + self.cfg_offset)
+        ]
+
+        return m
diff --git a/unused_please_ignore_completely/iommu/axi_rab/ram_tp_no_change.py b/unused_please_ignore_completely/iommu/axi_rab/ram_tp_no_change.py
new file mode 100644 (file)
index 0000000..d010473
--- /dev/null
@@ -0,0 +1,97 @@
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# /*
+# * ram_tp_no_change
+# *
+# * This code implements a parameterizable two-port memory. Port 0 can read and
+# * write while Port 1 can read only. The Xilinx tools will infer a BRAM with
+# * Port 0 in "no change" mode, i.e., during a write, it retains the last read
+# * value on the output. Port 1 (read-only) is in "write first" mode. Still, it
+# * outputs the old data during the write cycle. Note: Port 1 outputs invalid
+# * data in the cycle after the write when reading the same address.
+# *
+# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
+# */
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+from nmigen import Memory
+
+import math
+
+#
+# module ram_tp_no_change
+#  #(
+ADDR_WIDTH = 10
+DATA_WIDTH = 36
+#  )
+#  (
+#    input                   clk,
+#    input                   we,
+#    input  [ADDR_WIDTH-1:0] addr0,
+#    input  [ADDR_WIDTH-1:0] addr1,
+#    input  [DATA_WIDTH-1:0] d_i,
+#    output [DATA_WIDTH-1:0] d0_o,
+#    output [DATA_WIDTH-1:0] d1_o
+#  );
+
+
+class ram_tp_no_change(Elaboratable):
+
+    def __init__(self):
+        self.we = Signal()               # input
+        self.addr0 = Signal(ADDR_WIDTH)  # input
+        self.addr1 = Signal(ADDR_WIDTH)  # input
+        self.d_i = Signal(DATA_WIDTH)    # input
+        self.d0_o = Signal(DATA_WIDTH)   # output
+        self.d1_o = Signal(DATA_WIDTH)   # output
+
+        DEPTH = int(math.pow(2, ADDR_WIDTH))
+        self.ram = Memory(width=DATA_WIDTH, depth=DEPTH)
+    #
+    #  localparam DEPTH = 2**ADDR_WIDTH;
+    #
+    #  (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
+    #                            reg [DATA_WIDTH-1:0] d0;
+    #                            reg [DATA_WIDTH-1:0] d1;
+    #
+    #  always_ff @(posedge clk) begin
+    #    if(we == 1'b1) begin
+    #      ram[addr0] <= d_i;
+    #    end else begin
+    # only change data if we==false
+    #      d0 <= ram[addr0];
+    #    end
+    #    d1   <= ram[addr1];
+    #  end
+    #
+    #  assign d0_o = d0;
+    #  assign d1_o = d1;
+    #
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.submodules.read_ram0 = read_ram0 = self.ram.read_port()
+        m.submodules.read_ram1 = read_ram1 = self.ram.read_port()
+        m.submodules.write_ram = write_ram = self.ram.write_port()
+
+        # write port
+        m.d.comb += write_ram.en.eq(self.we)
+        m.d.comb += write_ram.addr.eq(self.addr0)
+        m.d.comb += write_ram.data.eq(self.d_i)
+
+        # read ports
+        m.d.comb += read_ram0.addr.eq(self.addr0)
+        m.d.comb += read_ram1.addr.eq(self.addr1)
+        with m.If(self.we == 0):
+            m.d.sync += self.d0_o.eq(read_ram0.data)
+        m.d.sync += self.d1_o.eq(read_ram1.data)
+
+        return m
diff --git a/unused_please_ignore_completely/iommu/axi_rab/ram_tp_write_first.py b/unused_please_ignore_completely/iommu/axi_rab/ram_tp_write_first.py
new file mode 100644 (file)
index 0000000..8fd2abb
--- /dev/null
@@ -0,0 +1,93 @@
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+#
+# /*
+# * ram_tp_write_first
+# *
+# * This code implements a parameterizable two-port memory. Port 0 can read and
+# * write while Port 1 can read only. Xilinx Vivado will infer a BRAM in
+# * "write first" mode, i.e., upon a read and write to the same address, the
+# * new value is read. Note: Port 1 outputs invalid data in the cycle after
+# * the write when reading the same address.
+# *
+# * For more information, see Xilinx PG058 Block Memory Generator Product Guide.
+# */
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+from nmigen import Memory
+
+import math
+#
+# module ram_tp_write_first
+#  #(
+ADDR_WIDTH = 10
+DATA_WIDTH = 36
+#  )
+#  (
+#    input                   clk,
+#    input                   we,
+#    input  [ADDR_WIDTH-1:0] addr0,
+#    input  [ADDR_WIDTH-1:0] addr1,
+#    input  [DATA_WIDTH-1:0] d_i,
+#    output [DATA_WIDTH-1:0] d0_o,
+#    output [DATA_WIDTH-1:0] d1_o
+#  );
+
+
+class ram_tp_write_first(Elaboratable):
+
+    def __init__(self):
+        self.we = Signal()               # input
+        self.addr0 = Signal(ADDR_WIDTH)  # input
+        self.addr1 = Signal(ADDR_WIDTH)  # input
+        self.d_i = Signal(DATA_WIDTH)    # input
+        self.d0_o = Signal(DATA_WIDTH)   # output
+        self.d1_o = Signal(DATA_WIDTH)   # output
+
+        DEPTH = int(math.pow(2, ADDR_WIDTH))
+        self.ram = Memory(width=DATA_WIDTH, depth=DEPTH)
+
+    #
+    #  localparam DEPTH = 2**ADDR_WIDTH;
+    #
+    #  (* ram_style = "block" *) reg [DATA_WIDTH-1:0] ram[DEPTH];
+    #                            reg [ADDR_WIDTH-1:0] raddr0;
+    #                            reg [ADDR_WIDTH-1:0] raddr1;
+    #
+    #  always_ff @(posedge clk) begin
+    #    if(we == 1'b1) begin
+    #      ram[addr0] <= d_i;
+    #    end
+    #    raddr0 <= addr0;
+    #    raddr1 <= addr1;
+    #  end
+    #
+    #  assign d0_o = ram[raddr0];
+    #  assign d1_o = ram[raddr1];
+    #
+
+    def elaborate(self, platform=None):
+        m = Module()
+        m.submodules.read_ram0 = read_ram0 = self.ram.read_port()
+        m.submodules.read_ram1 = read_ram1 = self.ram.read_port()
+        m.submodules.write_ram = write_ram = self.ram.write_port()
+
+        # write port
+        m.d.comb += write_ram.en.eq(self.we)
+        m.d.comb += write_ram.addr.eq(self.addr0)
+        m.d.comb += write_ram.data.eq(self.d_i)
+
+        # read ports
+        m.d.comb += read_ram0.addr.eq(self.addr0)
+        m.d.comb += read_ram1.addr.eq(self.addr1)
+        m.d.sync += self.d0_o.eq(read_ram0.data)
+        m.d.sync += self.d1_o.eq(read_ram1.data)
+
+        return m
diff --git a/unused_please_ignore_completely/iommu/axi_rab/slice_top.py b/unused_please_ignore_completely/iommu/axi_rab/slice_top.py
new file mode 100644 (file)
index 0000000..6eedb1c
--- /dev/null
@@ -0,0 +1,141 @@
+# // Copyright 2018 ETH Zurich and University of Bologna.
+# // Copyright and related rights are licensed under the Solderpad Hardware
+# // License, Version 0.51 (the "License"); you may not use this file except in
+# // compliance with the License.  You may obtain a copy of the License at
+# // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+# // or agreed to in writing, software, hardware and materials distributed under
+# // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# // CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# // specific language governing permissions and limitations under the License.
+
+# this file has been generated by sv2nmigen
+
+from nmigen import Signal, Module, Const, Cat, Elaboratable
+import rab_slice
+import coreconfig
+
+#
+# module slice_top
+# //#(
+#  //  parameter N_SLICES        = 16,
+#  //  parameter N_REGS          = 4*N_SLICES,
+#   // parameter ADDR_WIDTH_PHYS = 40,
+#   // parameter ADDR_WIDTH_VIRT = 32
+#  //  )
+#   (
+#    input   logic   [N_REGS-1:0] [63:0] int_cfg_regs,
+#    input   logic                       int_rw,
+#    input   logic [ADDR_WIDTH_VIRT-1:0] int_addr_min,
+#    input   logic [ADDR_WIDTH_VIRT-1:0] int_addr_max,
+#    input   logic                       multi_hit_allow,
+#    output  logic                       multi_hit,
+#    output  logic        [N_SLICES-1:0] prot,
+#    output  logic        [N_SLICES-1:0] hit,
+#    output  logic                       cache_coherent,
+#    output  logic [ADDR_WIDTH_PHYS-1:0] out_addr
+#  );
+#
+
+
+class slice_top(Elaboratable):
+
+    def __init__(self):
+        # FIXME self.int_cfg_regs = Signal()  # input
+        self.params = coreconfig.CoreConfig() # rename ?
+        self.int_rw = Signal()  # input
+        self.int_addr_min = Signal(self.params.ADDR_WIDTH_VIRT)  # input
+        self.int_addr_max = Signal(self.params.ADDR_WIDTH_VIRT)  # input
+        self.multi_hit_allow = Signal()  # input
+        self.multi_hit = Signal()  # output
+        self.prot = Signal(self.params.N_SLICES)  # output
+        self.hit = Signal(self.params.N_SLICES)  # output
+        self.cache_coherent = Signal()  # output
+        self.out_addr = Signal(self.params.ADDR_WIDTH_PHYS)  # output
+
+    def elaborate(self, platform=None):
+        m = Module()
+
+        first_hit = Signal()
+
+        for i in range(self.params.N_SLICES):
+            # TODO pass params / core config here
+            u_slice = rab_slice.rab_slice(self.params)
+            setattr(m.submodules, "u_slice%d" % i, u_slice)
+            # TODO set param and connect ports
+
+        # In case of a multi hit, the lowest slice with a hit is selected.
+        # TODO always_comb begin : HIT_CHECK
+        m.d.comb += [
+            first_hit.eq(0),
+            self.multi_hit.eq(0),
+            self.out_addr.eq(0),
+            self.cache_coherent.eq(0)]
+
+        for j in range(self.params.N_SLICES):
+            with m.If(self.hit[j] == 1):
+                with m.If(first_hit == 1):
+                    with m.If(self.multi_hit_allow == 0):
+                        m.d.comb += [self.multi_hit.eq(1)]
+                with m.Elif(first_hit == 1):
+                    m.d.comb += [first_hit.eq(1)
+                                 # only output first slice that was hit
+                                 # SV self.out_addr.eq(slice_out_addr[ADDR_WIDTH_PHYS*j + : ADDR_WIDTH_PHYS]),
+                                 # SV self.cache_coherent.eq(int_cfg_regs[4*j+3][3]),
+                                 ]
+        return m
+
+  # TODO translate generate statement
+
+
+"""
+  logic [ADDR_WIDTH_PHYS*N_SLICES-1:0]  slice_out_addr;
+
+  generate
+    for ( i=0; i<N_SLICES; i++ )
+      begin
+        rab_slice
+          #(
+            .ADDR_WIDTH_PHYS ( ADDR_WIDTH_PHYS ),
+            .ADDR_WIDTH_VIRT ( ADDR_WIDTH_VIRT )
+            )
+          u_slice
+          (
+            .cfg_min       ( int_cfg_regs[4*i]  [ADDR_WIDTH_VIRT-1:0]                              ),
+            .cfg_max       ( int_cfg_regs[4*i+1][ADDR_WIDTH_VIRT-1:0]                              ),
+            .cfg_offset    ( int_cfg_regs[4*i+2][ADDR_WIDTH_PHYS-1:0]                              ),
+            .cfg_wen       ( int_cfg_regs[4*i+3][2]                                                ),
+            .cfg_ren       ( int_cfg_regs[4*i+3][1]                                                ),
+            .cfg_en        ( int_cfg_regs[4*i+3][0]                                                ),
+            .in_trans_type ( int_rw                                                                ),
+            .in_addr_min   ( int_addr_min                                                          ),
+            .in_addr_max   ( int_addr_max                                                          ),
+            .out_addr      ( slice_out_addr[ADDR_WIDTH_PHYS*i+ADDR_WIDTH_PHYS-1:ADDR_WIDTH_PHYS*i] ),
+            .out_prot      ( prot[i]                                                               ),
+            .out_hit       ( hit[i]                                                                )
+          );
+     end
+  endgenerate
+
+  // In case of a multi hit, the lowest slice with a hit is selected.
+  always_comb begin : HIT_CHECK
+    first_hit      =  0;
+    multi_hit      =  0;
+    out_addr       = '0;
+    cache_coherent =  0;
+    for (j = 0; j < N_SLICES; j++) begin
+      if (hit[j] == 1'b1) begin
+        if (first_hit == 1'b1) begin
+          if (multi_hit_allow == 1'b0) begin
+            multi_hit = 1'b1;
+          end
+        end else begin
+          first_hit       = 1'b1;
+          out_addr        = slice_out_addr[ADDR_WIDTH_PHYS*j +: ADDR_WIDTH_PHYS];
+          cache_coherent  = int_cfg_regs[4*j+3][3];
+        end
+      end
+    end
+  end
+"""
+
+# sv 2 migen: TODO add translate code for generate statements and for loops inside always_comb
diff --git a/unused_please_ignore_completely/iommu/axi_rab/test/__init__.py b/unused_please_ignore_completely/iommu/axi_rab/test/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/unused_please_ignore_completely/iommu/axi_rab/test/test_ram_tp_no_change.py b/unused_please_ignore_completely/iommu/axi_rab/test/test_ram_tp_no_change.py
new file mode 100644 (file)
index 0000000..8d23ef0
--- /dev/null
@@ -0,0 +1,18 @@
+from ram_tp_write_first import ram_tp_write_first
+from nmigen.compat.sim import run_simulation
+import sys
+sys.path.append("../")
+
+
+def tbench(dut):
+    yield dut.we.eq(1)
+    for i in range(0, 255):
+        yield dut.addr0.eq(i)
+        yield dut.d_i.eq(i)
+        yield
+
+
+if __name__ == "__main__":
+    dut = ram_tp_write_first()
+    run_simulation(dut, tbench(dut), vcd_name="ram_tp_write_first.vcd")
+    print("ram_tp_write_first Unit Test Success")
diff --git a/unused_please_ignore_completely/iommu/axi_rab/test/test_slice_top.py b/unused_please_ignore_completely/iommu/axi_rab/test/test_slice_top.py
new file mode 100644 (file)
index 0000000..c234b90
--- /dev/null
@@ -0,0 +1,14 @@
+from nmigen.compat.sim import run_simulation
+import sys
+sys.path.append("../")
+# sys.path.append("../../../TestUtil")
+from slice_top import slice_top
+
+def tbench(dut):
+    yield
+
+
+if __name__ == "__main__":
+    dut = slice_top()
+    run_simulation(dut, tbench(dut), vcd_name="test_slice_top.vcd")
+    print("slice_top Unit Test Success")
diff --git a/unused_please_ignore_completely/simulator/__init__.py b/unused_please_ignore_completely/simulator/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/unused_please_ignore_completely/simulator/internalop_sim.py b/unused_please_ignore_completely/simulator/internalop_sim.py
new file mode 100644 (file)
index 0000000..f7a4b26
--- /dev/null
@@ -0,0 +1,196 @@
+from soc.decoder.power_enums import (Function, Form, InternalOp,
+                                     In1Sel, In2Sel, In3Sel, OutSel,
+                                     RC, LdstLen, CryIn, get_csv,
+                                     single_bit_flags,
+                                     get_signal_name, default_values)
+import math
+
+
+class MemorySim:
+    def __init__(self, bytes_per_word=8):
+        self.mem = {}
+        self.bytes_per_word = bytes_per_word
+        self.word_log2 = math.ceil(math.log2(bytes_per_word))
+
+    def _get_shifter_mask(self, width, remainder):
+        shifter = ((self.bytes_per_word - width) - remainder) * \
+            8  # bits per byte
+        mask = (1 << (width * 8)) - 1
+        return shifter, mask
+
+    # TODO: Implement ld/st of lesser width
+    def ld(self, address, width=8):
+        remainder = address & (self.bytes_per_word - 1)
+        address = address >> self.word_log2
+        assert remainder & (width - 1) == 0, "Unaligned access unsupported!"
+        if address in self.mem:
+            val = self.mem[address]
+        else:
+            val = 0
+
+        if width != self.bytes_per_word:
+            shifter, mask = self._get_shifter_mask(width, remainder)
+            val = val & (mask << shifter)
+            val >>= shifter
+        print("Read {:x} from addr {:x}".format(val, address))
+        return val
+
+    def st(self, address, value, width=8):
+        remainder = address & (self.bytes_per_word - 1)
+        address = address >> self.word_log2
+        assert remainder & (width - 1) == 0, "Unaligned access unsupported!"
+        print("Writing {:x} to addr {:x}".format(value, address))
+        if width != self.bytes_per_word:
+            if address in self.mem:
+                val = self.mem[address]
+            else:
+                val = 0
+            shifter, mask = self._get_shifter_mask(width, remainder)
+            val &= ~(mask << shifter)
+            val |= value << shifter
+            self.mem[address] = val
+        else:
+            self.mem[address] = value
+
+
+class RegFile:
+    def __init__(self):
+        self.regfile = [0] * 32
+        self.sprs = {}
+
+    def write_reg(self, regnum, value):
+        all1s = (1 << 64)-1  # 64 bits worth of 1s
+        value &= all1s
+        print("Writing {:x} to reg r{}".format(value, regnum))
+        self.regfile[regnum] = value
+
+    def read_reg(self, regnum):
+        val = self.regfile[regnum]
+        print("Read {:x} from reg r{}".format(val, regnum))
+        return val
+
+    def assert_gpr(self, gpr, val):
+        reg_val = self.read_reg(gpr)
+        msg = "reg r{} got {:x}, expecting {:x}".format(
+            gpr, reg_val, val)
+        assert reg_val == val, msg
+
+    def assert_gprs(self, gprs):
+        for k, v in list(gprs.items()):
+            self.assert_gpr(k, v)
+
+    def set_xer(self, result, operanda, operandb):
+        xer = 0
+        if result & 1 << 64:
+            xer |= XER.CA
+
+        self.xer = xer
+
+
+class InternalOpSimulator:
+    def __init__(self):
+        self.mem_sim = MemorySim()
+        self.regfile = RegFile()
+
+    def execute_alu_op(self, op1, op2, internal_op, carry=0):
+        print(internal_op)
+        if internal_op == InternalOp.OP_ADD.value:
+            return op1 + op2 + carry
+        elif internal_op == InternalOp.OP_AND.value:
+            return op1 & op2
+        elif internal_op == InternalOp.OP_OR.value:
+            return op1 | op2
+        elif internal_op == InternalOp.OP_MUL_L64.value:
+            return op1 * op2
+        else:
+            assert False, "Not implemented"
+
+    def update_cr0(self, result):
+        if result == 0:
+            self.cr0 = 0b001
+        elif result >> 63:
+            self.cr0 = 0b100
+        else:
+            self.cr0 = 0b010
+        print("update_cr0", self.cr0)
+
+    def alu_op(self, pdecode2):
+        all1s = (1 << 64)-1  # 64 bits worth of 1s
+        internal_op = yield pdecode2.dec.op.internal_op
+        operand1 = 0
+        operand2 = 0
+        result = 0
+        carry = 0
+        r1_ok = yield pdecode2.e.read_reg1.ok
+        r2_ok = yield pdecode2.e.read_reg2.ok
+        r3_ok = yield pdecode2.e.read_reg3.ok
+        imm_ok = yield pdecode2.e.imm_data.ok
+        if r1_ok:
+            r1_sel = yield pdecode2.e.read_reg1.data
+            operand1 = self.regfile.read_reg(r1_sel)
+        elif r3_ok:
+            r3_sel = yield pdecode2.e.read_reg3.data
+            operand1 = self.regfile.read_reg(r3_sel)
+        if r2_ok:
+            r2_sel = yield pdecode2.e.read_reg2.data
+            operand2 = self.regfile.read_reg(r2_sel)
+        if imm_ok:
+            operand2 = yield pdecode2.e.imm_data.data
+
+        inv_a = yield pdecode2.dec.op.inv_a
+        if inv_a:
+            operand1 = (~operand1) & all1s
+
+        cry_in = yield pdecode2.dec.op.cry_in
+        if cry_in == CryIn.ONE.value:
+            carry = 1
+        elif cry_in == CryIn.CA.value:
+            carry = self.carry_out
+
+        # TODO rc_sel = yield pdecode2.dec.op.rc_sel
+        result = self.execute_alu_op(operand1, operand2, internal_op,
+                                     carry=carry)
+
+        cry_out = yield pdecode2.dec.op.cry_out
+        rc = yield pdecode2.e.rc.data
+
+        if rc:
+            self.update_cr0(result)
+        if cry_out == 1:
+            self.carry_out = (result >> 64)
+            print("setting carry_out", self.carry_out)
+
+        ro_ok = yield pdecode2.e.write_reg.ok
+        if ro_ok:
+            ro_sel = yield pdecode2.e.write_reg.data
+            self.regfile.write_reg(ro_sel, result)
+
+    def mem_op(self, pdecode2):
+        internal_op = yield pdecode2.dec.op.internal_op
+        addr_reg = yield pdecode2.e.read_reg1.data
+        addr = self.regfile.read_reg(addr_reg)
+
+        imm_ok = yield pdecode2.e.imm_data.ok
+        r2_ok = yield pdecode2.e.read_reg2.ok
+        width = yield pdecode2.e.data_len
+        if imm_ok:
+            imm = yield pdecode2.e.imm_data.data
+            addr += imm
+        elif r2_ok:
+            r2_sel = yield pdecode2.e.read_reg2.data
+            addr += self.regfile.read_reg(r2_sel)
+        if internal_op == InternalOp.OP_STORE.value:
+            val_reg = yield pdecode2.e.read_reg3.data
+            val = self.regfile.read_reg(val_reg)
+            self.mem_sim.st(addr, val, width)
+        elif internal_op == InternalOp.OP_LOAD.value:
+            dest_reg = yield pdecode2.e.write_reg.data
+            val = self.mem_sim.ld(addr, width)
+            self.regfile.write_reg(dest_reg, val)
+
+    def execute_op(self, pdecode2):
+        function = yield pdecode2.dec.op.function_unit
+        if function == Function.ALU.value:
+            yield from self.alu_op(pdecode2)
+        elif function == Function.LDST.value:
+            yield from self.mem_op(pdecode2)