add license and copyright header to dcache.py, master
author    Luke Kenneth Casson Leighton <lkcl@lkcl.net>
          Tue, 25 Jan 2022 00:43:56 +0000 (00:43 +0000)
committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>
          Tue, 25 Jan 2022 00:43:56 +0000 (00:43 +0000)
extracted authors from git history for the file, but made sure
to credit the original dcache.vhdl as coming from microwatt,
with its license being CC 4.0
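
For reference, the kind of header this commit describes might look like
the sketch below. This is an illustration only: the SPDX tag, years and
exact wording in dcache.py are assumptions, not a copy of the real header.

    # SPDX-License-Identifier: LGPL-3.0-or-later   # assumed repo licence
    # Copyright (C) 2020-2022 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
    # (further per-author lines as extracted from "git log --follow dcache.py")
    #
    # based on microwatt dcache.vhdl, which is licensed CC 4.0:
    # https://github.com/antonblanchard/microwatt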

307 files changed:
.gitignore
.gitlab-ci.yml
Makefile
flake.lock [new file with mode: 0644]
flake.nix [new file with mode: 0644]
mkpinmux.sh
nix/bigfloat.nix [new file with mode: 0644]
nix/c4m-jtag.nix [new file with mode: 0644]
nix/ecp5-program.nix [new file with mode: 0644]
nix/ecp5.nix [new file with mode: 0644]
nix/ieee754fpu.nix [new file with mode: 0644]
nix/litex.toml [new file with mode: 0644]
nix/ls180.nix [new file with mode: 0644]
nix/modgrammar.nix [new file with mode: 0644]
nix/nmutil.nix [new file with mode: 0644]
nix/openpower-isa.nix [new file with mode: 0644]
nix/pinmux.nix [new file with mode: 0644]
nix/soc.nix [new file with mode: 0644]
nix/verilog.nix [new file with mode: 0644]
pinmux
src/soc/config/ifetch.py
src/soc/config/pinouts.py
src/soc/config/test/test_fetch.py
src/soc/config/test/test_loadstore.py
src/soc/config/test/test_pi2ls.py
src/soc/debug/dmi.py
src/soc/debug/jtagutils.py
src/soc/debug/test/test_jtag_tap.py
src/soc/debug/test/test_jtag_tap_srv.py
src/soc/experiment/alu_fsm.py
src/soc/experiment/alu_hier.py
src/soc/experiment/compalu.py
src/soc/experiment/compalu_multi.py
src/soc/experiment/compldst_multi.py
src/soc/experiment/cscore.py
src/soc/experiment/dcache.py
src/soc/experiment/formal/proof_alu_fsm.py
src/soc/experiment/icache.py
src/soc/experiment/imem.py
src/soc/experiment/l0_cache.py
src/soc/experiment/lsmem.py
src/soc/experiment/mmu.py
src/soc/experiment/pi2ls.py
src/soc/experiment/pimem.py
src/soc/experiment/plru.py
src/soc/experiment/radix_walk_example.txt
src/soc/experiment/score6600.py
src/soc/experiment/score6600_multi.py
src/soc/experiment/sim.py
src/soc/experiment/test/pagetables.py [new file with mode: 0644]
src/soc/experiment/test/test_compalu_multi.py
src/soc/experiment/test/test_compldst_multi.py
src/soc/experiment/test/test_compldst_multi_mmu.py [new file with mode: 0644]
src/soc/experiment/test/test_compldst_multi_mmu_fsm.py [new file with mode: 0644]
src/soc/experiment/test/test_dcache.py
src/soc/experiment/test/test_dcache_tlb.py
src/soc/experiment/test/test_dcbz_pi.py [new file with mode: 0644]
src/soc/experiment/test/test_l0_cache_buffer2.py
src/soc/experiment/test/test_ldst_pi.py
src/soc/experiment/test/test_ldst_pi_misalign.py
src/soc/experiment/test/test_loadstore1.py [new file with mode: 0644]
src/soc/experiment/test/test_mmu_dcache.py
src/soc/experiment/test/test_mmu_dcache_pi.py
src/soc/experiment/test/test_wishbone.py [new file with mode: 0644]
src/soc/fu/alu/formal/proof_input_stage.py
src/soc/fu/alu/formal/proof_main_stage.py
src/soc/fu/alu/formal/proof_output_stage.py
src/soc/fu/alu/main_stage.py
src/soc/fu/alu/output_stage.py
src/soc/fu/alu/pipe_data.py
src/soc/fu/alu/pipeline.py
src/soc/fu/alu/test/test_pipe_caller.py
src/soc/fu/branch/formal/proof_input_stage.py
src/soc/fu/branch/formal/proof_main_stage.py
src/soc/fu/branch/test/test_pipe_caller.py
src/soc/fu/common_output_stage.py
src/soc/fu/compunits/compunits.py
src/soc/fu/compunits/formal/proof_fu.py
src/soc/fu/compunits/formal/test_compunit.py
src/soc/fu/compunits/test/test_compunit.py
src/soc/fu/compunits/test/test_div_compunit.py
src/soc/fu/cr/formal/proof_main_stage.py
src/soc/fu/cr/test/test_pipe_caller.py
src/soc/fu/div/core_stages.py
src/soc/fu/div/fsm.py
src/soc/fu/div/output_stage.py
src/soc/fu/div/pipe_data.py
src/soc/fu/div/setup_stage.py
src/soc/fu/div/test/helper.py
src/soc/fu/div/test/test_pipe_ilang.py
src/soc/fu/ldst/ldst_input_record.py
src/soc/fu/ldst/loadstore.py
src/soc/fu/ldst/pipe_data.py
src/soc/fu/logical/formal/proof_input_stage.py
src/soc/fu/logical/formal/proof_main_stage.py
src/soc/fu/logical/main_stage.py
src/soc/fu/logical/output_stage.py
src/soc/fu/logical/test/test_pipe_caller.py
src/soc/fu/mmu/fsm.py
src/soc/fu/mmu/mmu_input_record.py
src/soc/fu/mmu/pipe_data.py
src/soc/fu/mmu/test/test_issuer_mmu_data_path.py
src/soc/fu/mmu/test/test_non_production_core.py
src/soc/fu/mmu/test/test_pipe_caller.py
src/soc/fu/mul/formal/proof_main_stage.py
src/soc/fu/mul/main_stage.py
src/soc/fu/mul/post_stage.py
src/soc/fu/mul/pre_stage.py
src/soc/fu/mul/test/helper.py
src/soc/fu/mul/test/test_pipe_caller_long.py
src/soc/fu/mul/test/test_pipe_ilang.py
src/soc/fu/pipe_data.py
src/soc/fu/regspec.py
src/soc/fu/shift_rot/formal/proof_main_stage.py
src/soc/fu/shift_rot/main_stage.py
src/soc/fu/shift_rot/test/test_pipe_caller.py
src/soc/fu/spr/formal/proof_main_stage.py
src/soc/fu/spr/main_stage.py
src/soc/fu/spr/pipe_data.py
src/soc/fu/spr/test/test_pipe_caller.py
src/soc/fu/trap/formal/proof_main_stage.py
src/soc/fu/trap/main_stage.py
src/soc/fu/trap/pipe_data.py
src/soc/fu/trap/test/test_pipe_caller.py
src/soc/fu/trap/trap_input_record.py
src/soc/interrupts/xics.py
src/soc/litex/florent
src/soc/minerva/units/fetch.py
src/soc/minerva/units/loadstore.py
src/soc/minerva/wishbone.py
src/soc/regfile/regfile.py
src/soc/regfile/regfiles.py
src/soc/regfile/virtual_port.py
src/soc/scoreboard/addr_match.py
src/soc/scoreboard/addr_split.py
src/soc/scoreboard/dependence_cell.py
src/soc/scoreboard/fn_unit.py
src/soc/scoreboard/fu_fu_matrix.py
src/soc/scoreboard/fu_mem_matrix.py
src/soc/scoreboard/fu_reg_matrix.py
src/soc/scoreboard/fu_wr_pending.py
src/soc/scoreboard/group_picker.py
src/soc/scoreboard/instruction_q.py
src/soc/scoreboard/ldst_matrix.py
src/soc/scoreboard/mdm.py
src/soc/scoreboard/mem_dependence_cell.py
src/soc/scoreboard/mem_fu_matrix.py
src/soc/scoreboard/memfu.py
src/soc/scoreboard/reg_select.py
src/soc/scoreboard/shadow.py
src/soc/scoreboard/test_iq.py
src/soc/scoreboard/test_mem2_fu_matrix.py
src/soc/scoreboard/test_mem_fu_matrix.py
src/soc/simple/core.py
src/soc/simple/core_data.py [new file with mode: 0644]
src/soc/simple/inorder.py [new file with mode: 0644]
src/soc/simple/issuer.py
src/soc/simple/issuer_verilog.py
src/soc/simple/test/test_core.py
src/soc/simple/test/test_issuer.py
src/soc/simple/test/test_issuer_dcache.py [new file with mode: 0644]
src/soc/simple/test/test_issuer_linux_5_7.py [new file with mode: 0644]
src/soc/simple/test/test_issuer_mmu.py
src/soc/simple/test/test_issuer_mmu_ifetch.py [new file with mode: 0644]
src/soc/simple/test/test_issuer_mmu_microwatt.py [new file with mode: 0644]
src/soc/simple/test/test_microwatt.py
src/soc/simple/test/test_runner.py
src/soc/simple/test/teststate.py [new file with mode: 0644]
src/unused/TLB/.gitignore [deleted file]
src/unused/TLB/AddressEncoder.py [deleted file]
src/unused/TLB/Cam.py [deleted file]
src/unused/TLB/CamEntry.py [deleted file]
src/unused/TLB/LFSR.py [deleted file]
src/unused/TLB/LFSR.pyi [deleted file]
src/unused/TLB/Makefile [deleted file]
src/unused/TLB/MemorySet.py [deleted file]
src/unused/TLB/PermissionValidator.py [deleted file]
src/unused/TLB/PteEntry.py [deleted file]
src/unused/TLB/SetAssociativeCache.py [deleted file]
src/unused/TLB/TLB.py [deleted file]
src/unused/TLB/__init__.py [deleted file]
src/unused/TLB/ariane/TreePLRU.cpp [deleted file]
src/unused/TLB/ariane/__init__.py [deleted file]
src/unused/TLB/ariane/exceptcause.py [deleted file]
src/unused/TLB/ariane/miss_handler.py [deleted file]
src/unused/TLB/ariane/mmu.py [deleted file]
src/unused/TLB/ariane/p_lru.txt [deleted file]
src/unused/TLB/ariane/plru.py [deleted file]
src/unused/TLB/ariane/ptw.py [deleted file]
src/unused/TLB/ariane/test/__init__.py [deleted file]
src/unused/TLB/ariane/test/test_plru.py [deleted file]
src/unused/TLB/ariane/test/test_ptw.py [deleted file]
src/unused/TLB/ariane/test/test_tlb.py [deleted file]
src/unused/TLB/ariane/test/test_tlb_content.py [deleted file]
src/unused/TLB/ariane/tlb.py [deleted file]
src/unused/TLB/ariane/tlb_content.py [deleted file]
src/unused/TLB/test/__init__.py [deleted file]
src/unused/TLB/test/test_LFSR2.py [deleted file]
src/unused/TLB/test/test_address_encoder.py [deleted file]
src/unused/TLB/test/test_cam.py [deleted file]
src/unused/TLB/test/test_cam_entry.py [deleted file]
src/unused/TLB/test/test_permission_validator.py [deleted file]
src/unused/TLB/test/test_pte_entry.py [deleted file]
src/unused/TLB/test/test_set_associative_cache.py [deleted file]
src/unused/TLB/test/test_tlb.py [deleted file]
src/unused/__init__.py [deleted file]
src/unused/experiment/l0_cache.py [deleted file]
src/unused/iommu/__init__.py [deleted file]
src/unused/iommu/axi_rab/__init__.py [deleted file]
src/unused/iommu/axi_rab/axi4_ar_buffer.py [deleted file]
src/unused/iommu/axi_rab/axi4_ar_sender.py [deleted file]
src/unused/iommu/axi_rab/axi4_aw_buffer.py [deleted file]
src/unused/iommu/axi_rab/axi4_aw_sender.py [deleted file]
src/unused/iommu/axi_rab/axi4_b_buffer.py [deleted file]
src/unused/iommu/axi_rab/axi4_b_sender.py [deleted file]
src/unused/iommu/axi_rab/axi4_r_buffer.py [deleted file]
src/unused/iommu/axi_rab/axi4_r_sender.py [deleted file]
src/unused/iommu/axi_rab/axi4_w_buffer.py [deleted file]
src/unused/iommu/axi_rab/axi4_w_sender.py [deleted file]
src/unused/iommu/axi_rab/axi_buffer_rab.py [deleted file]
src/unused/iommu/axi_rab/axi_buffer_rab_bram.py [deleted file]
src/unused/iommu/axi_rab/axi_rab_cfg.py [deleted file]
src/unused/iommu/axi_rab/axi_rab_top.py [deleted file]
src/unused/iommu/axi_rab/check_ram.py [deleted file]
src/unused/iommu/axi_rab/coreconfig.py [deleted file]
src/unused/iommu/axi_rab/fsm.py [deleted file]
src/unused/iommu/axi_rab/l2_tlb.py [deleted file]
src/unused/iommu/axi_rab/rab_core.py [deleted file]
src/unused/iommu/axi_rab/rab_slice.py [deleted file]
src/unused/iommu/axi_rab/ram_tp_no_change.py [deleted file]
src/unused/iommu/axi_rab/ram_tp_write_first.py [deleted file]
src/unused/iommu/axi_rab/slice_top.py [deleted file]
src/unused/iommu/axi_rab/test/__init__.py [deleted file]
src/unused/iommu/axi_rab/test/test_ram_tp_no_change.py [deleted file]
src/unused/iommu/axi_rab/test/test_slice_top.py [deleted file]
src/unused/simulator/__init__.py [deleted file]
src/unused/simulator/internalop_sim.py [deleted file]
src/unused/simulator/test_sim.py [deleted file]
unused_please_ignore_completely/TLB/.gitignore [new file with mode: 0644]
unused_please_ignore_completely/TLB/AddressEncoder.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/Cam.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/CamEntry.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/LFSR.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/LFSR.pyi [new file with mode: 0644]
unused_please_ignore_completely/TLB/Makefile [new file with mode: 0644]
unused_please_ignore_completely/TLB/MemorySet.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/PermissionValidator.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/PteEntry.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/SetAssociativeCache.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/TLB.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/TreePLRU.cpp [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/exceptcause.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/miss_handler.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/mmu.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/p_lru.txt [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/plru.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/ptw.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/test/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/test/test_plru.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/test/test_ptw.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/test/test_tlb.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/test/test_tlb_content.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/tlb.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/ariane/tlb_content.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_LFSR2.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_address_encoder.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_cam.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_cam_entry.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_permission_validator.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_pte_entry.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_set_associative_cache.py [new file with mode: 0644]
unused_please_ignore_completely/TLB/test/test_tlb.py [new file with mode: 0644]
unused_please_ignore_completely/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/experiment/l0_cache.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_ar_buffer.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_ar_sender.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_aw_buffer.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_aw_sender.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_b_buffer.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_b_sender.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_r_buffer.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_r_sender.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_w_buffer.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi4_w_sender.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi_buffer_rab.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi_buffer_rab_bram.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi_rab_cfg.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/axi_rab_top.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/check_ram.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/coreconfig.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/fsm.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/l2_tlb.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/rab_core.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/rab_slice.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/ram_tp_no_change.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/ram_tp_write_first.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/slice_top.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/test/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/test/test_ram_tp_no_change.py [new file with mode: 0644]
unused_please_ignore_completely/iommu/axi_rab/test/test_slice_top.py [new file with mode: 0644]
unused_please_ignore_completely/simulator/__init__.py [new file with mode: 0644]
unused_please_ignore_completely/simulator/internalop_sim.py [new file with mode: 0644]

index d48dc7ff1f94bf7d776c91bb0774f1be6d66c98f..3478b52cb0cb70cc0f8c1db298f740628cef3894 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,7 @@ Waveforms
 *.il
 **/*.gtkw
 .eggs
-
+formal_test_temp
 .vscode/*
 build
 gen
index 5f359a060a4ae697c18535b1d6379f52556b1f22..c3ca516fc2695aced2e4d186b000f0a2694f399e 100644 (file)
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -5,10 +5,12 @@ cache:
         - ccache
         - .cache/pip
         - apt-cache
+    when: 'always'
 
 variables:
     PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
     GIT_SUBMODULE_STRATEGY: recursive
+    GIT_DEPTH: 500
 
 build:
     stage: build
@@ -111,7 +113,7 @@ build:
         - pip install dist/sfpy*.whl
         - popd
 
-        - cargo install maturin
+        - python3 -m pip install 'maturin>=0.11,<0.12'
         - git clone --depth 1 https://git.libre-soc.org/git/power-instruction-analyzer.git pia
         - pushd pia
         - maturin build --cargo-extra-args=--features=python-extension
@@ -119,4 +121,4 @@ build:
         - popd
 
         - python setup.py develop
-        - nosetests -v --processes=-1 --process-timeout=120
+        - nosetests -v --processes=-1 --process-timeout=120 -w src/
index 909a463a0d605f97d2c931c5e65015ca1d1b1406..3d4ea62db5a779f896d1f59665014783681f0523 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -11,15 +11,15 @@ mkpinmux:
        cp pinmux/ls180/ls180_pins.py src/soc/debug
        cp pinmux/ls180/ls180_pins.py src/soc/litex/florent/libresoc
 
-install: gitupdate develop mkpinmux svanalysis
+install: gitupdate develop mkpinmux
 
 # this is now actually part of openpower-isa repository
 pywriter:
-       pywriter
+       echo "pywriter is part of openpower-isa, run that instead"
 
 # this is now actually part of openpower-isa repository
 svanalysis:
-       svanalysis
+       echo "sv_analysis is part of openpower-isa, run that instead"
 
 develop:
        python3 setup.py develop # yes, develop, not install
diff --git a/flake.lock b/flake.lock
new file mode 100644 (file)
index 0000000..8193dbc
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,131 @@
+{
+  "nodes": {
+    "c4m-jtag": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1619101523,
+        "narHash": "sha256-y1OY8URcE1lnu5L7IDFcJ8zT8sqlrfMP9VPNmVvACGk=",
+        "ref": "master",
+        "rev": "c2bf4810f9f91ced7fcda777b92b86ab353da288",
+        "revCount": 146,
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/c4m-jtag.git"
+      },
+      "original": {
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/c4m-jtag.git"
+      }
+    },
+    "migen": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1631614362,
+        "narHash": "sha256-BgYf4e7O/rbS5P1ZpDlcgCEUh2h2vK3FyHADdzyaMg0=",
+        "owner": "m-labs",
+        "repo": "migen",
+        "rev": "7bc4eb1387b39159a74c1dbd1b820728e0bfbbaa",
+        "type": "github"
+      },
+      "original": {
+        "owner": "m-labs",
+        "repo": "migen",
+        "type": "github"
+      }
+    },
+    "nix-litex": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1632150297,
+        "narHash": "sha256-ghlAJBZxLVkQB+9tXEOBOF1FfdT5Pn4292khF4iKCNA=",
+        "ref": "main",
+        "rev": "5ab6984eb1efad0c91d808c9b7b79e00e50ccc05",
+        "revCount": 31,
+        "type": "git",
+        "url": "https://git.sr.ht/~lschuermann/nix-litex"
+      },
+      "original": {
+        "ref": "main",
+        "type": "git",
+        "url": "https://git.sr.ht/~lschuermann/nix-litex"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1631723418,
+        "narHash": "sha256-Sbey1S81fXUKcEHVCMwlXMju/IoCQxMwP1PPkVYpGrc=",
+        "owner": "L-as",
+        "repo": "nixpkgs",
+        "rev": "8bfc1026477692b933df6eeec27bd494cac3e436",
+        "type": "github"
+      },
+      "original": {
+        "owner": "L-as",
+        "ref": "libresoc",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "nmigen": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1618220900,
+        "narHash": "sha256-Ol2SMZLUTikZWDLmK7F5lZuKBfGO71WmisATPNMTpHQ=",
+        "ref": "master",
+        "rev": "d824795c2c7cb43dcbc8ed8fac6d309d77284913",
+        "revCount": 1056,
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/nmigen.git"
+      },
+      "original": {
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/nmigen.git"
+      }
+    },
+    "nmigen-soc": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1601572554,
+        "narHash": "sha256-v9SH+KuIPydXCr363RUsMg9/tabuu+GjKPJOKq2Jze0=",
+        "ref": "master",
+        "rev": "692017c7eaf21ff37302790c4422db6bd08667be",
+        "revCount": 48,
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/nmigen-soc.git"
+      },
+      "original": {
+        "type": "git",
+        "url": "https://git.libre-soc.org/git/nmigen-soc.git"
+      }
+    },
+    "root": {
+      "inputs": {
+        "c4m-jtag": "c4m-jtag",
+        "migen": "migen",
+        "nix-litex": "nix-litex",
+        "nixpkgs": "nixpkgs",
+        "nmigen": "nmigen",
+        "nmigen-soc": "nmigen-soc",
+        "yosys": "yosys"
+      }
+    },
+    "yosys": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1617979565,
+        "narHash": "sha256-M8ppe+lL/pgd2sXh7bM6/sbk1099KKECeWA5mXtqE6Y=",
+        "owner": "YosysHQ",
+        "repo": "yosys",
+        "rev": "a58571d0fe8971cb7d3a619a31b2c21be6d75bac",
+        "type": "github"
+      },
+      "original": {
+        "owner": "YosysHQ",
+        "repo": "yosys",
+        "rev": "a58571d0fe8971cb7d3a619a31b2c21be6d75bac",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644 (file)
index 0000000..90a976c
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,98 @@
+{
+  description = "FOSS CPU/GPU/VPU/SoC all in one, see https://libre-soc.org/";
+
+  inputs.nixpkgs.url = "github:L-as/nixpkgs?ref=libresoc"; # for alliance and migen
+  inputs.c4m-jtag.url = "git+https://git.libre-soc.org/git/c4m-jtag.git";
+  inputs.c4m-jtag.flake = false;
+  inputs.nmigen.url = "git+https://git.libre-soc.org/git/nmigen.git";
+  inputs.nmigen.flake = false;
+  inputs.nmigen-soc.url = "git+https://git.libre-soc.org/git/nmigen-soc.git";
+  inputs.nmigen-soc.flake = false;
+  inputs.migen.url = "github:m-labs/migen";
+  inputs.migen.flake = false;
+  inputs.yosys.url = "github:YosysHQ/yosys?rev=a58571d0fe8971cb7d3a619a31b2c21be6d75bac";
+  inputs.yosys.flake = false;
+  # submodules needed
+  inputs.nix-litex.url = "git+https://git.sr.ht/~lschuermann/nix-litex?ref=main";
+  inputs.nix-litex.flake = false;
+
+  outputs = { self, nixpkgs, c4m-jtag, nmigen, nmigen-soc, nix-litex, migen, yosys }:
+    let
+      getv = x: builtins.substring 0 8 x.lastModifiedDate;
+
+      supportedSystems = [ "x86_64-linux" "x86_64-darwin" "aarch64-linux" "aarch64-darwin" ];
+
+      forAllSystems = nixpkgs.lib.genAttrs supportedSystems;
+
+      litex = pkgs: import "${nix-litex}/pkgs" {
+        inherit pkgs;
+        pkgMetas = builtins.fromTOML (builtins.readFile ./nix/litex.toml);
+        skipChecks = true; # FIXME: remove once checks work
+      };
+
+      nixpkgsFor = forAllSystems (system: import nixpkgs { inherit system; overlays = [ self.overlay ]; });
+
+      lib = nixpkgs.lib;
+    in
+    {
+      overlay = final: prev: {
+        python37 = prev.python37.override {
+          packageOverrides = lib.composeExtensions (litex final).pythonOverlay (pfinal: pprev: {
+            libresoc-ieee754fpu = pfinal.callPackage ./nix/ieee754fpu.nix {};
+            libresoc-openpower-isa = pfinal.callPackage ./nix/openpower-isa.nix {};
+            c4m-jtag = pfinal.callPackage (import ./nix/c4m-jtag.nix { src = c4m-jtag; version = getv c4m-jtag; }) {};
+            bigfloat = pfinal.callPackage ./nix/bigfloat.nix {};
+            modgrammar = pfinal.callPackage ./nix/modgrammar.nix {};
+            libresoc-nmutil = pfinal.callPackage ./nix/nmutil.nix {};
+            libresoc-soc = pfinal.callPackage (import ./nix/soc.nix { version = getv self; }) {};
+
+            nmigen-soc = pprev.nmigen-soc.overrideAttrs (_: {
+              doCheck = false;
+              src = nmigen-soc;
+              setuptoolsCheckPhase = "true";
+            });
+
+            nmigen = pprev.nmigen.overrideAttrs (_: {
+              src = nmigen;
+            });
+
+            migen = pprev.migen.overrideAttrs (_: {
+              src = migen;
+            });
+          });
+        };
+
+        yosys = prev.yosys.overrideAttrs (_: {
+          version = "0.9+4052";
+          src = yosys;
+        });
+
+        libresoc-verilog = final.callPackage (import ./nix/verilog.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+        libresoc-ls180 = final.callPackage (import ./nix/ls180.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+        libresoc-ecp5 = final.callPackage (import ./nix/ecp5.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+        libresoc-ecp5-program = final.callPackage (import ./nix/ecp5-program.nix { version = getv self; }) { python3Packages = final.python37Packages; };
+        libresoc-pinmux = final.callPackage (import ./nix/pinmux.nix { version = getv self; }) {};
+      };
+
+      apps = forAllSystems (system: {
+        ecp5 = {
+          type = "app";
+          program = "${nixpkgsFor.${system}.libresoc-ecp5-program}";
+        };
+      });
+      defaultApp = forAllSystems (system: self.apps.${system}.ecp5);
+
+      packages = forAllSystems (system: {
+        soc = nixpkgsFor.${system}.python37Packages.libresoc-soc;
+        verilog = nixpkgsFor.${system}.libresoc-verilog;
+        pinmux = nixpkgsFor.${system}.libresoc-pinmux;
+        ls180 = nixpkgsFor.${system}.libresoc-ls180;
+        ecp5 = nixpkgsFor.${system}.libresoc-ecp5;
+        ecp5-program = nixpkgsFor.${system}.libresoc-ecp5-program;
+        openpower-isa = nixpkgsFor.${system}.python37Packages.libresoc-openpower-isa;
+        debugNixpkgs = nixpkgsFor.${system};
+      });
+
+      defaultPackage = forAllSystems (system: self.packages.${system}.verilog);
+    };
+}
index b122611c5764140fec7bfa6876d366322043f3a9..c98e48044dfcf9019930997720ac5b431be7ac53 100755 (executable)
--- a/mkpinmux.sh
+++ b/mkpinmux.sh
@@ -1,3 +1,5 @@
 #!/bin/sh
 cd pinmux
 python2 src/pinmux_generator.py -v -s ls180 -o ls180
+# temporary - return to older version of pinmux
+#python2 src/pinmux_generator.py -v -s ngi_router -o ngi_router
diff --git a/nix/bigfloat.nix b/nix/bigfloat.nix
new file mode 100644 (file)
index 0000000..4355ef0
--- /dev/null
+++ b/nix/bigfloat.nix
@@ -0,0 +1,21 @@
+{ lib, buildPythonPackage, fetchPypi, gmp, mpfr, six }:
+
+buildPythonPackage rec {
+  pname = "bigfloat";
+  version = "0.4.0";
+
+  buildInputs = [ gmp mpfr ];
+  propagatedBuildInputs = [ six ];
+
+  src = fetchPypi {
+    inherit pname version;
+    sha256 = "WLlr3ocqylmJ0T2C66Os8qoblOIhF91yoWulkRsMDLg=";
+  };
+
+  doCheck = false;
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/bigfloat/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/c4m-jtag.nix b/nix/c4m-jtag.nix
new file mode 100644 (file)
index 0000000..cf301c6
--- /dev/null
+++ b/nix/c4m-jtag.nix
@@ -0,0 +1,24 @@
+{ version, src }:
+
+{ lib, python, buildPythonPackage, nmigen-soc, nmigen, modgrammar, setuptools-scm }:
+
+buildPythonPackage {
+  pname = "c4m-jtag";
+  inherit src version;
+
+  nativeBuildInputs = [ setuptools-scm ];
+  propagatedBuildInputs = [ nmigen-soc nmigen modgrammar ];
+
+  doCheck = false;
+
+  pythonImportsCheck = [ "c4m.nmigen.jtag.tap" ];
+
+  prePatch = ''
+    export SETUPTOOLS_SCM_PRETEND_VERSION=${version}
+  '';
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/libresoc-openpower-isa/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/ecp5-program.nix b/nix/ecp5-program.nix
new file mode 100644 (file)
index 0000000..4d696b2
--- /dev/null
+++ b/nix/ecp5-program.nix
@@ -0,0 +1,24 @@
+{ version }:
+
+{ writeShellScript, openocd, python3Packages, libresoc-ecp5, nextpnr, trellis }:
+
+let
+  pythonWithEnv = python3Packages.python.withPackages (ps: with ps; [
+    requests migen libresoc-soc litex-boards litex litedram liteeth liteiclink litescope litesdcard
+  ]);
+in
+writeShellScript "program-ecp5-libresoc" ''
+  export PATH="${openocd}/bin:${pythonWithEnv}/bin:${trellis}/bin:${nextpnr}/bin:$PATH"
+
+  dir="$(mktemp -d)"
+  pushd "$dir"
+  echo "$dir"
+
+  export PYTHONPATH="${../src/soc/litex/florent}:$PYTHONPATH"
+
+  python ${../src/soc/litex/florent/versa_ecp5.py} --sys-clk-freq=55e6 --load-from ${libresoc-ecp5}
+
+  popd
+  rm -rf "$dir"
+  exit 0
+''
diff --git a/nix/ecp5.nix b/nix/ecp5.nix
new file mode 100644 (file)
index 0000000..1c82ee4
--- /dev/null
+++ b/nix/ecp5.nix
@@ -0,0 +1,40 @@
+{ version }:
+
+{ stdenv, python3Packages, yosys, libresoc-verilog, libresoc-pinmux, pkgsCross
+, nextpnr, trellis }:
+
+stdenv.mkDerivation {
+  pname = "libresoc-versa-ecp5.v";
+  inherit version;
+
+  src = ../src/soc/litex/florent;
+
+  nativeBuildInputs =
+    (with python3Packages; [
+    python libresoc-soc litex-boards litex litedram liteeth liteiclink litescope litesdcard
+    ])
+    ++ [ trellis nextpnr pkgsCross.powernv.buildPackages.gcc ];
+
+  postPatch = ''
+    patchShebangs --build .
+  '';
+
+  configurePhase = "true";
+
+  buildPhase = ''
+    runHook preBuild
+    export PINMUX="$(mktemp -d)"
+    ln -s ${libresoc-pinmux} "$PINMUX/ls180"
+    cp ${libresoc-verilog} libresoc/libresoc.v
+    ./versa_ecp5.py --sys-clk-freq=55e6 --build
+    runHook postBuild
+  '';
+
+  installPhase = ''
+    runHook preInstall
+    mv /build/florent/build/versa_ecp5/gateware/versa_ecp5.svf $out
+    runHook postInstall
+  '';
+
+  fixupPhase = "true";
+}
diff --git a/nix/ieee754fpu.nix b/nix/ieee754fpu.nix
new file mode 100644 (file)
index 0000000..e520437
--- /dev/null
+++ b/nix/ieee754fpu.nix
@@ -0,0 +1,27 @@
+{ lib, buildPythonPackage, libresoc-nmutil, bigfloat, fetchgit }:
+
+buildPythonPackage {
+  pname = "libresoc-ieee754fpu";
+  version = "unstable-2021-06-05";
+
+  src = fetchgit {
+    url = "https://git.libre-soc.org/git/ieee754fpu.git";
+    rev = "c62fa3a7ee95832587d7725729dcdb9a002ae015";
+    sha256 = "wbr1vGFzUlUtBT6IcRsykADYeksiVoq/LacU/dbRQ0o=";
+  };
+
+  propagatedBuildInputs = [ libresoc-nmutil bigfloat ];
+
+  doCheck = false;
+
+  prePatch = ''
+    touch ./src/ieee754/part/__init__.py
+  '';
+
+  pythonImportsCheck = [ "ieee754.part" ];
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/libresoc-ieee754fpu/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/litex.toml b/nix/litex.toml
new file mode 100644 (file)
index 0000000..89317f0
--- /dev/null
+++ b/nix/litex.toml
@@ -0,0 +1,89 @@
+[litex]
+github_user = "enjoy-digital"
+github_repo = "litex"
+git_revision = "42d8fc226a4f4e8dfef104257a95f98eb9b10da7"
+github_archive_nix_hash = "16zb7mci2a09jc5bbr4342pn95iyl84705n566alpx696xk2l0zr"
+
+[litex-boards]
+github_user = "litex-hub"
+github_repo = "litex-boards"
+git_revision = "1781be166aee867421e0d943f6a62c3397524563"
+github_archive_nix_hash = "0ar41ibs6si03iyhcjn3blw1rkdsazn5rsa95ph8v061kg2yjbjh"
+
+[liteeth]
+github_user = "enjoy-digital"
+github_repo = "liteeth"
+git_revision = "64b85e621e740b9b7a9bdb03749758c703fea6e1"
+github_archive_nix_hash = "1gbscl36n6mgaz1y1b27nzhykrhrccl6ls5vp7dd6divpqdf328i"
+
+[litedram]
+github_user = "enjoy-digital"
+github_repo = "litedram"
+git_revision = "ac825e51124e926c67455292cd2b949954fc6f65"
+github_archive_nix_hash = "1acs4kgbsv8pgml1q7709afh46f8mpy8b1nw0p9n8a1zih8ang1r"
+
+[litehyperbus]
+github_user = "litex-hub"
+github_repo = "litehyperbus"
+git_revision = "c4b64d2c992cedf3e03ffdf87f389feb5ddfff52"
+github_archive_nix_hash = "1iwjwzz4wa9zzm6yqa7rkag9igmsawp8wpmkj6fqia20b7xjglnb"
+
+[liteiclink]
+github_user = "enjoy-digital"
+github_repo = "liteiclink"
+git_revision = "efd200fa9e625144131a310fc09fd1fecf1682e6"
+github_archive_nix_hash = "0g643ryfzc6iq0p80rhq116n5w6mh4fv4yg4adyy5i1vy2grlg8s"
+
+[litepcie]
+github_user = "enjoy-digital"
+github_repo = "litepcie"
+git_revision = "0718fd135fc30e0a3598eaf66ce2fcb54b62193c"
+github_archive_nix_hash = "1m3i4hv49438ik4qhdp7rx9nan5rddrqp7nzvya9xfbh7lfc59hl"
+
+[litescope]
+github_user = "enjoy-digital"
+github_repo = "litescope"
+git_revision = "2739d5a069386c8e834c7f660dce9f93dc2b4598"
+github_archive_nix_hash = "08r7dzlmlfs9pmfz4xkf61sal5zy3caby88bcb4993c43nzpw8a3"
+
+[litesdcard]
+github_user = "enjoy-digital"
+github_repo = "litesdcard"
+git_revision = "edee2467fcabc62c4b34e3daa2271a71e52ba09f"
+github_archive_nix_hash = "0n5x9cx61xij0hc61slabxa05pzmw8i5fyg54ydmxi2fl2p5p0rs"
+
+[litespi]
+github_user = "litex-hub"
+github_repo = "litespi"
+git_revision = "c0730ebdb3c976618bf24e9ec04911e7c9934adf"
+github_archive_nix_hash = "015irjdpii514aj4av02pglvvq0wgxkplyy09435crzy9j5i5v04"
+
+[pythondata-misc-tapcfg]
+github_user = "litex-hub"
+github_repo = "pythondata-misc-tapcfg"
+git_revision = "25c97a4a9ff9af85248028fe01e2c65b2e3640ee"
+github_archive_nix_hash = "0zr6d5giqzsjmqpfyf1b25r0y70bj09xjbfinfxcdc6s8cwwwz71"
+
+[pythondata-software-compiler_rt]
+github_user = "litex-hub"
+github_repo = "pythondata-software-compiler_rt"
+git_revision = "7cfcaed2e726027fd622650b58dd77e47c495ee0"
+github_archive_nix_hash = "0b65dj95418j4pjqqkqjq5npnn1ih1789ba9575kxcljgj7r8xb7"
+
+[pythondata-cpu-serv]
+github_user = "litex-hub"
+github_repo = "pythondata-cpu-serv"
+git_revision = "915cdf793395ab48cc52c0225660eb6eeff41133"
+github_archive_nix_hash = "1ndkjhh7r521cc9g63pmjvgvv9sa3s8n2mkdli91nr7ns3q3lxmk"
+
+[litevideo]
+github_user = "enjoy-digital"
+github_repo = "litevideo"
+git_revision = "41f30143075ece3fff5c33a332ed067d1837cbb3"
+github_archive_nix_hash = "06vw4rn8xby8is13275bmkrxlwp3wlznbdqfay78a5m8bp73kypy"
+
+[valentyusb-hw_cdc_eptri]
+github_user = "litex-hub"
+github_repo = "valentyusb"
+git_revision = "a0526ad053c394306ad7a585a7ddd463831ad09d"
+github_archive_nix_hash = "0nad2x5j5rnjyciwm0abxhzng8nrv06ri8g9qdi39zk8n9zy7cmf"
diff --git a/nix/ls180.nix b/nix/ls180.nix
new file mode 100644 (file)
index 0000000..028fbcb
--- /dev/null
+++ b/nix/ls180.nix
@@ -0,0 +1,44 @@
+{ version }:
+
+{ stdenv, python3Packages, yosys, libresoc-verilog, libresoc-pinmux, pkgsCross }:
+
+stdenv.mkDerivation {
+  pname = "libresoc-ls1804k";
+  inherit version;
+
+  src = ../src/soc/litex/florent;
+
+  nativeBuildInputs =
+    (with python3Packages; [
+    python libresoc-soc litex litedram liteeth liteiclink litescope litesdcard
+    ])
+    ++ [ pkgsCross.powernv.buildPackages.gcc ];
+
+  postPatch = ''
+    patchShebangs --build .
+  '';
+
+  configurePhase = "true";
+
+  buildPhase = ''
+    runHook preBuild
+    export PINMUX="$(mktemp -d)"
+    ln -s ${libresoc-pinmux} "$PINMUX/ls180"
+    cp ${libresoc-verilog} libresoc/libresoc.v
+    ./ls180soc.py --build --platform=ls180sram4k --num-srams=2 --srams4k
+    runHook postBuild
+  '';
+
+  installPhase = ''
+    runHook preInstall
+    mkdir $out
+    mv build/ls180sram4k/gateware/ls180sram4k.v $out/ls180.v
+    mv build/ls180sram4k/gateware/mem.init $out
+    mv build/ls180sram4k/gateware/mem_1.init $out
+    mv libresoc/libresoc.v $out
+    mv libresoc/SPBlock_512W64B8W.v $out
+    runHook postInstall
+  '';
+
+  fixupPhase = "true";
+}
diff --git a/nix/modgrammar.nix b/nix/modgrammar.nix
new file mode 100644 (file)
index 0000000..ce0f348
--- /dev/null
+++ b/nix/modgrammar.nix
@@ -0,0 +1,20 @@
+{ lib, buildPythonPackage, fetchFromGitHub }:
+
+buildPythonPackage rec {
+  pname = "modgrammar";
+  version = "unstable-2020-09-20";
+
+  src = fetchFromGitHub {
+    owner = "bloerwald";
+    repo = "modgrammar";
+    rev = "d363ad5a86584e560a8b03cbe11c0168d7610691";
+    sha256 = "SO2qjfEVaJfgbA5HLJYwXlaeUzt5EFoljYQ2SsdDCbc=";
+  };
+
+  doCheck = false;
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/modgrammar/";
+    # license = licenses.bsd; # FIXME: Which BSD?
+  };
+}
diff --git a/nix/nmutil.nix b/nix/nmutil.nix
new file mode 100644 (file)
index 0000000..3489e77
--- /dev/null
+++ b/nix/nmutil.nix
@@ -0,0 +1,21 @@
+{ lib, buildPythonPackage, bigfloat, fetchgit, pyvcd }:
+
+buildPythonPackage {
+  pname = "libresoc-nmutil";
+  version = "unstable-2021-08-24";
+
+  propagatedBuildInputs = [ pyvcd ];
+
+  src = fetchgit {
+    url = "https://git.libre-soc.org/git/nmutil.git";
+    rev = "efda080db6978d249a23003bec734f1cc07de329";
+    sha256 = "nTgUiZc4CC0VoUND29kHSIyMlP9IB3oZfehutoNK07w=";
+  };
+
+  doCheck = false;
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/libresoc-ieee754fpu/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/openpower-isa.nix b/nix/openpower-isa.nix
new file mode 100644 (file)
index 0000000..5aee8b1
--- /dev/null
+++ b/nix/openpower-isa.nix
@@ -0,0 +1,31 @@
+{ lib, python, buildPythonPackage, fetchgit, libresoc-nmutil, astor, nmigen, ply, pygdbmi }:
+
+buildPythonPackage {
+  pname = "libresoc-openpower-isa";
+  version = "unstable-2021-09-04";
+
+  src = fetchgit {
+    url = "https://git.libre-soc.org/git/openpower-isa.git";
+    rev = "6e43a194f3d07ed5a8daa297187a32746c4c4d3c";
+    sha256 = "0EekUouTQruTXGO5jlPJtqh0DOudghILy0nca5eaZz8=";
+  };
+
+  propagatedBuildInputs = [ libresoc-nmutil astor nmigen ply pygdbmi ];
+
+  doCheck = false;
+
+  prePatch = ''
+    touch ./src/openpower/sv/__init__.py # TODO: fix upstream
+  '';
+
+  postInstall = ''
+    cp -rT ./openpower $out/${python.sitePackages}/../openpower/
+  '';
+
+  pythonImportsCheck = [ "openpower.decoder.power_decoder2" "openpower" ];
+
+  meta = with lib; {
+    homepage = "https://pypi.org/project/libresoc-openpower-isa/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/pinmux.nix b/nix/pinmux.nix
new file mode 100644 (file)
index 0000000..fc9ca7e
--- /dev/null
+++ b/nix/pinmux.nix
@@ -0,0 +1,28 @@
+{ version }:
+
+{ stdenv, python2 }:
+
+stdenv.mkDerivation {
+  pname = "libresoc-pinmux";
+  inherit version;
+
+  src = ../pinmux;
+
+  nativeBuildInputs = [ python2 ];
+
+  configurePhase = "true";
+
+  buildPhase = ''
+    runHook preBuild
+    python src/pinmux_generator.py -v -s ls180 -o ls180
+    runHook postBuild
+  '';
+
+  installPhase = ''
+    runHook preInstall
+    mv ls180 $out
+    runHook postInstall
+  '';
+
+  fixupPhase = "true";
+}
diff --git a/nix/soc.nix b/nix/soc.nix
new file mode 100644 (file)
index 0000000..a4ed136
--- /dev/null
+++ b/nix/soc.nix
@@ -0,0 +1,38 @@
+{ version }:
+
+{ lib, buildPythonPackage, yosys, runCommand, c4m-jtag, nmigen-soc
+, libresoc-ieee754fpu, libresoc-openpower-isa, python }:
+
+let
+  # If we use ../. as source, then any change to
+  # any unrelated Nix file would cause a rebuild,
+  # since the build would have access to it.
+  src = runCommand "libresoc-soc-source" {} ''
+    mkdir $out
+    cp -r ${../src} -T $out/src
+    cp -r ${../setup.py} -T $out/setup.py
+    cp -r ${../README.md} -T $out/README.md
+    cp -r ${../NEWS.txt} -T $out/NEWS.txt
+  '';
+in
+buildPythonPackage {
+  pname = "libresoc-soc";
+  inherit version src;
+
+  propagatedBuildInputs = [
+    c4m-jtag nmigen-soc python libresoc-ieee754fpu libresoc-openpower-isa yosys
+  ];
+
+  doCheck = false;
+
+  prePatch = ''
+    rm -r src/soc/litex
+  '';
+
+  pythonImportsCheck = [ "soc" ];
+
+  meta = with lib; {
+    homepage = "https://libre-soc.org/";
+    license = licenses.lgpl3Plus;
+  };
+}
diff --git a/nix/verilog.nix b/nix/verilog.nix
new file mode 100644 (file)
index 0000000..600b693
--- /dev/null
+++ b/nix/verilog.nix
@@ -0,0 +1,20 @@
+{ version }:
+
+{ runCommand, python3Packages, libresoc-pinmux }:
+
+let script = ''
+  mkdir pinmux
+  ln -s ${libresoc-pinmux} pinmux/ls180
+  export PINMUX="$(realpath ./pinmux)"
+  python3 -m soc.simple.issuer_verilog \
+    --debug=jtag --enable-core --enable-pll \
+    --enable-xics --enable-sram4x4kblock --disable-svp64 \
+    $out
+''; in
+runCommand "libresoc.v" {
+  inherit version;
+
+  nativeBuildInputs = (with python3Packages; [
+    libresoc-soc
+  ]) ++ [ libresoc-pinmux ];
+} script
diff --git a/pinmux b/pinmux
index 096caad8418250693c93ccf90047750704adcaa7..d96f737c0a53dde983060522816bbef016b449ce 160000 (submodule)
--- a/pinmux
+++ b/pinmux
@@ -1 +1 @@
-Subproject commit 096caad8418250693c93ccf90047750704adcaa7
+Subproject commit d96f737c0a53dde983060522816bbef016b449ce
index a73a89bc6c5b2fe8962d0072a0b2939010ffd3fe..35a9ddec0d230aa8f6354871faad6aa202dd7a33 100644 (file)
--- a/src/soc/config/ifetch.py
+++ b/src/soc/config/ifetch.py
@@ -18,6 +18,18 @@ class ConfigFetchUnit:
                    'bare_wb': BareFetchUnit,
                    #'test_cache_wb': TestCacheFetchUnit
                   }
+        self.pspec = pspec
+        if self.pspec.imem_ifacetype in ['mmu_cache_wb', 'test_mmu_cache_wb']:
+            # XXX BLECH! use pspec to transfer the I-Cache which is
+            # created down inside LoadStore1!
+            self.fu = icache = pspec.icache # ICache already FetchUnitInterface
+            # tell I-Cache to connect up to its FetchUnitInterface
+            icache.use_fetch_interface()
+            return
+
         fukls = fudict[pspec.imem_ifacetype]
         self.fu = fukls(pspec)
 
+    def wb_bus(self):
+        return self.fu.ibus
+
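
(The hand-off described in the comment above, as a hedged sketch: the
LoadStore1/MMU setup publishes its ICache on the pspec, and
ConfigFetchUnit then adopts it instead of constructing a fetch unit.
Names follow the diff; the surrounding wiring is illustrative only:)

    pspec.icache = icache            # done down inside LoadStore1 (assumed)
    fetch = ConfigFetchUnit(pspec)   # finds pspec.icache and calls
                                     # icache.use_fetch_interface()
    ibus = fetch.wb_bus()            # instruction-side wishbone bus (fu.ibus)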
index a1828c6a4aa434650270ec5421737ea11ee65aa8..95129b1999e733b44c5f46c265d9b1c478c4a8f4 100644 (file)
--- a/src/soc/config/pinouts.py
+++ b/src/soc/config/pinouts.py
@@ -3,6 +3,8 @@ import sys
 import json
 from pprint import pprint
 from collections import OrderedDict
+from openpower.util import log
+from nmigen.build.dsl import Resource, Subsignal, Pins
 
 
 def _byteify(data, ignore_dicts = False):
@@ -25,7 +27,40 @@ def _byteify(data, ignore_dicts = False):
     return data
 
 
+def get_pinspec_resources(chipname=None, subset=None, conn=None):
+    """get_pinspec_resources - returns an auto-generated list of resources
+    """
+    chip = load_pinouts(chipname)
+    pinmap = chip['pins.map']
+    specs = []
+    for k, bus in chip['pins.specs'].items():
+        k, num = k.lower().split(":")
+        name = '%s%s' % (k, num)
+        if subset is None or name in subset:
+            io = []
+            for pin in bus:
+                pin = pin.lower()
+                pin, pin_dir = pin[:-1], pin[-1] # split pin+ into pin, +
+                pname = '%s_%s' % (name, pin)
+                if pname in pinmap:
+                    newpin = pinmap[pname][2:]
+                    newpin = '_'.join(newpin.split("_")[1:])
+                    # turn direction into nmigen Pins direction format
+                    dirn = {'-': 'i', '+': 'o', '*': 'io'}[pin_dir]
+                # TODO: make assert_width not have to be 1
+                p = Pins(newpin, dir=dirn, conn=conn, assert_width=1)
+                io.append(Subsignal(pin, p))
+            spec = Resource.family(name, num, default_name=name, ios=io)
+            log("pinspec", name, repr(spec))
+            specs.append(spec)
+    return specs
+
+
 def get_pinspecs(chipname=None, subset=None):
+    """get_pinspecs - returns a dictionary of lists of pins for an IO function
+    example: {'uart': ['tx+', 'rx-'],
+             'i2c': ['sda*', 'scl+']}
+    """
     chip = load_pinouts(chipname)
     pinmap = chip['pins.map']
     specs = OrderedDict() # preserve order
@@ -62,7 +97,8 @@ def load_pinouts(chipname=None):
     pth = os.path.split(pth)[0]
 
     # path is relative to this filename, in the pinmux submodule
-    fname = "%s/../../../pinmux/%s/litex_pinpads.json" % (pth, chipname)
+    pinmux = os.getenv("PINMUX", "%s/../../../pinmux" % pth)
+    fname = "%s/%s/litex_pinpads.json" % (pinmux, chipname)
     with open(fname) as f:
         txt = f.read()
 
@@ -73,7 +109,12 @@ def load_pinouts(chipname=None):
     return chip
 
 if __name__ == '__main__':
-    if sys.argv == 2:
+    # run this with:
+    # git submodule update --init --remote --recursive
+    # make mkpinmux
+    # python3 soc/config/pinouts.py ngi_pointer (or ls180, or other)
+    # it will print out a stack of debug stuff
+    if len(sys.argv) == 2:
         chipname = sys.argv[1]
     else:
         chipname = None
@@ -81,3 +122,5 @@ if __name__ == '__main__':
     for k, v in chip.items():
         print ("\n****", k, "****")
         pprint(v)
+    print ("chipname pinspec resources", sys.argv, chipname)
+    specs = get_pinspec_resources(chipname, subset=None)
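
(Note on the direction-suffix convention parsed above: a trailing '-'
marks an input, '+' an output and '*' a bidirectional pin, mapped to the
nmigen Pins directions 'i'/'o'/'io'. A standalone sketch of just that
mapping, using the same nmigen.build.dsl classes as the diff but with
made-up pin names:)

    from nmigen.build.dsl import Resource, Subsignal, Pins

    def parse_pin(pin):
        # "tx+" -> ("tx", "o"): the last character encodes the direction
        name, sfx = pin[:-1], pin[-1]
        return name, {'-': 'i', '+': 'o', '*': 'io'}[sfx]

    ios = []
    for p in ['tx+', 'rx-']:          # hypothetical uart bus specification
        name, dirn = parse_pin(p)
        ios.append(Subsignal(name, Pins('uart0_' + name, dir=dirn,
                                        assert_width=1)))
    spec = Resource.family('uart0', 0, default_name='uart0', ios=ios)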
index df9caf68871a7100950ec748a5c27b30d4ce3399..39437b3c94d63ee86785c8350438f72238e6b595 100644 (file)
--- a/src/soc/config/test/test_fetch.py
+++ b/src/soc/config/test/test_fetch.py
@@ -13,21 +13,22 @@ import sys
 sys.setrecursionlimit(10**6)
 
 
-def read_from_addr(dut, addr):
+def read_from_addr(dut, addr, stall=True):
     yield dut.a_pc_i.eq(addr)
-    yield dut.a_valid_i.eq(1)
-    yield dut.f_valid_i.eq(1)
-    yield dut.a_stall_i.eq(1)
-    yield
-    yield dut.a_stall_i.eq(0)
+    yield dut.a_i_valid.eq(1)
+    yield dut.f_i_valid.eq(1)
+    if stall:
+        yield dut.a_stall_i.eq(1)
+        yield
+        yield dut.a_stall_i.eq(0)
     yield
     yield Settle()
     while (yield dut.f_busy_o):
         yield
     res = (yield dut.f_instr_o)
 
-    yield dut.a_valid_i.eq(0)
-    yield dut.f_valid_i.eq(0)
+    yield dut.a_i_valid.eq(0)
+    yield dut.f_i_valid.eq(0)
     yield
     return res
 
index 02c491f98c8aec9b4be3f4ead326468cdac25300..91435bd70edae0137b138e315f38453f5f0918f5 100644 (file)
--- a/src/soc/config/test/test_loadstore.py
+++ b/src/soc/config/test/test_loadstore.py
@@ -16,9 +16,9 @@ def write_to_addr(dut, addr, value):
     yield dut.x_st_data_i.eq(value)
     yield dut.x_st_i.eq(1)
     yield dut.x_mask_i.eq(-1)
-    yield dut.x_valid_i.eq(1)
+    yield dut.x_i_valid.eq(1)
     yield dut.x_stall_i.eq(1)
-    yield dut.m_valid_i.eq(1)
+    yield dut.m_i_valid.eq(1)
     yield
     yield
 
@@ -33,7 +33,7 @@ def write_to_addr(dut, addr, value):
 def read_from_addr(dut, addr):
     yield dut.x_addr_i.eq(addr)
     yield dut.x_ld_i.eq(1)
-    yield dut.x_valid_i.eq(1)
+    yield dut.x_i_valid.eq(1)
     yield dut.x_stall_i.eq(1)
     yield
     yield dut.x_stall_i.eq(0)
@@ -42,7 +42,7 @@ def read_from_addr(dut, addr):
     yield Settle()
     while (yield dut.x_busy_o):
         yield
-    assert (yield dut.x_valid_i)
+    assert (yield dut.x_i_valid)
     return (yield dut.m_ld_data_o)
 
 
@@ -53,8 +53,8 @@ def write_byte(dut, addr, val):
     yield dut.x_st_i.eq(1)
     yield dut.x_mask_i.eq(1 << offset)
     print("write_byte", addr, bin(1 << offset), hex(val << (offset*8)))
-    yield dut.x_valid_i.eq(1)
-    yield dut.m_valid_i.eq(1)
+    yield dut.x_i_valid.eq(1)
+    yield dut.m_i_valid.eq(1)
 
     yield
     yield dut.x_st_i.eq(0)
@@ -66,13 +66,13 @@ def read_byte(dut, addr):
     offset = addr & 0x3
     yield dut.x_addr_i.eq(addr)
     yield dut.x_ld_i.eq(1)
-    yield dut.x_valid_i.eq(1)
+    yield dut.x_i_valid.eq(1)
     yield
     yield dut.x_ld_i.eq(0)
     yield Settle()
     while (yield dut.x_busy_o):
         yield
-    assert (yield dut.x_valid_i)
+    assert (yield dut.x_i_valid)
     val = (yield dut.m_ld_data_o)
     print("read_byte", addr, offset, hex(val))
     return (val >> (offset * 8)) & 0xff
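
(The mask/shift arithmetic in write_byte/read_byte above: on the 4-byte
bus the low two address bits select the byte lane, the lane index becomes
a single write-mask bit, and data moves by 8 bits per lane. The same
arithmetic in plain Python, independent of any simulation:)

    def byte_lane(addr, val, bus_bytes=4):
        offset = addr & (bus_bytes - 1)       # lane within the bus word
        mask = 1 << offset                    # one write-enable bit per byte
        wdata = (val & 0xff) << (offset * 8)  # byte shifted into its lane
        return offset, mask, wdata

    assert byte_lane(0x7, 0xab) == (3, 0b1000, 0xab000000)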
index eb20177c2117e43ff1355be96877950d0b712bdb..b26e64b5dad3cf9c4e75621127986db2eb664dbe 100644 (file)
--- a/src/soc/config/test/test_pi2ls.py
+++ b/src/soc/config/test/test_pi2ls.py
@@ -6,24 +6,30 @@ from nmigen.cli import rtlil
 import unittest
 from soc.config.test.test_loadstore import TestMemPspec
 from soc.config.loadstore import ConfigMemoryPortInterface
+from openpower.exceptions import LDSTExceptionTuple
 
 
-def wait_busy(port, no=False):
+def wait_busy(port, no=False, debug=None):
+    cnt = 0
     while True:
         busy = yield port.busy_o
-        print("busy", no, busy)
+        print("busy", no, busy, cnt, debug)
         if bool(busy) == no:
             break
         yield
+        cnt += 1
 
 
-def wait_addr(port):
+def wait_addr(port,debug=None):
+    cnt = 0
     while True:
         addr_ok = yield port.addr_ok_o
-        print("addrok", addr_ok)
-        if addr_ok:
+        exc_happened = yield port.exc_o.happened
+        print("addrok", addr_ok,cnt,debug,exc_happened)
+        if addr_ok or exc_happened:
             break
         yield
+        cnt += 1
 
 
 def wait_ldok(port):
@@ -38,20 +44,48 @@ def wait_ldok(port):
         yield
 
 
-def pi_st(port1, addr, data, datalen, msr_pr=0):
+def pi_st(port1, addr, data, datalen, msr, is_dcbz=0):
 
     # have to wait until not busy
-    yield from wait_busy(port1, no=False)    # wait until not busy
+    yield from wait_busy(port1,debug="pi_st_A") # wait while busy
 
     # set up a ST on the port.  address first:
+    yield port1.is_dcbz_i.eq(is_dcbz)  # reset dcbz too
     yield port1.is_st_i.eq(1)  # indicate ST
     yield port1.data_len.eq(datalen)  # ST length (1/2/4/8)
-    yield port1.msr_pr.eq(msr_pr)  # MSR PR bit (1==>virt, 0==>real)
+    yield port1.priv_mode.eq(~msr.pr)
+    yield port1.virt_mode.eq(msr.dr)
+    yield port1.mode_32bit.eq(~msr.sf)
 
     yield port1.addr.data.eq(addr)  # set address
     yield port1.addr.ok.eq(1)  # set ok
     yield Settle()
+
+    # must check exception even before waiting for address.
+    # XXX TODO: wait_addr should check for exception
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        print("print fast ST exception happened")
+        yield # MUST wait for one clock cycle before de-asserting these
+        yield port1.is_st_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+        return "fast", exc_info
+
     yield from wait_addr(port1)             # wait until addr ok
+
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        print("print fast ST exception happened")
+        yield # MUST wait for one clock cycle before de-asserting these
+        yield port1.is_st_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+        return "fast", exc_info
+
+
     # yield # not needed, just for checking
     # yield # not needed, just for checking
     # assert "ST" for one cycle (required by the API)
@@ -59,44 +93,95 @@ def pi_st(port1, addr, data, datalen, msr_pr=0):
     yield port1.st.ok.eq(1)
     yield
     yield port1.st.ok.eq(0)
-    yield from wait_busy(port1, True)    # wait while busy
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        print("print fast ST exception happened")
+        yield # MUST wait for one clock cycle before de-asserting these
+        yield port1.is_st_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+        return "fast", exc_info
+
+    yield from wait_busy(port1,debug="pi_st_E") # wait while busy
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        yield  # needed if mmu/dcache is used
+        yield port1.is_st_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+        yield  # needed if mmu/dcache is used
+        return "slow", exc_info
 
     # can go straight to reset.
     yield port1.is_st_i.eq(0)  # end
     yield port1.addr.ok.eq(0)  # set !ok
+    yield port1.is_dcbz_i.eq(0)  # reset dcbz too
+    yield  # needed if mmu/dcache is used
+
+    return None, None
+
+def get_exception_info(exc_o):
+    attrs = []
+    for fname in LDSTExceptionTuple._fields:
+        attr = getattr(exc_o, fname)
+        val = yield attr
+        attrs.append(val)
+    return LDSTExceptionTuple(*attrs)
 
 
-def pi_ld(port1, addr, datalen, msr_pr=0):
+# copy of pi_st removed
+
+def pi_ld(port1, addr, datalen, msr):
 
     # have to wait until not busy
-    yield from wait_busy(port1, no=False)    # wait until not busy
+    yield from wait_busy(port1,debug="pi_ld_A") # wait while busy
 
     # set up a LD on the port.  address first:
     yield port1.is_ld_i.eq(1)  # indicate LD
     yield port1.data_len.eq(datalen)  # LD length (1/2/4/8)
-    yield port1.msr_pr.eq(msr_pr)  # MSR PR bit (1==>virt, 0==>real)
+    yield port1.priv_mode.eq(~msr.pr)
+    yield port1.virt_mode.eq(msr.dr)
+    yield port1.mode_32bit.eq(~msr.sf)
 
     yield port1.addr.data.eq(addr)  # set address
     yield port1.addr.ok.eq(1)  # set ok
     yield Settle()
     yield from wait_addr(port1)             # wait until addr ok
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        print("print fast LD exception happened")
+        yield # MUST wait for one clock cycle before de-asserting these
+        yield port1.is_ld_i.eq(0)  # end
+        yield port1.addr.ok.eq(0)  # set !ok
+        return None, "fast", exc_info
+
     yield
     yield from wait_ldok(port1)             # wait until ld ok
     data = yield port1.ld.data
+    exc_info = yield from get_exception_info(port1.exc_o)
     exc_happened = yield port1.exc_o.happened
+    exc_happened = exc_info.happened
 
     # cleanup
     yield port1.is_ld_i.eq(0)  # end
     yield port1.addr.ok.eq(0)  # set !ok
     if exc_happened:
-        return 0
+        return None, "slow", exc_info
 
-    yield from wait_busy(port1, no=False)    # wait while not busy
+    yield from wait_busy(port1, debug="pi_ld_E") # wait while busy
+
+    exc_info = yield from get_exception_info(port1.exc_o)
+    exc_happened = exc_info.happened
+    if exc_happened:
+        return None, "slow", exc_info
 
-    return data
+    return data, None, None
 
 
-def pi_ldst(arg, dut, msr_pr=0):
+def pi_ldst(arg, dut, msr):
 
     # do two half-word stores at consecutive addresses, then two loads
     addr1 = 0x04
@@ -104,16 +189,19 @@ def pi_ldst(arg, dut, msr_pr=0):
     data = 0xbeef
     data2 = 0xf00f
     #data = 0x4
-    yield from pi_st(dut, addr1, data, 2, msr_pr)
-    yield from pi_st(dut, addr2, data2, 2, msr_pr)
-    result = yield from pi_ld(dut, addr1, 2, msr_pr)
-    result2 = yield from pi_ld(dut, addr2, 2, msr_pr)
+    assert (yield from pi_st(dut, addr1, data, 2, msr)) == (None, None)
+    assert (yield from pi_st(dut, addr2, data2, 2, msr)) == (None, None)
+    result, _, exc = yield from pi_ld(dut, addr1, 2, msr)
+    result2, _, exc2 = yield from pi_ld(dut, addr2, 2, msr)
+    assert exc is None
+    assert exc2 is None
     arg.assertEqual(data, result, "data %x != %x" % (result, data))
     arg.assertEqual(data2, result2, "data2 %x != %x" % (result2, data2))
 
     # now load both in a 32-bit load to make sure they're really consecutive
     data3 = data | (data2 << 16)
-    result3 = yield from pi_ld(dut, addr1, 4, msr_pr)
+    result3, _, exc3 = yield from pi_ld(dut, addr1, 4, msr)
+    assert exc3 is None
     arg.assertEqual(data3, result3, "data3 %x != %x" % (result3, data3))
 
 
@@ -123,7 +211,7 @@ def tst_config_pi(testcls, ifacetype):
     dut = Module()
     pspec = TestMemPspec(ldst_ifacetype=ifacetype,
                          imem_ifacetype='',
-                         addr_wid=48,
+                         addr_wid=64,
                          mask_wid=8,
                          reg_wid=64)
     cmpi = ConfigMemoryPortInterface(pspec)
index 4d897699737672ee9b58ebd8e2f585eed4f52803..e83491824322888050cd853596bf2ae335467553 100644 (file)
--- a/src/soc/debug/dmi.py
+++ b/src/soc/debug/dmi.py
@@ -11,12 +11,13 @@ from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
 from nmigen.cli import rtlil
 from soc.config.state import CoreState
+from openpower.consts import FastRegsEnum
 
 
 # DMI register addresses
 class DBGCore:
-    CTRL         = 0b0000
-    STAT         = 0b0001
+    CTRL         = 0b0000 # Control: start/stop/reset
+    STAT         = 0b0001 # Status (read started/stopped/stopping)
     NIA          = 0b0010 # NIA register (read only for now)
     MSR          = 0b0011 # MSR (read only)
     GSPR_IDX     = 0b0100 # GSPR register index
@@ -26,6 +27,7 @@ class DBGCore:
     CR           = 0b1000 # CR (read only)
     XER          = 0b1001 # XER (read only) - note this is a TEMPORARY hack
     SVSTATE      = 0b1010 # SVSTATE register (read only for now)
+    STOPADDR     = 0b1011 # Address at which the core automatically stops
 
 
 # CTRL register (direct actions, write 1 to act, read back 0)
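
(The register map above is reached over DMI's request/acknowledge
handshake; addr_i, req_i, ack_o and dout appear later in this diff. A
hedged sketch of one read transaction in the generator-based simulation
style used by the tests in this commit; the we_i write-enable field is an
assumption about the DMI record, not confirmed by this diff:)

    def dmi_read(dmi, addr):
        yield dmi.addr_i.eq(addr)     # e.g. DBGCore.STAT or DBGCore.NIA
        yield dmi.we_i.eq(0)          # assumed field: 0 selects a read
        yield dmi.req_i.eq(1)
        yield
        while not (yield dmi.ack_o):  # ack may take one or more cycles
            yield
        data = yield dmi.dout
        yield dmi.req_i.eq(0)         # drop request to finish the handshake
        yield
        return data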
@@ -98,20 +100,17 @@ class CoreDebug(Elaboratable):
         self.core_stop_o       = Signal()
         self.core_rst_o        = Signal()
         self.icache_rst_o      = Signal()
+        self.stopping_o = Signal(name="stopping")
 
         # Core status inputs
         self.terminate_i    = Signal()
         self.core_stopped_i = Signal()
         self.state = CoreState("core_dbg")
 
-        # GSPR register read port
-        self.d_gpr = DbgReg("d_gpr")
-
-        # CR register read port
-        self.d_cr = DbgReg("d_cr")
-
-        # XER register read port
-        self.d_xer = DbgReg("d_xer")
+        self.d_gpr = DbgReg("d_gpr") # GSPR register read port
+        self.d_fast = DbgReg("d_fast") # FAST register read port
+        self.d_cr = DbgReg("d_cr")   # CR register read port
+        self.d_xer = DbgReg("d_xer") # XER register read port
 
         # Core logging data
         self.log_data_i        = Signal(256)
@@ -119,6 +118,10 @@ class CoreDebug(Elaboratable):
         self.log_read_data_o   = Signal(64)
         self.log_write_addr_o  = Signal(32)
 
+        # address at which the processor stops automatically
+        # set to 0xffffffffffffffff by default (impossible to reach)
+        self.stop_addr_o = Signal(64, reset=-1)
+
         # Misc
         self.terminated_o  = Signal()
 
@@ -127,6 +130,7 @@ class CoreDebug(Elaboratable):
         m = Module()
         comb, sync = m.d.comb, m.d.sync
         dmi, d_gpr, d_cr, d_xer, = self.dmi, self.d_gpr, self.d_cr, self.d_xer
+        d_fast = self.d_fast
 
         # DMI needs fixing... make a one clock pulse
         dmi_req_i_1 = Signal()
@@ -135,13 +139,17 @@ class CoreDebug(Elaboratable):
         stat_reg = Signal(64)
 
         # Some internal latches
-        stopping     = Signal()
+        stopping     = self.stopping_o
         do_step      = Signal()
         do_reset     = Signal()
         do_icreset   = Signal()
         terminated   = Signal()
         do_gspr_rd   = Signal()
+        # select either GPRs or FAST regs to read, based on GSPR_IDX
         gspr_index   = Signal.like(d_gpr.addr)
+        fast_index   = Signal.like(d_gpr.addr)
+        gspr_en      = Signal()
+        fast_en      = Signal()
 
         log_dmi_addr = Signal(32)
         log_dmi_data = Signal(64)
@@ -151,11 +159,15 @@ class CoreDebug(Elaboratable):
 
         LOG_INDEX_BITS = log2_int(self.LOG_LENGTH)
 
-        # Single cycle register accesses on DMI except for GSPR data
+        # Single cycle register accesses on DMI except GSPR/FAST, CR and XER
         with m.Switch(dmi.addr_i):
             with m.Case(DBGCore.GSPR_DATA):
-                comb += dmi.ack_o.eq(d_gpr.ack)
-                comb += d_gpr.req.eq(dmi.req_i)
+                with m.If(gspr_en): # GPR requested, acknowledge GPR
+                    comb += dmi.ack_o.eq(d_gpr.ack)
+                    comb += d_gpr.req.eq(dmi.req_i)
+                with m.If(fast_en): # FAST requested
+                    comb += dmi.ack_o.eq(d_fast.ack)
+                    comb += d_fast.req.eq(dmi.req_i)
             with m.Case(DBGCore.CR):
                 comb += dmi.ack_o.eq(d_cr.ack)
                 comb += d_cr.req.eq(dmi.req_i)
@@ -163,6 +175,7 @@ class CoreDebug(Elaboratable):
                 comb += dmi.ack_o.eq(d_xer.ack)
                 comb += d_xer.req.eq(dmi.req_i)
             with m.Default():
+                # everything else is acknowledged immediately (combinatorial)
                 comb += dmi.ack_o.eq(dmi.req_i)
 
         # Status register read composition (DBUG_CORE_STAT_xxx)
@@ -172,24 +185,29 @@ class CoreDebug(Elaboratable):
 
         # DMI read data mux
         with m.Switch(dmi.addr_i):
-            with m.Case( DBGCore.STAT):
+            with m.Case( DBGCore.STAT):               # Status register
                 comb += dmi.dout.eq(stat_reg)
-            with m.Case( DBGCore.NIA):
+            with m.Case( DBGCore.NIA):                # NIA (PC)
                 comb += dmi.dout.eq(self.state.pc)
-            with m.Case( DBGCore.MSR):
+            with m.Case( DBGCore.MSR):                # MSR
                 comb += dmi.dout.eq(self.state.msr)
-            with m.Case( DBGCore.SVSTATE):
+            with m.Case( DBGCore.SVSTATE):            # SVSTATE
                 comb += dmi.dout.eq(self.state.svstate)
-            with m.Case( DBGCore.GSPR_DATA):
-                comb += dmi.dout.eq(d_gpr.data)
-            with m.Case( DBGCore.LOG_ADDR):
+            with m.Case( DBGCore.GSPR_DATA):          # GPR/FAST regs
+                with m.If(gspr_en):
+                    comb += dmi.dout.eq(d_gpr.data)   # GPR data selected
+                with m.If(fast_en):
+                    comb += dmi.dout.eq(d_fast.data)  # FAST reg read selected
+            with m.Case( DBGCore.LOG_ADDR):           # Logging
                 comb += dmi.dout.eq(Cat(log_dmi_addr, self.log_write_addr_o))
             with m.Case( DBGCore.LOG_DATA):
                 comb += dmi.dout.eq(log_dmi_data)
-            with m.Case(DBGCore.CR):
+            with m.Case(DBGCore.CR):                  # CR
                 comb += dmi.dout.eq(d_cr.data)
-            with m.Case(DBGCore.XER):
+            with m.Case(DBGCore.XER):                 # XER
                 comb += dmi.dout.eq(d_xer.data)
+            with m.Case(DBGCore.STOPADDR):            # Halt PC
+                comb += dmi.dout.eq(self.stop_addr_o)
 
         # DMI writes
         # Reset the 1-cycle "do" signals
@@ -224,12 +242,31 @@ class CoreDebug(Elaboratable):
 
                 # GSPR address
                 with m.Elif(dmi.addr_i == DBGCore.GSPR_IDX):
-                    sync += gspr_index.eq(dmi.din)
+                    sync += gspr_index.eq(0)
+                    sync += fast_index.eq(0)
+                    sync += gspr_en.eq(0)
+                    sync += fast_en.eq(0)
+                    with m.If(dmi.din <= 31):
+                        sync += gspr_index.eq(dmi.din)
+                        sync += gspr_en.eq(1)
+                    # cover the FastRegs LR, CTR, SRR0, SRR1 etc.
+                    # numbering is from microwatt
+                    for x, i in FastRegsEnum.__dict__.items():
+                        if not isinstance(i, int) or x == 'N_REGS':
+                            continue
+                        with m.If(dmi.din == 32+i):
+                            sync += fast_index.eq(i)
+                            sync += fast_en.eq(1)
 
                 # Log address
                 with m.Elif(dmi.addr_i == DBGCore.LOG_ADDR):
                     sync += log_dmi_addr.eq(dmi.din)
                     sync += do_dmi_log_rd.eq(1)
+
+                # set PC Halt address
+                with m.Elif(dmi.addr_i == DBGCore.STOPADDR):
+                    sync += self.stop_addr_o.eq(dmi.din)
+
             with m.Else():
                 # sync += Display("DMI read from " & to_string(dmi_addr))
                 pass
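For illustration, a hedged sketch of how a debugger front-end might drive
the two additions above over DMI.  dmi_write and dmi_read are hypothetical
helpers standing in for the JTAG/DMI transport; the 32+ offset matches the
GSPR_IDX decode above, with FastRegsEnum numbering taken from microwatt:

    from openpower.consts import FastRegsEnum

    def set_stop_addr(dmi_write, addr):
        # the core then stops automatically when the PC reaches addr
        dmi_write(DBGCore.STOPADDR, addr)

    def read_gspr(dmi_write, dmi_read, idx):
        # 0-31 selects a GPR; 32 + FastRegsEnum.LR (etc.) selects a FAST reg
        dmi_write(DBGCore.GSPR_IDX, idx)
        return dmi_read(DBGCore.GSPR_DATA)

    # e.g. read the Link Register:
    # lr = read_gspr(dmi_write, dmi_read, 32 + FastRegsEnum.LR)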
@@ -252,12 +289,13 @@ class CoreDebug(Elaboratable):
             sync += terminated.eq(1)
 
         comb += d_gpr.addr.eq(gspr_index)
+        comb += d_fast.addr.eq(fast_index)
 
         # Core control signals generated by the debug module
-        comb += self.core_stop_o.eq(stopping & ~do_step)
+        comb += self.core_stop_o.eq((stopping & ~do_step) | self.terminate_i)
         comb += self.core_rst_o.eq(do_reset)
         comb += self.icache_rst_o.eq(do_icreset)
-        comb += self.terminated_o.eq(terminated)
+        comb += self.terminated_o.eq(terminated | self.terminate_i)
 
         # Logging RAM (none)
 
@@ -356,6 +394,7 @@ class CoreDebug(Elaboratable):
         yield from self.d_gpr
         yield from self.d_cr
         yield from self.d_xer
+        yield from self.d_fast
         yield self.log_data_i
         yield self.log_read_addr_i
         yield self.log_read_data_o
index a642bd1f358cbab7f639505a3f5475ec0625ae7c..79418e28ed27c88037fac58268e758f2aedfacbc 100644 (file)
@@ -157,7 +157,7 @@ class JTAGServer:
 
     def jtagremote_server_recv(self, tdo):
         data = self.get_data(1, 0) # read 1 byte, non-blocking
-        if data is None:
+        if data is None or len(data) == 0:
             return None # no data read
         data = bytes.decode(data)
         if self.debug:
index 528aa34ab2e19fb8bc29a948b8db3e967f25cb17..3981904a1a88bacaa9f65c0976dd675aa7104978 100644 (file)
@@ -115,7 +115,7 @@ def jtag_sim(dut):
     # read DMI CTRL register
     status = yield from jtag_read_write_reg(dut, DMI_READ, 64)
     print ("dmi ctrl status", hex(status))
-    assert status == 0
+    assert status == 6
 
     # write DMI MSR address
     yield from jtag_read_write_reg(dut, DMI_ADDR, 8, DBGCore.MSR)
index a72145754974946a705cb126d68064e5cf6ab5e1..b92f41ee355005e0782abf0ec50169adb30fa46f 100644 (file)
@@ -188,7 +188,7 @@ def jtag_sim(dut, srv_dut):
     # read DMI CTRL register
     status = yield from jtag_read_write_reg(dut, DMI_READ, 64)
     print ("dmi ctrl status", hex(status))
-    assert status == 0
+    assert status == 6
 
     # write DMI MSR address
     yield from jtag_read_write_reg(dut, DMI_ADDR, 8, DBGCore.MSR)
@@ -221,7 +221,7 @@ def jtag_sim(dut, srv_dut):
 
 
 if __name__ == '__main__':
-    dut = JTAG(test_pinset(), wb_data_wid=64)
+    dut = JTAG(test_pinset(), wb_data_wid=64, domain="sync")
     dut.stop = False
 
     # rather than the client access the JTAG bus directly
@@ -236,6 +236,8 @@ if __name__ == '__main__':
         cdut.c = JTAGClient()
         dut.s.get_connection()
     else:
+        print ("running server only as requested, use openocd remote to test")
+        sys.stdout.flush()
         dut.s.get_connection(None) # block waiting for connection
 
     # take copy of ir_width and scan_len
@@ -255,8 +257,6 @@ if __name__ == '__main__':
     sim.add_sync_process(wrap(jtag_srv(dut))) # jtag server
     if len(sys.argv) != 2 or sys.argv[1] != 'server':
         sim.add_sync_process(wrap(jtag_sim(cdut, dut))) # actual jtag tester
-    else:
-        print ("running server only as requested, use openocd remote to test")
     sim.add_sync_process(wrap(dmi_sim(dut)))  # handles (pretends to be) DMI
 
     with sim.write_vcd("dmi2jtag_test_srv.vcd"):
index 3fb1c6cfe595d627506a941f87f5d39ab2bb5079..e2e737733c13af80b3e8f92bb57a2c0ccefc9cd1 100644 (file)
@@ -7,11 +7,11 @@ intended to comply with both the CompALU API and the nmutil Pipeline API
 
 The basic rules are:
 
-1) p.ready_o is asserted on the initial ("Idle") state, otherwise it keeps low.
-2) n.valid_o is asserted on the final ("Done") state, otherwise it keeps low.
-3) The FSM stays in the Idle state while p.valid_i is low, otherwise
+1) p.o_ready is asserted on the initial ("Idle") state, otherwise it keeps low.
+2) n.o_valid is asserted on the final ("Done") state, otherwise it keeps low.
+3) The FSM stays in the Idle state while p.i_valid is low, otherwise
    it accepts the input data and moves on.
-4) The FSM stays in the Done state while n.ready_i is low, otherwise
+4) The FSM stays in the Done state while n.i_ready is low, otherwise
    it releases the output data and goes back to the Idle state.
 
 """
@@ -44,8 +44,8 @@ class Shifter(Elaboratable):
     """Simple sequential shifter
 
     Prev port data:
-    * p.data_i.data:  value to be shifted
-    * p.data_i.shift: shift amount
+    * p.i_data.data:  value to be shifted
+    * p.i_data.shift: shift amount
     *                 When zero, no shift occurs.
     *                 On POWER, range is 0 to 63 for 32-bit,
     *                 and 0 to 127 for 64-bit.
@@ -55,11 +55,11 @@ class Shifter(Elaboratable):
     * op.sdir:       shift direction (0 = left, 1 = right)
 
     Next port data:
-    * n.data_o.data: shifted value
+    * n.o_data.data: shifted value
     """
     class PrevData:
         def __init__(self, width):
-            self.data = Signal(width, name="p_data_i")
+            self.data = Signal(width, name="p_i_data")
             self.shift = Signal(width, name="p_shift_i")
             self.ctx = Dummy()  # comply with CompALU API
 
@@ -68,7 +68,7 @@ class Shifter(Elaboratable):
 
     class NextData:
         def __init__(self, width):
-            self.data = Signal(width, name="n_data_o")
+            self.data = Signal(width, name="n_o_data")
 
         def _get_data(self):
             return [self.data]
@@ -77,14 +77,14 @@ class Shifter(Elaboratable):
         self.width = width
         self.p = PrevControl()
         self.n = NextControl()
-        self.p.data_i = Shifter.PrevData(width)
-        self.n.data_o = Shifter.NextData(width)
+        self.p.i_data = Shifter.PrevData(width)
+        self.n.o_data = Shifter.NextData(width)
 
         # more pieces to make this example class comply with the CompALU API
         self.op = CompFSMOpSubset(name="op")
-        self.p.data_i.ctx.op = self.op
-        self.i = self.p.data_i._get_data()
-        self.out = self.n.data_o._get_data()
+        self.p.i_data.ctx.op = self.op
+        self.i = self.p.i_data._get_data()
+        self.out = self.n.o_data._get_data()
 
     def elaborate(self, platform):
         m = Module()
@@ -115,8 +115,8 @@ class Shifter(Elaboratable):
         # build the data flow
         m.d.comb += [
             # connect input and output
-            shift_in.eq(self.p.data_i.data),
-            self.n.data_o.data.eq(shift_reg),
+            shift_in.eq(self.p.i_data.data),
+            self.n.o_data.data.eq(shift_reg),
             # generate shifted views of the register
             shift_left_by_1.eq(Cat(0, shift_reg[:-1])),
             shift_right_by_1.eq(Cat(shift_reg[1:], 0)),
@@ -152,15 +152,15 @@ class Shifter(Elaboratable):
         with m.FSM():
             with m.State("IDLE"):
                 m.d.comb += [
-                    # keep p.ready_o active on IDLE
-                    self.p.ready_o.eq(1),
+                    # keep p.o_ready active on IDLE
+                    self.p.o_ready.eq(1),
                     # keep loading the shift register and shift count
                     load.eq(1),
-                    next_count.eq(self.p.data_i.shift),
+                    next_count.eq(self.p.i_data.shift),
                 ]
                 # capture the direction bit as well
                 m.d.sync += direction.eq(self.op.sdir)
-                with m.If(self.p.valid_i):
+                with m.If(self.p.i_valid):
                     # Leave IDLE when data arrives
                     with m.If(next_count == 0):
                         # short-circuit for zero shift
@@ -178,9 +178,9 @@ class Shifter(Elaboratable):
                     # exit when shift counter goes to zero
                     m.next = "DONE"
             with m.State("DONE"):
-                # keep n.valid_o active while the data is not accepted
-                m.d.comb += self.n.valid_o.eq(1)
-                with m.If(self.n.ready_i):
+                # keep n.o_valid active while the data is not accepted
+                m.d.comb += self.n.o_valid.eq(1)
+                with m.If(self.n.i_ready):
                     # go back to IDLE when the data is accepted
                     m.next = "IDLE"
 
@@ -188,13 +188,13 @@ class Shifter(Elaboratable):
 
     def __iter__(self):
         yield self.op.sdir
-        yield self.p.data_i.data
-        yield self.p.data_i.shift
-        yield self.p.valid_i
-        yield self.p.ready_o
-        yield self.n.ready_i
-        yield self.n.valid_o
-        yield self.n.data_o.data
+        yield self.p.i_data.data
+        yield self.p.i_data.shift
+        yield self.p.i_valid
+        yield self.p.o_ready
+        yield self.n.i_ready
+        yield self.n.o_valid
+        yield self.n.o_data.data
 
     def ports(self):
         return list(self)
@@ -222,20 +222,20 @@ def test_shifter():
         {'comment': 'Shifter Demonstration'},
         ('prev port', [
             ('op__sdir', 'in'),
-            ('p_data_i[7:0]', 'in'),
+            ('p_i_data[7:0]', 'in'),
             ('p_shift_i[7:0]', 'in'),
             ({'submodule': 'p'}, [
-                ('p_valid_i', 'in'),
-                ('p_ready_o', 'out')])]),
+                ('p_i_valid', 'in'),
+                ('p_o_ready', 'out')])]),
         ('internal', [
             'fsm_state' if is_engine_pysim() else 'fsm_state[1:0]',
             'count[3:0]',
             'shift_reg[7:0]']),
         ('next port', [
-            ('n_data_o[7:0]', 'out'),
+            ('n_o_data[7:0]', 'out'),
             ({'submodule': 'n'}, [
-                ('n_valid_o', 'out'),
-                ('n_ready_i', 'in')])])]
+                ('n_o_valid', 'out'),
+                ('n_i_ready', 'in')])])]
 
     write_gtkw("test_shifter.gtkw", "test_shifter.vcd",
                gtkwave_desc,  gtkwave_style,
@@ -245,32 +245,32 @@ def test_shifter():
     sim.add_clock(1e-6)
 
     def send(data, shift, direction):
-        # present input data and assert valid_i
-        yield dut.p.data_i.data.eq(data)
-        yield dut.p.data_i.shift.eq(shift)
+        # present input data and assert i_valid
+        yield dut.p.i_data.data.eq(data)
+        yield dut.p.i_data.shift.eq(shift)
         yield dut.op.sdir.eq(direction)
-        yield dut.p.valid_i.eq(1)
+        yield dut.p.i_valid.eq(1)
         yield
-        # wait for p.ready_o to be asserted
-        while not (yield dut.p.ready_o):
+        # wait for p.o_ready to be asserted
+        while not (yield dut.p.o_ready):
             yield
-        # clear input data and negate p.valid_i
-        yield dut.p.valid_i.eq(0)
-        yield dut.p.data_i.data.eq(0)
-        yield dut.p.data_i.shift.eq(0)
+        # clear input data and negate p.i_valid
+        yield dut.p.i_valid.eq(0)
+        yield dut.p.i_data.data.eq(0)
+        yield dut.p.i_data.shift.eq(0)
         yield dut.op.sdir.eq(0)
 
     def receive(expected):
         # signal readiness to receive data
-        yield dut.n.ready_i.eq(1)
+        yield dut.n.i_ready.eq(1)
         yield
-        # wait for n.valid_o to be asserted
-        while not (yield dut.n.valid_o):
+        # wait for n.o_valid to be asserted
+        while not (yield dut.n.o_valid):
             yield
         # read result
-        result = yield dut.n.data_o.data
-        # negate n.ready_i
-        yield dut.n.ready_i.eq(0)
+        result = yield dut.n.o_data.data
+        # negate n.i_ready
+        yield dut.n.i_ready.eq(0)
         # check result
         assert result == expected
 
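A possible pairing of the two helpers in a simulator process (a hedged
sketch: the enclosing test's sim object is assumed, and direction 0 means
shift left, per the Shifter docstring):

    def process():
        # one complete transaction: 0b1000 shifted left by 2 gives 0b100000
        yield from send(0b1000, 2, 0)
        yield from receive(0b100000)

    sim.add_sync_process(process)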
index dbe8465fbdff095736ad2a47332f1cab8172535b..459bbd951cb41a35e5f06089162e365fd8b03d9b 100644 (file)
@@ -9,7 +9,7 @@ A "real" integer ALU would place the answers onto the output bus after
 only one cycle (sync)
 """
 
-from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
+from nmigen import Elaboratable, Signal, Module, Const, Mux
 from nmigen.hdl.rec import Record, Layout
 from nmigen.cli import main
 from nmigen.cli import verilog, rtlil
@@ -28,6 +28,10 @@ from openpower.decoder.power_enums import MicrOp, Function, CryIn
 from soc.fu.alu.alu_input_record import CompALUOpSubset
 from soc.fu.cr.cr_input_record import CompCROpSubset
 
+from soc.fu.pipe_data import FUBaseData
+from soc.fu.alu.pipe_data import CommonPipeSpec
+from soc.fu.compunits.compunits import FunctionUnitBaseSingle
+
 import operator
 
 
@@ -105,42 +109,42 @@ class Dummy:
 class DummyALU(Elaboratable):
     def __init__(self, width):
         self.p = Dummy()  # make look like nmutil pipeline API
-        self.p.data_i = Dummy()
-        self.p.data_i.ctx = Dummy()
+        self.p.i_data = Dummy()
+        self.p.i_data.ctx = Dummy()
         self.n = Dummy()  # make look like nmutil pipeline API
-        self.n.data_o = Dummy()
-        self.p.valid_i = Signal()
-        self.p.ready_o = Signal()
-        self.n.ready_i = Signal()
-        self.n.valid_o = Signal()
+        self.n.o_data = Dummy()
+        self.p.i_valid = Signal()
+        self.p.o_ready = Signal()
+        self.n.i_ready = Signal()
+        self.n.o_valid = Signal()
         self.counter = Signal(4)
         self.op = CompCROpSubset()
         i = []
         i.append(Signal(width, name="i1"))
         i.append(Signal(width, name="i2"))
         i.append(Signal(width, name="i3"))
-        self.i = Array(i)
+        self.i = i
         self.a, self.b, self.c = i[0], i[1], i[2]
-        self.out = Array([Signal(width, name="alu_o")])
+        self.out = tuple([Signal(width, name="alu_o")])
         self.o = self.out[0]
         self.width = width
         # more "look like nmutil pipeline API"
-        self.p.data_i.ctx.op = self.op
-        self.p.data_i.a = self.a
-        self.p.data_i.b = self.b
-        self.p.data_i.c = self.c
-        self.n.data_o.o = self.o
+        self.p.i_data.ctx.op = self.op
+        self.p.i_data.a = self.a
+        self.p.i_data.b = self.b
+        self.p.i_data.c = self.c
+        self.n.o_data.o = self.o
 
     def elaborate(self, platform):
         m = Module()
 
         go_now = Signal(reset_less=True)  # testing no-delay ALU
 
-        with m.If(self.p.valid_i):
+        with m.If(self.p.i_valid):
             # input is valid. next check, if we already said "ready" or not
-            with m.If(~self.p.ready_o):
+            with m.If(~self.p.o_ready):
                 # we didn't say "ready" yet, so say so and initialise
-                m.d.sync += self.p.ready_o.eq(1)
+                m.d.sync += self.p.o_ready.eq(1)
 
                 m.d.sync += self.o.eq(self.a)
                 m.d.comb += go_now.eq(1)
@@ -149,14 +153,14 @@ class DummyALU(Elaboratable):
         with m.Else():
             # input says no longer valid, so drop ready as well.
             # a "proper" ALU would have had to sync in the opcode and a/b ops
-            m.d.sync += self.p.ready_o.eq(0)
+            m.d.sync += self.p.o_ready.eq(0)
 
         # ok so the counter's running: when it gets to 1, fire the output
         with m.If((self.counter == 1) | go_now):
             # set the output as valid if the recipient is ready for it
-            m.d.sync += self.n.valid_o.eq(1)
-        with m.If(self.n.ready_i & self.n.valid_o):
-            m.d.sync += self.n.valid_o.eq(0)
+            m.d.sync += self.n.o_valid.eq(1)
+        with m.If(self.n.i_ready & self.n.o_valid):
+            m.d.sync += self.n.o_valid.eq(0)
             # recipient said it was ready: reset back to known-good.
             m.d.sync += self.counter.eq(0)  # reset the counter
             m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
@@ -177,38 +181,86 @@ class DummyALU(Elaboratable):
     def ports(self):
         return list(self)
 
+#####################
+# converting even this dummy ALU over to the FunctionUnit RegSpecs API
+# which, errr, note that the regspecs are totally ignored below, but
+# at least the widths are all 64-bit so it's okay.
+#####################
+
+# input (and output) for logical initial stage (common input)
+
+
+class ALUInputData(FUBaseData):
+    regspec = [('INT', 'a', '0:63'),  # RA
+               ('INT', 'b', '0:63'),  # RB/immediate
+               ]
+
+    def __init__(self, pspec):
+        super().__init__(pspec, False)
+
+
+# output from ALU final stage
+class ALUOutputData(FUBaseData):
+    regspec = [('INT', 'o', '0:63'),        # RT
+               ]
+
+    def __init__(self, pspec):
+        super().__init__(pspec, True)
+
+
+# ALU pipe specification class
+class ALUPipeSpec(CommonPipeSpec):
+    regspec = (ALUInputData.regspec, ALUOutputData.regspec)
+    opsubsetkls = CompALUOpSubset
+
+
+class ALUFunctionUnit(FunctionUnitBaseSingle):
+    # class ALUFunctionUnit(FunctionUnitBaseMulti):
+    fnunit = Function.ALU
+
+    def __init__(self, idx, parent_pspec):
+        super().__init__(ALUPipeSpec, ALU, 1, parent_pspec)
+
 
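A small aside on the regspec format used above: each entry is a
(regfile, operand-name, bit-range) triple, and the '0:63' ranges are what
make every operand 64-bit here.  A hedged sketch of how an entry maps to a
width (the helper is illustrative, not part of the API):

    def regspec_width(entry):
        # ('INT', 'a', '0:63') -> operand 'a' in the INT regfile, 64 bits
        regfile, name, bits = entry
        lo, hi = (int(x) for x in bits.split(':'))
        return hi - lo + 1

    assert regspec_width(('INT', 'o', '0:63')) == 64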
 class ALU(Elaboratable):
     def __init__(self, width):
+        # XXX major temporary hack: attempting to convert
+        # ALU over to RegSpecs API, FunctionUnitBaseSingle passes in
+        # a regspec here which we can't cope with.  therefore, errr...
+        # just throw it away and set the width to 64
+        if not isinstance(width, int):
+            width = 64
+        # TODO: really this should just inherit from ControlBase, it would
+        # be a lot less messy.
         self.p = Dummy()  # make look like nmutil pipeline API
-        self.p.data_i = Dummy()
-        self.p.data_i.ctx = Dummy()
+        self.p.i_data = Dummy()
+        self.p.i_data.ctx = Dummy()
         self.n = Dummy()  # make look like nmutil pipeline API
-        self.n.data_o = Dummy()
-        self.p.valid_i = Signal()
-        self.p.ready_o = Signal()
-        self.n.ready_i = Signal()
-        self.n.valid_o = Signal()
+        self.n.o_data = Dummy()
+        self.p.i_valid = Signal()
+        self.p.o_ready = Signal()
+        self.n.i_ready = Signal()
+        self.n.o_valid = Signal()
         self.counter = Signal(4)
         self.op = CompALUOpSubset(name="op")
         i = []
         i.append(Signal(width, name="i1"))
         i.append(Signal(width, name="i2"))
-        self.i = Array(i)
+        self.i = i
         self.a, self.b = i[0], i[1]
         out = []
         out.append(Data(width, name="alu_o"))
         out.append(Data(width, name="alu_cr"))
-        self.out = Array(out)
+        self.out = tuple(out)
         self.o = self.out[0]
         self.cr = self.out[1]
         self.width = width
-        # more "look like nmutil pipeline API"
-        self.p.data_i.ctx.op = self.op
-        self.p.data_i.a = self.a
-        self.p.data_i.b = self.b
-        self.n.data_o.o = self.o
-        self.n.data_o.cr = self.cr
+        # more "look like nmutil ControlBase pipeline API" stuff
+        self.p.i_data.ctx.op = self.op
+        self.p.i_data.a = self.a
+        self.p.i_data.b = self.b
+        self.n.o_data.o = self.o
+        self.n.o_data.cr = self.cr
 
     def elaborate(self, platform):
         m = Module()
@@ -254,16 +306,16 @@ class ALU(Elaboratable):
         with m.If(go_now):
             # with a combinatorial, no-delay ALU, just pass through
             # the handshake signals to the other side
-            m.d.comb += self.p.ready_o.eq(self.n.ready_i)
-            m.d.comb += self.n.valid_o.eq(self.p.valid_i)
+            m.d.comb += self.p.o_ready.eq(self.n.i_ready)
+            m.d.comb += self.n.o_valid.eq(self.p.i_valid)
         with m.Else():
             # sequential ALU handshake:
-            # ready_o responds to valid_i, but only if the ALU is idle
-            m.d.comb += self.p.ready_o.eq(alu_idle)
-            # select the internally generated valid_o, above
-            m.d.comb += self.n.valid_o.eq(alu_done)
+            # o_ready responds to i_valid, but only if the ALU is idle
+            m.d.comb += self.p.o_ready.eq(alu_idle)
+            # select the internally generated o_valid, above
+            m.d.comb += self.n.o_valid.eq(alu_done)
 
-        # hold the ALU result until ready_o is asserted
+        # hold the ALU result until o_ready is asserted
         alu_r = Signal(self.width)
 
         # output masks
@@ -275,7 +327,7 @@ class ALU(Elaboratable):
         m.d.comb += self.cr.ok.eq(self.op.rc.rc)
 
         with m.If(alu_idle):
-            with m.If(self.p.valid_i):
+            with m.If(self.p.i_valid):
 
                 # as this is a "fake" pipeline, just grab the output right now
                 with m.If(self.op.insn_type == MicrOp.OP_ADD):
@@ -311,7 +363,7 @@ class ALU(Elaboratable):
                 with m.Else():
                     m.d.comb += go_now.eq(1)
 
-        with m.Elif(~alu_done | self.n.ready_i):
+        with m.Elif(~alu_done | self.n.i_ready):
             # decrement the counter while the ALU is neither idle nor finished
             m.d.sync += self.counter.eq(self.counter - 1)
 
@@ -337,10 +389,10 @@ class ALU(Elaboratable):
         yield self.a
         yield self.b
         yield from self.o.ports()
-        yield self.p.valid_i
-        yield self.p.ready_o
-        yield self.n.valid_o
-        yield self.n.ready_i
+        yield self.p.i_valid
+        yield self.p.o_ready
+        yield self.n.o_valid
+        yield self.n.i_ready
 
     def ports(self):
         return list(self)
@@ -362,22 +414,22 @@ class BranchOp(Elaboratable):
 class BranchALU(Elaboratable):
     def __init__(self, width):
         self.p = Dummy()  # make look like nmutil pipeline API
-        self.p.data_i = Dummy()
-        self.p.data_i.ctx = Dummy()
+        self.p.i_data = Dummy()
+        self.p.i_data.ctx = Dummy()
         self.n = Dummy()  # make look like nmutil pipeline API
-        self.n.data_o = Dummy()
-        self.p.valid_i = Signal()
-        self.p.ready_o = Signal()
-        self.n.ready_i = Signal()
-        self.n.valid_o = Signal()
+        self.n.o_data = Dummy()
+        self.p.i_valid = Signal()
+        self.p.o_ready = Signal()
+        self.n.i_ready = Signal()
+        self.n.o_valid = Signal()
         self.counter = Signal(4)
         self.op = Signal(2)
         i = []
         i.append(Signal(width, name="i1"))
         i.append(Signal(width, name="i2"))
-        self.i = Array(i)
+        self.i = i
         self.a, self.b = i[0], i[1]
-        self.out = Array([Signal(width)])
+        self.out = tuple([Signal(width)])
         self.o = self.out[0]
         self.width = width
 
@@ -399,11 +451,11 @@ class BranchALU(Elaboratable):
             ]
 
         go_now = Signal(reset_less=True)  # testing no-delay ALU
-        with m.If(self.p.valid_i):
+        with m.If(self.p.i_valid):
             # input is valid. next check, if we already said "ready" or not
-            with m.If(~self.p.ready_o):
+            with m.If(~self.p.o_ready):
                 # we didn't say "ready" yet, so say so and initialise
-                m.d.sync += self.p.ready_o.eq(1)
+                m.d.sync += self.p.o_ready.eq(1)
 
                 # as this is a "fake" pipeline, just grab the output right now
                 with m.Switch(self.op):
@@ -416,14 +468,14 @@ class BranchALU(Elaboratable):
         with m.Else():
             # input says no longer valid, so drop ready as well.
             # a "proper" ALU would have had to sync in the opcode and a/b ops
-            m.d.sync += self.p.ready_o.eq(0)
+            m.d.sync += self.p.o_ready.eq(0)
 
         # ok so the counter's running: when it gets to 1, fire the output
         with m.If((self.counter == 1) | go_now):
             # set the output as valid if the recipient is ready for it
-            m.d.sync += self.n.valid_o.eq(1)
-        with m.If(self.n.ready_i & self.n.valid_o):
-            m.d.sync += self.n.valid_o.eq(0)
+            m.d.sync += self.n.o_valid.eq(1)
+        with m.If(self.n.i_ready & self.n.o_valid):
+            m.d.sync += self.n.o_valid.eq(0)
             # recipient said it was ready: reset back to known-good.
             m.d.sync += self.counter.eq(0)  # reset the counter
             m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
@@ -449,28 +501,28 @@ def run_op(dut, a, b, op, inv_a=0):
     yield dut.b.eq(b)
     yield dut.op.insn_type.eq(op)
     yield dut.op.invert_in.eq(inv_a)
-    yield dut.n.ready_i.eq(0)
-    yield dut.p.valid_i.eq(1)
-    yield dut.n.ready_i.eq(1)
+    yield dut.n.i_ready.eq(0)
+    yield dut.p.i_valid.eq(1)
+    yield dut.n.i_ready.eq(1)
     yield
 
     # wait for the ALU to accept our input data
-    while not (yield dut.p.ready_o):
+    while not (yield dut.p.o_ready):
         yield
 
-    yield dut.p.valid_i.eq(0)
+    yield dut.p.i_valid.eq(0)
     yield dut.a.eq(0)
     yield dut.b.eq(0)
     yield dut.op.insn_type.eq(0)
     yield dut.op.invert_in.eq(0)
 
     # wait for the ALU to present the output data
-    while not (yield dut.n.valid_o):
+    while not (yield dut.n.o_valid):
         yield
 
     # latch the result and lower read_i
     result = yield dut.o.data
-    yield dut.n.ready_i.eq(0)
+    yield dut.n.i_ready.eq(0)
 
     return result
 
@@ -520,21 +572,21 @@ def test_alu_parallel():
     sim.add_clock(1e-6)
 
     def send(a, b, op, inv_a=0, rc=0):
-        # present input data and assert valid_i
+        # present input data and assert i_valid
         yield dut.a.eq(a)
         yield dut.b.eq(b)
         yield dut.op.insn_type.eq(op)
         yield dut.op.invert_in.eq(inv_a)
         yield dut.op.rc.rc.eq(rc)
-        yield dut.p.valid_i.eq(1)
+        yield dut.p.i_valid.eq(1)
         yield
-        # wait for ready_o to be asserted
-        while not (yield dut.p.ready_o):
+        # wait for o_ready to be asserted
+        while not (yield dut.p.o_ready):
             yield
-        # clear input data and negate valid_i
+        # clear input data and negate i_valid
         # if send is called again immediately afterwards, there will be no
         # visible transition (they will not be negated, after all)
-        yield dut.p.valid_i.eq(0)
+        yield dut.p.i_valid.eq(0)
         yield dut.a.eq(0)
         yield dut.b.eq(0)
         yield dut.op.insn_type.eq(0)
@@ -543,18 +595,18 @@ def test_alu_parallel():
 
     def receive():
         # signal readiness to receive data
-        yield dut.n.ready_i.eq(1)
+        yield dut.n.i_ready.eq(1)
         yield
-        # wait for valid_o to be asserted
-        while not (yield dut.n.valid_o):
+        # wait for o_valid to be asserted
+        while not (yield dut.n.o_valid):
             yield
         # read results
         result = yield dut.o.data
         cr = yield dut.cr.data
-        # negate ready_i
+        # negate i_ready
         # if receive is called again immediately afterwards, there will be no
         # visible transition (it will not be negated, after all)
-        yield dut.n.ready_i.eq(0)
+        yield dut.n.i_ready.eq(0)
         return result, cr
 
     def producer():
@@ -650,10 +702,10 @@ def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
         'i2[15:0]',
         'op__insn_type' if pysim else 'op__insn_type[6:0]',
         'op__invert_in',
-        'valid_i',
-        'ready_o',
-        'valid_o',
-        'ready_i',
+        'i_valid',
+        'o_ready',
+        'o_valid',
+        'i_ready',
         'alu_o[15:0]',
         'alu_o_ok',
         'alu_cr[15:0]',
index 05539cd485ac833266595e23a92d7034c90d5d67..ff05b48f5717b642ee51d96d830391b9e21967ab 100644 (file)
@@ -61,9 +61,9 @@ class ComputationUnitNoDelay(Elaboratable):
         self.src2_i = Signal(rwid, reset_less=True)  # oper2 in
 
         self.busy_o = Signal(reset_less=True)  # fn busy out
-        self.data_o = Signal(rwid, reset_less=True)  # Dest out
+        self.o_data = Signal(rwid, reset_less=True)  # Dest out
         self.rd_rel_o = Signal(reset_less=True)  # release src1/src2 request
-        # release request out (valid_o)
+        # release request out (o_valid)
         self.req_rel_o = Signal(reset_less=True)
         self.done_o = self.req_rel_o  # 'normalise' API
 
@@ -133,21 +133,21 @@ class ComputationUnitNoDelay(Elaboratable):
         # NOTE: this spells TROUBLE if the ALU isn't ready!
         # go_read is only valid for one clock!
         with m.If(self.go_rd_i):                     # src operands ready, GO!
-            with m.If(~self.alu.p_ready_o):          # no ACK yet
-                m.d.comb += self.alu.p_valid_i.eq(1)  # so indicate valid
+            with m.If(~self.alu.p_o_ready):          # no ACK yet
+                m.d.comb += self.alu.p_i_valid.eq(1)  # so indicate valid
 
         # only proceed if ALU says its output is valid
-        with m.If(self.alu.n_valid_o):
+        with m.If(self.alu.n_o_valid):
             # when ALU ready, write req release out. waits for shadow
             m.d.comb += self.req_rel_o.eq(req_l.q & busy_o & self.shadown_i)
             # when output latch is ready, and ALU says ready, accept ALU output
             with m.If(self.req_rel_o & self.go_wr_i):
                 # tells ALU "thanks got it"
-                m.d.comb += self.alu.n_ready_i.eq(1)
+                m.d.comb += self.alu.n_i_ready.eq(1)
 
         # output the data from the latch on go_write
         with m.If(self.go_wr_i):
-            m.d.comb += self.data_o.eq(data_r)
+            m.d.comb += self.o_data.eq(data_r)
 
         return m
 
@@ -163,7 +163,7 @@ class ComputationUnitNoDelay(Elaboratable):
         yield self.busy_o
         yield self.rd_rel_o
         yield self.req_rel_o
-        yield self.data_o
+        yield self.o_data
 
     def ports(self):
         return list(self)
@@ -192,18 +192,18 @@ def op_sim(dut, a, b, op, inv_a=0, imm=0, imm_ok=0):
     yield
     yield dut.go_rd_i.eq(0)
     req_rel_o = yield dut.req_rel_o
-    result = yield dut.data_o
+    result = yield dut.o_data
     print("req_rel", req_rel_o, result)
     while True:
         req_rel_o = yield dut.req_rel_o
-        result = yield dut.data_o
+        result = yield dut.o_data
         print("req_rel", req_rel_o, result)
         if req_rel_o:
             break
         yield
     yield dut.go_wr_i.eq(1)
     yield
-    result = yield dut.data_o
+    result = yield dut.o_data
     print("result", result)
     yield dut.go_wr_i.eq(0)
     yield
index d7e32f28c556e76aff9be146ce280eba9745bb09..f76e40660ac3181bda5e4af48ef8aba97f952964 100644 (file)
@@ -106,10 +106,12 @@ class CompUnitRecord(RegSpec, RecordObject):
         # output (busy/done)
         self.busy_o = Signal(name="cu_busy_o", reset_less=True)  # fn busy out
         self.done_o = Signal(name="cu_done_o", reset_less=True)
+        self.alu_done_o = Signal(name="cu_alu_done_o", reset_less=True)
 
 
 class MultiCompUnit(RegSpecALUAPI, Elaboratable):
-    def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1, name=None):
+    def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1, name=None,
+                       sync_rw=True):
         """MultiCompUnit
 
         * :rwid:        width of register latches (TODO: allocate per regspec)
@@ -119,6 +121,7 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         * :n_dst:       number of destination operands
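+        * :sync_rw:     use sync (default) or comb domain for state latches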
         """
         RegSpecALUAPI.__init__(self, rwid, alu)
+        self.sync_rw = sync_rw
         self.alu_name = name or "alu"
         self.opsubsetkls = opsubsetkls
         self.cu = cu = CompUnitRecord(opsubsetkls, rwid, n_src, n_dst,
@@ -143,6 +146,7 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         self.wr = cu.wr
         self.rdmaskn = cu.rdmaskn
         self.wrmask = cu.wrmask
+        self.alu_done_o = cu.alu_done_o
         self.go_rd_i = self.rd.go_i  # temporary naming
         self.go_wr_i = self.wr.go_i  # temporary naming
         self.rd_rel_o = self.rd.rel_o  # temporary naming
@@ -157,7 +161,7 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
 
         self.busy_o = cu.busy_o
         self.dest = cu._dest
-        self.data_o = self.dest[0]  # Dest out
+        self.o_data = self.dest[0]  # Dest out
         self.done_o = cu.done_o
 
     def _mux_op(self, m, sl, op_is_imm, imm, i):
@@ -174,7 +178,16 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
 
     def elaborate(self, platform):
         m = Module()
-        setattr(m.submodules, self.alu_name, self.alu)
+        if self.sync_rw:
+            rw_domain = m.d.sync
+        else:
+            rw_domain = m.d.comb
+        # add the ALU to the MultiCompUnit only if it is a "real" ALU
+        # see AllFunctionUnits as to why: a FunctionUnitBaseMulti
+        # only has one "real" ALU but multiple pseudo front-ends,
+        # aka "ReservationStations" (ALUProxy "fronts")
+        if isinstance(self.alu, Elaboratable):
+            setattr(m.submodules, self.alu_name, self.alu)
         m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
         m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
         m.submodules.req_l = req_l = SRLatch(False, self.n_dst, name="req")
@@ -193,10 +206,10 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         m.d.comb += all_rd_pulse.eq(rising_edge(m, all_rd))
 
         # create rising pulse from alu valid condition.
-        alu_done = Signal(reset_less=True)
+        alu_done = self.cu.alu_done_o
         alu_pulse = Signal(reset_less=True)
         alu_pulsem = Signal(self.n_dst, reset_less=True)
-        m.d.comb += alu_done.eq(self.alu.n.valid_o)
+        m.d.comb += alu_done.eq(self.alu.n.o_valid)
         m.d.comb += alu_pulse.eq(rising_edge(m, alu_done))
         m.d.comb += alu_pulsem.eq(Repl(alu_pulse, self.n_dst))
 
@@ -210,16 +223,14 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         # is enough, when combined with when read-phase is done (rst_l.q)
         wr_any = Signal(reset_less=True)
         req_done = Signal(reset_less=True)
-        m.d.comb += self.done_o.eq(self.busy_o &
-                                   ~((self.wr.rel_o & ~self.wrmask).bool()))
+        m.d.comb += self.done_o.eq(self.busy_o & ~(self.wr.rel_o).bool())
         m.d.comb += wr_any.eq(self.wr.go_i.bool() | prev_wr_go.bool())
-        m.d.comb += req_done.eq(wr_any & ~self.alu.n.ready_i &
-                                ((req_l.q & self.wrmask) == 0))
+        m.d.comb += req_done.eq(wr_any & ~self.alu.n.i_ready & (req_l.q == 0))
         # argh, complicated hack: if there are no regs to write,
         # instead of waiting for regs that are never going to happen,
         # we indicate "done" when the ALU is "done"
         with m.If((self.wrmask == 0) &
-                  self.alu.n.ready_i & self.alu.n.valid_o & self.busy_o):
+                  self.alu.n.i_ready & self.alu.n.o_valid & self.busy_o):
             m.d.comb += req_done.eq(1)
 
         # shadow/go_die
@@ -233,24 +244,26 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         m.d.comb += reset_r.eq(self.rd.go_i | Repl(self.go_die_i, self.n_src))
 
         # read-done,wr-proceed latch
-        m.d.sync += rok_l.s.eq(self.issue_i)  # set up when issue starts
-        m.d.sync += rok_l.r.eq(self.alu.n.valid_o & self.busy_o)  # ALU done
+        rw_domain += rok_l.s.eq(self.issue_i)  # set up when issue starts
+        rw_domain += rok_l.r.eq(self.alu.n.o_valid & self.busy_o)  # ALU done
 
         # wr-done, back-to-start latch
-        m.d.sync += rst_l.s.eq(all_rd)     # set when read-phase is fully done
-        m.d.sync += rst_l.r.eq(rst_r)        # *off* on issue
+        rw_domain += rst_l.s.eq(all_rd)     # set when read-phase is fully done
+        rw_domain += rst_l.r.eq(rst_r)        # *off* on issue
 
         # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
         m.d.sync += opc_l.s.eq(self.issue_i)       # set on issue
         m.d.sync += opc_l.r.eq(req_done)  # reset on ALU
 
-        # src operand latch (not using go_wr_i)
-        m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src))
+        # src operand latch (not using go_wr_i) ANDed with rdmask
+        rdmaskn = Signal(self.n_src)
+        latchregister(m, self.rdmaskn, rdmaskn, self.issue_i, name="rdmask_l")
+        m.d.comb += src_l.s.eq(Repl(self.issue_i, self.n_src) & ~rdmaskn)
         m.d.sync += src_l.r.eq(reset_r)
 
         # dest operand latch (not using issue_i)
-        m.d.sync += req_l.s.eq(alu_pulsem & self.wrmask)
-        m.d.sync += req_l.r.eq(reset_w | prev_wr_go)
+        rw_domain += req_l.s.eq(alu_pulsem & self.wrmask)
+        m.d.comb += req_l.r.eq(reset_w | prev_wr_go)
 
         # pass operation to the ALU (sync: plenty time to wait for src reads)
         op = self.get_op()
@@ -264,20 +277,27 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
             name = "data_r%d" % i
             lro = self.get_out(i)
             ok = Const(1, 1)
+            data_r_ok = Const(1, 1)
             if isinstance(lro, Record):
+                print("wr fields", i, lro, lro.fields)
                 data_r = Record.like(lro, name=name)
-                print("wr fields", i, lro, data_r.fields)
                 # bye-bye abstract interface design..
-                fname = find_ok(data_r.fields)
+                fname = find_ok(lro.fields)
                 if fname:
                     ok = getattr(lro, fname)
+                    data_r_ok = getattr(data_r, fname)
+                # write-ok based on incoming output *and* whether the latched
+                # data was ok.
+                # XXX fails - wrok.append((ok|data_r_ok) & self.busy_o)
+                wrok.append(ok & self.busy_o)
             else:
-                data_r = Signal.like(lro, name=name, reset_less=True)
-            wrok.append(ok & self.busy_o)
-            with m.If(alu_pulse):
-                m.d.sync += data_r.eq(lro)
+                data_r = Signal.like(lro, name=name)
+                # really should retire this but it's part of unit tests
+                wrok.append(ok & self.busy_o)
+            #latchregister(m, lro, data_r, ok & self.busy_o, name=name)
+            latchregister(m, lro, data_r, alu_pulse, name=name)
             with m.If(self.issue_i):
-                m.d.sync += data_r.eq(0)
+                m.d.comb += data_r.eq(0)
             drl.append(data_r)
 
         # ok, above we collated anything with an "ok" on the output side
@@ -315,7 +335,10 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         # create a latch/register for src1/src2 (even if it is a copy of imm)
         for i in range(self.n_src):
             src, alusrc, latch, _ = sl[i]
-            latchregister(m, src, alusrc, latch, name="src_r%d" % i)
+            reg = latchregister(m, src, alusrc, latch, name="src_r%d" % i)
+            # rdmask stops src latches from being set.  clear all if not busy
+            with m.If(~self.busy_o):
+                m.d.sync += reg.eq(0)
 
         # -----
         # ALU connection / interaction
@@ -323,15 +346,15 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
 
         # on a go_read, tell the ALU we're accepting data.
         m.submodules.alui_l = alui_l = SRLatch(False, name="alui")
-        m.d.comb += self.alu.p.valid_i.eq(alui_l.q)
-        m.d.sync += alui_l.r.eq(self.alu.p.ready_o & alui_l.q)
+        m.d.comb += self.alu.p.i_valid.eq(alui_l.q)
+        m.d.sync += alui_l.r.eq(self.alu.p.o_ready & alui_l.q)
         m.d.comb += alui_l.s.eq(all_rd_pulse)
 
         # ALU output "ready" side.  alu "ready" indication stays hi until
         # ALU says "valid".
         m.submodules.alu_l = alu_l = SRLatch(False, name="alu")
-        m.d.comb += self.alu.n.ready_i.eq(alu_l.q)
-        m.d.sync += alu_l.r.eq(self.alu.n.valid_o & alu_l.q)
+        m.d.comb += self.alu.n.i_ready.eq(alu_l.q)
+        m.d.sync += alu_l.r.eq(self.alu.n.o_valid & alu_l.q)
         m.d.comb += alu_l.s.eq(all_rd_pulse)
 
         # -----
@@ -343,12 +366,15 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         m.d.comb += self.busy_o.eq(opc_l.q)  # busy out
 
         # read-release gated by busy (and read-mask)
-        bro = Repl(self.busy_o, self.n_src)
-        m.d.comb += self.rd.rel_o.eq(src_l.q & bro & slg & ~self.rdmaskn)
+        if True:  # self.sync_rw - experiment (doesn't work)
+            bro = Repl(self.busy_o, self.n_src)
+        else:
+            bro = Repl(self.busy_o|self.issue_i, self.n_src)
+        m.d.comb += self.rd.rel_o.eq(src_l.q & bro & slg)
 
         # write-release gated by busy and by shadow (and write-mask)
         brd = Repl(self.busy_o & self.shadown_i, self.n_dst)
-        m.d.comb += self.wr.rel_o.eq(req_l.q & brd & self.wrmask)
+        m.d.comb += self.wr.rel_o.eq(req_l.q_int & brd)
 
         # output the data from the latch on go_write
         for i in range(self.n_dst):
@@ -372,7 +398,7 @@ class MultiCompUnit(RegSpecALUAPI, Elaboratable):
         yield self.busy_o
         yield self.rd.rel_o
         yield self.wr.rel_o
-        yield self.data_o
+        yield self.o_data
 
     def ports(self):
         return list(self)
index 7dfdb15ecb45a0f41449418db612c3f774cddccc..d548f90c53c95fe1c85af5d10c88ea42d44780ed 100644 (file)
@@ -20,6 +20,11 @@ Loads are activated when Go_Write[0] is enabled.  The EA is computed,
 and (as long as there was no exception) the data comes out (at any
 time from the PortInterface), and is captured by the LDCompSTUnit.
 
+TODO: dcbz, yes, that's going to be complicated: it has to be done
+ with great care, to detect the case when dcbz is set
+ and *not* expect to read any data, just the address.
+ So: wait for RA but not RB.
+
 Both LD and ST may request that the address be computed from summing
 operand1 (src[0]) with operand2 (src[1]) *or* by summing operand1 with
 the immediate (from the opcode).
@@ -53,6 +58,8 @@ the nested FSMs below are *combinatorial*).
 
     * A third FSM activates to cover ST.  it activates if op_is_st is true
 
+    * TODO document DCBZ (not complete yet)
+
     * The "overall" (fourth) FSM coordinates the progression and completion
       of the three other FSMs, firing "WR_RESET" which switches off "busy"
 
@@ -80,7 +87,7 @@ Terminology:
 
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl
+from nmigen import Module, Signal, Mux, Cat, Elaboratable, Array, Repl, C
 from nmigen.hdl.rec import Record, Layout
 
 from nmutil.latch import SRLatch, latchregister
@@ -96,6 +103,10 @@ from openpower.decoder.power_enums import MicrOp, Function, LDSTMode
 from soc.fu.ldst.ldst_input_record import CompLDSTOpSubset
 from openpower.decoder.power_decoder2 import Data
 from openpower.consts import MSR
+from soc.config.test.test_loadstore import TestMemPspec
+
+# for debugging dcbz
+from nmutil.util import Display
 
 
 # TODO: LDSTInputData and LDSTOutputData really should be used
@@ -137,7 +148,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
     Data (outputs)
     --------------
-    * :data_o:  Dest out (LD)          - managed by wr[0] go/req
+    * :o_data:  Dest out (LD)          - managed by wr[0] go/req
     * :addr_o:  Address out (LD or ST) - managed by wr[1] go/req
     * :exc_o:   Address/Data Exception occurred.  LD/ST must terminate
 
@@ -178,17 +189,17 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
     TODO: use one module for the byte-reverse as it's quite expensive in gates
     """
 
-    def __init__(self, pi=None, rwid=64, awid=48, opsubset=CompLDSTOpSubset,
+    def __init__(self, pi=None, rwid=64, awid=64, opsubset=CompLDSTOpSubset,
                  debugtest=False, name=None):
         super().__init__(rwid)
         self.awid = awid
         self.pi = pi
         self.cu = cu = LDSTCompUnitRecord(rwid, opsubset, name=name)
-        self.debugtest = debugtest
+        self.debugtest = debugtest # enable debug output for unit testing
 
         # POWER-compliant LD/ST has index and update: *fixed* number of ports
         self.n_src = n_src = 3   # RA, RB, RT/RS
-        self.n_dst = n_dst = 2  # RA, RT/RS
+        self.n_dst = n_dst = 3  # RA, RT/RS, CR0
 
         # set up array of src and dest signals
         for i in range(n_src):
@@ -232,8 +243,9 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         self.oper_i = cu.oper_i
         self.src_i = cu._src_i
 
-        self.data_o = Data(self.data_wid, name="o")  # Dest1 out: RT
+        self.o_data = Data(self.data_wid, name="o")  # Dest1 out: RT
         self.addr_o = Data(self.data_wid, name="ea")  # Addr out: Update => RA
+        self.cr_o = Data(4, name="cr0")  # CR0 (for stdcx etc)
         self.exc_o = cu.exc_o
         self.done_o = cu.done_o
         self.busy_o = cu.busy_o
@@ -262,6 +274,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         m.submodules.sto_l = sto_l = SRLatch(sync=False, name="sto")
         m.submodules.wri_l = wri_l = SRLatch(sync=False, name="wri")
         m.submodules.upd_l = upd_l = SRLatch(sync=False, name="upd")
+        m.submodules.cr0_l = cr0_l = SRLatch(sync=False, name="cr0")
         m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
         m.submodules.lsd_l = lsd_l = SRLatch(sync=False, name="lsd") # done
 
@@ -271,6 +284,9 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         # opcode decode
         op_is_ld = Signal(reset_less=True)
         op_is_st = Signal(reset_less=True)
+        op_is_dcbz = Signal(reset_less=True)
+        op_is_st_or_dcbz = Signal(reset_less=True)
+        op_is_atomic = Signal(reset_less=True)
 
         # ALU/LD data output control
         alu_valid = Signal(reset_less=True)  # ALU operands are valid
@@ -281,6 +297,8 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         rda_any = Signal(reset_less=True)   # any read for address ops
         rd_done = Signal(reset_less=True)   # all *necessary* operands read
         wr_reset = Signal(reset_less=True)  # final reset condition
+        canceln = Signal(reset_less=True)   # cancel (active low)
+        store_done = Signal(reset_less=True) # store has been actioned
 
         # LD and ALU out
         alu_o = Signal(self.data_wid, reset_less=True)
@@ -293,26 +311,41 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         reset_o = Signal(reset_less=True)             # reset opcode
         reset_w = Signal(reset_less=True)             # reset write
         reset_u = Signal(reset_less=True)             # reset update
+        reset_c = Signal(reset_less=True)             # reset cr0
         reset_a = Signal(reset_less=True)             # reset adr latch
         reset_i = Signal(reset_less=True)             # issue|die (use a lot)
         reset_r = Signal(self.n_src, reset_less=True)  # reset src
         reset_s = Signal(reset_less=True)             # reset store
 
-        comb += reset_i.eq(issue_i | self.go_die_i)       # various
-        comb += reset_o.eq(self.done_o | self.go_die_i)      # opcode reset
-        comb += reset_w.eq(self.wr.go_i[0] | self.go_die_i)  # write reg 1
-        comb += reset_u.eq(self.wr.go_i[1] | self.go_die_i)  # update (reg 2)
-        comb += reset_s.eq(self.go_st_i | self.go_die_i)  # store reset
-        comb += reset_r.eq(self.rd.go_i | Repl(self.go_die_i, self.n_src))
-        comb += reset_a.eq(self.go_ad_i | self.go_die_i)
+        # end execution when a terminating condition is detected:
+        # - go_die_i: a speculative operation was cancelled
+        # - exc_o.happened: an exception has occurred
+        terminate = Signal()
+        comb += terminate.eq(self.go_die_i | self.exc_o.happened)
+
+        comb += reset_i.eq(issue_i | terminate)       # various
+        comb += reset_o.eq(self.done_o | terminate)      # opcode reset
+        comb += reset_w.eq(self.wr.go_i[0] | terminate)  # write reg 1
+        comb += reset_u.eq(self.wr.go_i[1] | terminate)  # update (reg 2)
+        comb += reset_c.eq(self.wr.go_i[2] | terminate)  # cr0 (reg 3)
+        comb += reset_s.eq(self.go_st_i | terminate)  # store reset
+        comb += reset_r.eq(self.rd.go_i | Repl(terminate, self.n_src))
+        comb += reset_a.eq(self.go_ad_i | terminate)
 
         p_st_go = Signal(reset_less=True)
         sync += p_st_go.eq(self.st.go_i)
 
         # decode bits of operand (latched)
         oper_r = CompLDSTOpSubset(name="oper_r")  # Dest register
-        comb += op_is_st.eq(oper_r.insn_type == MicrOp.OP_STORE)  # ST
-        comb += op_is_ld.eq(oper_r.insn_type == MicrOp.OP_LOAD)  # LD
+        comb += op_is_st.eq(oper_r.insn_type == MicrOp.OP_STORE)   # ST
+        comb += op_is_ld.eq(oper_r.insn_type == MicrOp.OP_LOAD)    # LD
+        comb += op_is_dcbz.eq(oper_r.insn_type == MicrOp.OP_DCBZ)  # DCBZ
+        comb += op_is_atomic.eq(oper_r.reserve) # atomic LR/SC
+        comb += op_is_st_or_dcbz.eq(op_is_st | op_is_dcbz)
+        # dcbz is a special case of store
+        # uncomment if needed:
+        #comb += Display("compldst_multi: op_is_dcbz = %i",
+        #                (oper_r.insn_type == MicrOp.OP_DCBZ))
         op_is_update = oper_r.ldst_mode == LDSTMode.update           # UPDATE
         op_is_cix = oper_r.ldst_mode == LDSTMode.cix           # cache-inhibit
         comb += self.load_mem_o.eq(op_is_ld & self.go_ad_i)
@@ -328,6 +361,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         #       - alu_l : looks after add of src1/2/imm (EA)
         #       - adr_l : waits for add (EA)
         #       - upd_l : waits for adr and Regfile (port 2)
+        #       - cr0_l : waits for Rc=1 and CR0 Regfile (port 3)
         #    - src_l[2] : ST
         # - lod_l       : waits for adr (EA) and for LD Data
         # - wri_l       : waits for LD Data and Regfile (port 1)
@@ -342,8 +376,9 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         sync += opc_l.r.eq(reset_o)  # XXX NOTE: INVERTED FROM book!
 
         # src operand latch
-        sync += src_l.s.eq(Repl(issue_i, self.n_src))
+        sync += src_l.s.eq(Repl(issue_i, self.n_src) & ~self.rdmaskn)
         sync += src_l.r.eq(reset_r)
+        #### sync += Display("reset_r = %i",reset_r)
 
         # alu latch.  use sync-delay between alu_ok and valid to generate pulse
         comb += alu_l.s.eq(reset_i)
@@ -365,12 +400,17 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
                             #self.done_o | (self.pi.busy_o & op_is_update),
                                           self.n_dst))
 
+        # CR0 operand latch (CR0 written to reg 3 if Rc=1)
+        op_is_rc1 = self.oper_i.rc.rc & self.oper_i.rc.ok
+        comb += cr0_l.s.eq(issue_i & op_is_rc1)
+        sync += cr0_l.r.eq(reset_c)
+
         # update-mode operand latch (EA written to reg 2)
         sync += upd_l.s.eq(reset_i)
         sync += upd_l.r.eq(reset_u)
 
         # store latch
-        comb += sto_l.s.eq(addr_ok & op_is_st)
+        comb += sto_l.s.eq(addr_ok & op_is_st_or_dcbz)
         sync += sto_l.r.eq(reset_s | p_st_go)
 
         # ld/st done.  needed to stop LD/ST from activating repeatedly
@@ -384,13 +424,18 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         # create a latch/register for the operand
         with m.If(self.issue_i):
             sync += oper_r.eq(self.oper_i)
-        with m.If(self.done_o):
+        with m.If(self.done_o | terminate):
             sync += oper_r.eq(0)
 
-        # and for LD
+        # and for LD and store-done
         ldd_r = Signal(self.data_wid, reset_less=True)  # Dest register
         latchregister(m, ldd_o, ldd_r, ld_ok, name="ldo_r")
 
+        # store actioned, communicate through CR0 (for atomic LR/SC)
+        latchregister(m, self.pi.store_done.data, store_done,
+                         self.pi.store_done.ok,
+                         name="std_r")
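
latchregister (from nmutil.latch) captures its input while the enable is
high, passing it through transparently, and holds the captured value once
the enable drops.  A simplified sketch of the idiom -- an approximation for
illustration, not nmutil's exact implementation:

    from nmigen import Module, Signal

    def latchregister_sketch(m, incoming, outgoing, enable):
        reg = Signal.like(incoming)
        with m.If(enable):
            m.d.sync += reg.eq(incoming)       # capture while enabled
            m.d.comb += outgoing.eq(incoming)  # transparent pass-through
        with m.Else():
            m.d.comb += outgoing.eq(reg)       # hold last captured value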
+
         # and for each input from the incoming src operands
         srl = []
         for i in range(self.n_src):
@@ -418,7 +463,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         # now do the ALU addr add: one cycle, and say "ready" (next cycle, too)
         comb += alu_o.eq(src1_or_z + src2_or_imm)  # actual EA
-        m.d.sync += alu_ok.eq(alu_valid)             # keep ack in sync with EA
+        m.d.sync += alu_ok.eq(alu_valid & canceln) # keep ack in sync with EA
 
         ############################
         # Control Signal calculation
@@ -429,15 +474,16 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         # 1st operand read-request only when zero not active
         # 2nd operand only needed when immediate is not active
-        slg = Cat(op_is_z, op_is_imm)
+        slg = Cat(op_is_z, op_is_imm)  # XXX is this correct?
         bro = Repl(self.busy_o, self.n_src)
-        comb += self.rd.rel_o.eq(src_l.q & bro & ~slg & ~self.rdmaskn)
+        comb += self.rd.rel_o.eq(src_l.q & bro & ~slg)
 
         # note when the address-related read "go" signals are active
         comb += rda_any.eq(self.rd.go_i[0] | self.rd.go_i[1])
 
         # alu input valid when 1st and 2nd ops done (or imm not active)
-        comb += alu_valid.eq(busy_o & ~(self.rd.rel_o[0] | self.rd.rel_o[1]))
+        comb += alu_valid.eq(busy_o & ~(self.rd.rel_o[0] | self.rd.rel_o[1]) &
+                             canceln)
 
         # 3rd operand only needed when operation is a store
         comb += self.rd.rel_o[2].eq(src_l.q[2] & busy_o & op_is_st)
@@ -449,28 +495,33 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         comb += self.adr_rel_o.eq(alu_valid & adr_l.q & busy_o)
 
         # the write/store (etc) all must be cancelled if an exception occurs
-        cancel = Signal(reset_less=True)
-        comb += cancel.eq(self.exc_o.happened | self.shadown_i)
+        # note: canceln is active-low, like shadown_i,
+        #       while exc_o.happened is active-high
+        comb += canceln.eq(~self.exc_o.happened & self.shadown_i)
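
A standalone sketch of the active-low convention (names are illustrative):
the qualifier stays high only while no exception has happened and the shadow
-- itself active-low -- has been dropped, and every release term then simply
ANDs it in:

    from nmigen import Module, Signal

    m = Module()
    exc_happened, shadown_i = Signal(), Signal(reset=1)
    store_ready, canceln, st_rel_o = Signal(), Signal(), Signal()
    # active-low "proceed": high unless an exception happened or the
    # shadow is still raised (shadown_i is also active-low)
    m.d.comb += canceln.eq(~exc_happened & shadown_i)
    m.d.comb += st_rel_o.eq(store_ready & canceln)  # gated release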
 
         # store release when st ready *and* all operands read (and no shadow)
-        comb += self.st.rel_o.eq(sto_l.q & busy_o & rd_done & op_is_st &
-                               cancel)
+        # dcbz is a special case of store -- TODO: verify shadows
+        comb += self.st.rel_o.eq(sto_l.q & busy_o & rd_done & op_is_st_or_dcbz &
+                               canceln)
 
         # request write of LD result.  waits until shadow is dropped.
         comb += self.wr.rel_o[0].eq(rd_done & wri_l.q & busy_o & lod_l.qn &
-                                  op_is_ld & cancel)
+                                  op_is_ld & canceln)
 
         # request write of EA result only in update mode
         comb += self.wr.rel_o[1].eq(upd_l.q & busy_o & op_is_update &
-                                  alu_valid & cancel)
+                                  alu_valid & canceln)
+
+        # request write of CR0 result only in reserve and Rc=1
+        comb += self.wr.rel_o[2].eq(cr0_l.q & busy_o & op_is_atomic &
+                                  alu_valid & canceln)
 
         # provide "done" signal: select req_rel for non-LD/ST, adr_rel for LD/ST
         comb += wr_any.eq(self.st.go_i | p_st_go |
-                          self.wr.go_i[0] | self.wr.go_i[1])
-        comb += wr_reset.eq(rst_l.q & busy_o & cancel &
-                            ~(self.st.rel_o | self.wr.rel_o[0] |
-                              self.wr.rel_o[1]) &
-                            (lod_l.qn | op_is_st)
+                          self.wr.go_i.bool())
+        comb += wr_reset.eq(rst_l.q & busy_o & canceln &
+                            ~(self.st.rel_o | self.wr.rel_o.bool()) &
+                            (lod_l.qn | op_is_st_or_dcbz)
                             )
         comb += self.done_o.eq(wr_reset & (~self.pi.busy_o | op_is_ld))
 
@@ -478,18 +529,27 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         # Data/Address outputs
 
         # put the LD-output register directly onto the output bus on a go_write
-        comb += self.data_o.data.eq(self.dest[0])
+        comb += self.o_data.data.eq(self.dest[0])
+        comb += self.o_data.ok.eq(self.wr.rel_o[0])
         with m.If(self.wr.go_i[0]):
             comb += self.dest[0].eq(ldd_r)
 
         # "update" mode, put address out on 2nd go-write
         comb += self.addr_o.data.eq(self.dest[1])
+        comb += self.addr_o.ok.eq(self.wr.rel_o[1])
         with m.If(op_is_update & self.wr.go_i[1]):
             comb += self.dest[1].eq(addr_r)
 
+        # fun-fun-fun, calculate CR0 when Rc=1 is requested.
+        cr0 = self.dest[2]
+        comb += self.cr_o.data.eq(cr0)
+        comb += self.cr_o.ok.eq(self.wr.rel_o[2])
+        with m.If(cr0_l.q):
+            comb += cr0.eq(Cat(C(0, 1), store_done, C(0, 2)))
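
Cat() concatenates LSB-first, so the expression above packs cr0 as bit 0 = 0,
bit 1 = store_done, bits 2-3 = 0.  The packing is what the code shows; the
interpretation (which CR0 flag that bit represents) depends on the CR
bit-ordering convention used by the regfiles, so by inference this places
the store-done flag where stcx. signals success.  Standalone sketch:

    from nmigen import Module, Signal, Cat, C

    m = Module()
    store_done = Signal()
    cr0 = Signal(4)
    # Cat packs LSB-first: cr0[0]=0, cr0[1]=store_done, cr0[2:4]=0
    m.d.comb += cr0.eq(Cat(C(0, 1), store_done, C(0, 2)))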
+
         # need to look like MultiCompUnit: put wrmask out.
         # XXX may need to make this enable only when write active
-        comb += self.wrmask.eq(bro & Cat(op_is_ld, op_is_update))
+        comb += self.wrmask.eq(bro & Cat(op_is_ld, op_is_update, cr0_l.q))
 
         ###########################
         # PortInterface connections
@@ -497,15 +557,28 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
 
         # connect to LD/ST PortInterface.
         comb += pi.is_ld_i.eq(op_is_ld & busy_o)  # decoded-LD
-        comb += pi.is_st_i.eq(op_is_st & busy_o)  # decoded-ST
+        comb += pi.is_st_i.eq(op_is_st_or_dcbz & busy_o)  # decoded-ST
+        comb += pi.is_dcbz_i.eq(op_is_dcbz & busy_o)  # decoded-DCBZ
+        comb += pi.reserve.eq(oper_r.reserve & busy_o)  # atomic LR/SC
         comb += pi.data_len.eq(oper_r.data_len)  # data_len
         # address: use sync to avoid long latency
         sync += pi.addr.data.eq(addr_r)           # EA from adder
+        with m.If(op_is_dcbz):
+            sync += Display("LDSTCompUnit.DCBZ: EA from adder %x", addr_r)
+
         sync += pi.addr.ok.eq(alu_ok & lsd_l.q)  # "do address stuff" (once)
         comb += self.exc_o.eq(pi.exc_o)  # exception occurred
         comb += addr_ok.eq(self.pi.addr_ok_o)  # no exc, address fine
-        # connect MSR.PR for priv/virt operation
-        comb += pi.msr_pr.eq(oper_r.msr[MSR.PR])
+        # connect MSR.PR etc. for priv/virt operation
+        comb += pi.priv_mode.eq(~oper_r.msr[MSR.PR])
+        comb += pi.virt_mode.eq(oper_r.msr[MSR.DR])
+        comb += pi.mode_32bit.eq(~oper_r.msr[MSR.SF])
+        with m.If(self.issue_i): # display this only once
+            sync += Display("LDSTCompUnit: oper_r.msr %x pr=%x dr=%x sf=%x",
+                                      oper_r.msr,
+                                      oper_r.msr[MSR.PR],
+                                      oper_r.msr[MSR.DR],
+                                      oper_r.msr[MSR.SF])
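
The three PortInterface mode bits derive directly from the latched MSR:
privileged when PR is clear, virtual addressing when DR is set, 32-bit when
SF is clear.  A sketch with illustrative LSB0 bit positions -- the
authoritative indices come from openpower.consts.MSR, so treat the numbers
below as assumptions:

    from nmigen import Module, Signal

    # assumed LSB0 MSR bit positions, for illustration only
    MSR_PR, MSR_DR, MSR_SF = 14, 4, 63

    m = Module()
    msr = Signal(64)
    priv_mode, virt_mode, mode_32bit = Signal(), Signal(), Signal()
    m.d.comb += priv_mode.eq(~msr[MSR_PR])   # PR clear => privileged
    m.d.comb += virt_mode.eq(msr[MSR_DR])    # DR set => virtual addressing
    m.d.comb += mode_32bit.eq(~msr[MSR_SF])  # SF clear => 32-bit mode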
 
         # byte-reverse on LD
         revnorev = Signal(64, reset_less=True)
@@ -539,6 +612,7 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
             comb += pi.st.data.eq(stdata_r)
         with m.Else():
             comb += pi.st.data.eq(op3)
+
         # store - data goes in based on go_st
         comb += pi.st.ok.eq(self.st.go_i)  # go store signals st data valid
 
@@ -549,9 +623,11 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         to LDSTOutputData o and o1 respectively.
         """
         if i == 0:
-            return self.data_o # LDSTOutputData.regspec o
+            return self.o_data # LDSTOutputData.regspec o
         if i == 1:
             return self.addr_o # LDSTOutputData.regspec o1
+        if i == 2:
+            return self.cr_o # LDSTOutputData.regspec cr_a
         # return self.dest[i]
 
     def get_fu_out(self, i):
@@ -572,8 +648,9 @@ class LDSTCompUnit(RegSpecAPI, Elaboratable):
         yield self.adr_rel_o
         yield self.sto_rel_o
         yield self.wr.rel_o
-        yield from self.data_o.ports()
+        yield from self.o_data.ports()
         yield from self.addr_o.ports()
+        yield from self.cr_o.ports()
         yield self.load_mem_o
         yield self.stwd_mem_o
 
@@ -603,9 +680,9 @@ def store(dut, src1, src2, src3, imm, imm_ok=True, update=False,
     yield dut.src1_i.eq(src1)
     yield dut.src2_i.eq(src2)
     yield dut.src3_i.eq(src3)
-    yield dut.oper_i.imm_data.imm.eq(imm)
+    yield dut.oper_i.imm_data.data.eq(imm)
     yield dut.oper_i.imm_data.ok.eq(imm_ok)
-    yield dut.oper_i.update.eq(update)
+    # guess: this one was removed -- yield dut.oper_i.update.eq(update)
     yield dut.issue_i.eq(1)
     yield
     yield dut.issue_i.eq(0)
@@ -620,9 +697,9 @@ def store(dut, src1, src2, src3, imm, imm_ok=True, update=False,
         if rel == active_rel:
             break
         yield
-    yield dut.rd.go.eq(active_rel)
+    yield dut.rd.go_i.eq(active_rel)
     yield
-    yield dut.rd.go.eq(0)
+    yield dut.rd.go_i.eq(0)
 
     yield from wait_for(dut.adr_rel_o, False, test1st=True)
     # yield from wait_for(dut.adr_rel_o)
@@ -659,7 +736,7 @@ def load(dut, src1, src2, imm, imm_ok=True, update=False, zero_a=False,
     yield dut.src1_i.eq(src1)
     yield dut.src2_i.eq(src2)
     yield dut.oper_i.zero_a.eq(zero_a)
-    yield dut.oper_i.imm_data.imm.eq(imm)
+    yield dut.oper_i.imm_data.data.eq(imm)
     yield dut.oper_i.imm_data.ok.eq(imm_ok)
     yield dut.issue_i.eq(1)
     yield
@@ -675,9 +752,9 @@ def load(dut, src1, src2, imm, imm_ok=True, update=False, zero_a=False,
 
     # wait for the operands (RA, RB, or both)
     if rd:
-        yield dut.rd.go.eq(rd)
+        yield dut.rd.go_i.eq(rd)
         yield from wait_for(dut.rd.rel_o)
-        yield dut.rd.go.eq(0)
+        yield dut.rd.go_i.eq(0)
 
     yield from wait_for(dut.adr_rel_o, False, test1st=True)
     # yield dut.ad.go.eq(1)
@@ -686,24 +763,24 @@ def load(dut, src1, src2, imm, imm_ok=True, update=False, zero_a=False,
 
     if update:
         yield from wait_for(dut.wr.rel_o[1])
-        yield dut.wr.go.eq(0b10)
+        yield dut.wr.go_i.eq(0b10)
         yield
         addr = yield dut.addr_o
         print("addr", addr)
-        yield dut.wr.go.eq(0)
+        yield dut.wr.go_i.eq(0)
     else:
         addr = None
 
     yield from wait_for(dut.wr.rel_o[0], test1st=True)
-    yield dut.wr.go.eq(1)
+    yield dut.wr.go_i.eq(1)
     yield
-    data = yield dut.data_o
-    print(data)
-    yield dut.wr.go.eq(0)
+    data = yield dut.o_data.o
+    data_ok = yield dut.o_data.o_ok
+    yield dut.wr.go_i.eq(0)
     yield from wait_for(dut.busy_o)
     yield
     # wait_for(dut.stwd_mem_o)
-    return data, addr
+    return data, data_ok, addr
 
 
 def ldst_sim(dut):
@@ -743,22 +820,31 @@ def ldst_sim(dut):
 
 class TestLDSTCompUnit(LDSTCompUnit):
 
-    def __init__(self, rwid):
+    def __init__(self, rwid, pspec):
         from soc.experiment.l0_cache import TstL0CacheBuffer
-        self.l0 = l0 = TstL0CacheBuffer()
-        pi = l0.l0.dports[0].pi
+        self.l0 = l0 = TstL0CacheBuffer(pspec)
+        pi = l0.l0.dports[0]
         LDSTCompUnit.__init__(self, pi, rwid, 4)
 
     def elaborate(self, platform):
         m = LDSTCompUnit.elaborate(self, platform)
         m.submodules.l0 = self.l0
-        m.d.comb += self.ad.go.eq(self.ad.rel)  # link addr-go direct to rel
+        # link addr-go direct to rel
+        m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
         return m
 
 
 def test_scoreboard():
 
-    dut = TestLDSTCompUnit(16)
+    units = {}
+    pspec = TestMemPspec(ldst_ifacetype='bare_wb',
+                         imem_ifacetype='bare_wb',
+                         addr_wid=64,
+                         mask_wid=8,
+                         reg_wid=64,
+                         units=units)
+
+    dut = TestLDSTCompUnit(16, pspec)
     vl = rtlil.convert(dut, ports=dut.ports())
     with open("test_ldst_comp.il", "w") as f:
         f.write(vl)
@@ -768,24 +854,33 @@ def test_scoreboard():
 
 class TestLDSTCompUnitRegSpec(LDSTCompUnit):
 
-    def __init__(self):
+    def __init__(self, pspec):
         from soc.experiment.l0_cache import TstL0CacheBuffer
         from soc.fu.ldst.pipe_data import LDSTPipeSpec
         regspec = LDSTPipeSpec.regspec
-        self.l0 = l0 = TstL0CacheBuffer()
-        pi = l0.l0.dports[0].pi
+        self.l0 = l0 = TstL0CacheBuffer(pspec)
+        pi = l0.l0.dports[0]
         LDSTCompUnit.__init__(self, pi, regspec, 4)
 
     def elaborate(self, platform):
         m = LDSTCompUnit.elaborate(self, platform)
         m.submodules.l0 = self.l0
-        m.d.comb += self.ad.go.eq(self.ad.rel)  # link addr-go direct to rel
+        # link addr-go direct to rel
+        m.d.comb += self.ad.go_i.eq(self.ad.rel_o)
         return m
 
 
 def test_scoreboard_regspec():
 
-    dut = TestLDSTCompUnitRegSpec()
+    units = {}
+    pspec = TestMemPspec(ldst_ifacetype='bare_wb',
+                         imem_ifacetype='bare_wb',
+                         addr_wid=64,
+                         mask_wid=8,
+                         reg_wid=64,
+                         units=units)
+
+    dut = TestLDSTCompUnitRegSpec(pspec)
     vl = rtlil.convert(dut, ports=dut.ports())
     with open("test_ldst_comp.il", "w") as f:
         f.write(vl)
index 187918c0dbaa6f22ac02828a6d1dd1bdb4a953bc..bb9ff6e02e9dc8936960346c1cb37398d9c97878 100644 (file)
@@ -1,6 +1,6 @@
 from nmigen.compat.sim import run_simulation
 from nmigen.cli import verilog, rtlil
-from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
+from nmigen import Module, Const, Signal, Cat, Elaboratable
 
 from regfile.regfile import RegFileArray, treereduce
 from scoreboard.fn_unit import IntFnUnit, FPFnUnit, LDFnUnit, STFnUnit
@@ -81,7 +81,7 @@ class Scoreboard(Elaboratable):
             int_src2_pend_v.append(fu.src2_pend_o)
             int_rd_pend_v.append(fu.int_rd_pend_o)
             int_wr_pend_v.append(fu.int_wr_pend_o)
-        int_fus = Array(if_l)
+        int_fus = if_l
 
         # Count of number of FUs
         n_int_fus = len(if_l)
@@ -217,7 +217,7 @@ class Scoreboard(Elaboratable):
         # merge (OR) all integer FU / ALU outputs to a single value
         # bit of a hack: treereduce needs a list with an item named "dest_o"
         dest_o = treereduce(int_alus)
-        m.d.sync += int_dest.data_i.eq(dest_o)
+        m.d.sync += int_dest.i_data.eq(dest_o)
 
         # connect ALUs
         for i, alu in enumerate(int_alus):
@@ -225,8 +225,8 @@ class Scoreboard(Elaboratable):
             m.d.comb += alu.go_wr_i.eq(intpick1.go_wr_o[i])
             m.d.comb += alu.issue_i.eq(fn_issue_l[i])
             # m.d.comb += fn_busy_l[i].eq(alu.busy_o)  # XXX ignore, use fnissue
-            m.d.comb += alu.src1_i.eq(int_src1.data_o)
-            m.d.comb += alu.src2_i.eq(int_src2.data_o)
+            m.d.comb += alu.src1_i.eq(int_src1.o_data)
+            m.d.comb += alu.src2_i.eq(int_src2.o_data)
             m.d.comb += if_l[i].req_rel_i.eq(alu.req_rel_o)  # pipe out ready
 
         return m
@@ -265,8 +265,12 @@ class RegSim:
         src2 = self.regs[src2]
         if op == IADD:
             val = (src1 + src2) & ((1 << (self.rwidth))-1)
+            print ("RegSim op: ADD", hex(src1), hex(src2), hex(val))
         elif op == ISUB:
             val = (src1 - src2) & ((1 << (self.rwidth))-1)
+            print ("RegSim op: SUB", hex(src1), hex(src2), hex(val))
+        else:
+            print ("RegSim op: UNSUPPORTED", op)
         self.regs[dest] = val
 
     def setval(self, dest, val):
index 8c002d28f542ca0c98c6702c2f23f2e2be28a64e..0d021e0daecb4856622aa5d8051c91d8f20ecf5d 100644 (file)
@@ -1,3 +1,17 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2020,2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+# Copyright (C) 2020 Cole Poirier
+# Copyright (C) 2020,2021 Cesar Strauss
+# Copyright (C) 2021 Tobias Platen
+#
+# Original dcache.vhdl Copyright of its authors and licensed
+# by IBM under CC-BY 4.0
+# https://github.com/antonblanchard/microwatt
+#
+# Conversion to nmigen funded by NLnet and NGI POINTER under EU Grants
+# 871528 and 957073, under the LGPL-v3+ License
+
 """DCache
 
 based on Anton Blanchard microwatt dcache.vhdl
@@ -13,6 +27,8 @@ Links:
 
 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
 * https://bugs.libre-soc.org/show_bug.cgi?id=469
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+  (discussion about brams for ECP5)
 
 """
 
@@ -24,12 +40,16 @@ sys.setrecursionlimit(1000000)
 
 from enum import Enum, unique
 
-from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
+from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
+                    Record, Memory)
 from nmutil.util import Display
+from nmigen.lib.coding import Decoder
 
 from copy import deepcopy
 from random import randint, seed
 
+from nmigen_soc.wishbone.bus import Interface
+
 from nmigen.cli import main
 from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
@@ -45,8 +65,8 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBIOMasterOut, WBIOSlaveOut)
 
 from soc.experiment.cache_ram import CacheRam
-#from soc.experiment.plru import PLRU
-from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
+#from nmutil.plru import PLRU, PLRUs
 
 # for test
 from soc.bus.sram import SRAM
@@ -62,8 +82,8 @@ from nmutil.util import wrap
 
 # TODO: make these parameters of DCache at some point
 LINE_SIZE = 64    # Line size in bytes
-NUM_LINES = 16    # Number of lines in a set
-NUM_WAYS = 4      # Number of ways
+NUM_LINES = 64    # Number of lines in a set
+NUM_WAYS = 2      # Number of ways
 TLB_SET_SIZE = 64 # L1 DTLB entries per set
 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
@@ -72,7 +92,7 @@ LOG_LENGTH = 0    # Non-zero to enable log data collection
 # BRAM organisation: We never access more than
 #     -- WB_DATA_BITS at a time so to save
 #     -- resources we make the array only that wide, and
-#     -- use consecutive indices for to make a cache "line"
+#     -- use consecutive indices to make a cache "line"
 #     --
 #     -- ROW_SIZE is the width in bytes of the BRAM
 #     -- (based on WB, so 64-bits)
@@ -130,15 +150,18 @@ TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 WAY_BITS = log2_int(NUM_WAYS)
 
 # Example of layout for 32 lines of 64 bytes:
-layout = """\
+layout = f"""\
+  DCache Layout:
+ |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
+  ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
   ..  tag    |index|  line  |
   ..         |   row   |    |
-  ..         |     |---|    | ROW_LINE_BITS  (3)
-  ..         |     |--- - --| LINE_OFF_BITS (6)
-  ..         |         |- --| ROW_OFF_BITS  (3)
-  ..         |----- ---|    | ROW_BITS      (8)
-  ..         |-----|        | INDEX_BITS    (5)
-  .. --------|              | TAG_BITS      (45)
+  ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
+  ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
+  ..         |         |- --| ROW_OFF_BITS  ({ROW_OFF_BITS})
+  ..         |----- ---|    | ROW_BITS      ({ROW_BITS})
+  ..         |-----|        | INDEX_BITS    ({INDEX_BITS})
+  .. --------|              | TAG_BITS      ({TAG_BITS})
 """
 print (layout)
 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
@@ -151,19 +174,27 @@ print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 
 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
+print ("    TAG_WIDTH", TAG_WIDTH)
+print ("     NUM_WAYS", NUM_WAYS)
+print ("    NUM_LINES", NUM_LINES)
+
+
+def CacheTag(name=None):
+    tag_layout = [('valid', NUM_WAYS),
+                  ('tag', TAG_RAM_WIDTH),
+                 ]
+    return Record(tag_layout, name=name)
+
 
 def CacheTagArray():
-    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
-                        for x in range(NUM_LINES))
+    return Array(CacheTag(name="tag%d" % x) for x in range(NUM_LINES))
 
-def CacheValidBitsArray():
-    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
-                        for x in range(NUM_LINES))
 
 def RowPerLineValidArray():
     return Array(Signal(name="rows_valid%d" % x) \
                         for x in range(ROW_PER_LINE))
 
+
 # L1 TLB
 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
@@ -190,21 +221,24 @@ assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 
 
-def TLBValidBitsArray():
-    return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
-                for x in range(TLB_SET_SIZE))
+def TLBHit(name):
+    return Record([('valid', 1),
+                   ('way', TLB_WAY_BITS)], name=name)
 
 def TLBTagEAArray():
     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range (TLB_NUM_WAYS))
 
-def TLBTagsArray():
-    return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
-                for x in range (TLB_SET_SIZE))
+def TLBRecord(name):
+    tlb_layout = [('valid', TLB_NUM_WAYS),
+                  ('tag', TLB_TAG_WAY_BITS),
+                  ('pte', TLB_PTE_WAY_BITS)
+                 ]
+    return Record(tlb_layout, name=name)
 
-def TLBPtesArray():
-    return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
-                for x in range(TLB_SET_SIZE))
+def TLBValidArray():
+    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
+                        for x in range(TLB_SET_SIZE))
 
 def HitWaySet():
     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
@@ -377,9 +411,8 @@ class RegStage1(RecordObject):
         self.cache_hit        = Signal()
 
         # TLB hit state
-        self.tlb_hit          = Signal()
-        self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
-        self.tlb_hit_index    = Signal(TLB_WAY_BITS)
+        self.tlb_hit          = TLBHit("tlb_hit")
+        self.tlb_hit_index    = Signal(TLB_SET_BITS)
 
         # 2-stage data buffer for data forwarded from writes to reads
         self.forward_data1    = Signal(64)
@@ -421,8 +454,8 @@ class RegStage1(RecordObject):
 
 # Reservation information
 class Reservation(RecordObject):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name=None):
+        super().__init__(name=name)
         self.valid = Signal()
         self.addr  = Signal(64-LINE_OFF_BITS)
 
@@ -432,83 +465,162 @@ class DTLBUpdate(Elaboratable):
         self.tlbie    = Signal()
         self.tlbwe    = Signal()
         self.doall    = Signal()
-        self.updated  = Signal()
-        self.v_updated  = Signal()
-        self.tlb_hit    = Signal()
+        self.tlb_hit     = TLBHit("tlb_hit")
         self.tlb_req_index = Signal(TLB_SET_BITS)
 
-        self.tlb_hit_way     = Signal(TLB_WAY_BITS)
-        self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
-        self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
         self.repl_way        = Signal(TLB_WAY_BITS)
         self.eatag           = Signal(TLB_EA_TAG_BITS)
         self.pte_data        = Signal(TLB_PTE_BITS)
 
-        self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
-
-        self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
-        self.db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
-        self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        # read from dtlb array
+        self.tlb_read       = Signal()
+        self.tlb_read_index = Signal(TLB_SET_BITS)
+        self.tlb_way        = TLBRecord("o_tlb_way")
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
         sync = m.d.sync
 
-        tagset   = Signal(TLB_TAG_WAY_BITS)
-        pteset   = Signal(TLB_PTE_WAY_BITS)
-
-        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
-        comb += db_out.eq(self.dv)
+        # there are 3 parts to this:
+        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
+        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
+        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
+        # be a Memory because they can all be cleared (tlbie, doall), i mean,
+        # we _could_, in theory, by overriding the Reset Signal of the Memory,
+        # hmmm....
+
+        dtlb_valid = TLBValidArray()
+        tlb_req_index = self.tlb_req_index
+
+        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
+        print ("     TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
+        print ("        TLB_NUM_WAYS", TLB_NUM_WAYS)
+        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
+        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
+        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)
+
+        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
+        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
+        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
+        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
+                                    granularity=TLB_EA_TAG_BITS)
+
+        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
+        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
+        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
+                                    granularity=TLB_PTE_BITS)
+
+        # commented out for now: can be put back in if Memory.reset can be
+        # used for tlbie & doall to reset the entire Memory to zero in 1 cycle
+        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
+        #m.submodules.rd_valid = rd_valid = validm.read_port()
+        #m.submodules.wr_valid = wr_valid = validm.write_port(
+                                    #granularity=1)
+
+        # connect up read and write addresses to Valid/PTE/TAG SRAMs
+        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
+        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
+        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
+        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
+        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
+        #m.d.comb += wr_valid.addr.eq(tlb_req_index)
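
The load-bearing trick here is nmigen's write-port granularity: making the
granule equal to one way's entry width turns wr.en into a per-way
lane-enable, so "1 << repl_way" writes exactly one way of a row without
reading back the old contents first.  A minimal sketch with illustrative
widths:

    from nmigen import Module, Signal, Memory

    NUM_WAYS, ENTRY_BITS, DEPTH = 2, 64, 64  # illustrative sizes
    m = Module()
    mem = Memory(depth=DEPTH, width=ENTRY_BITS * NUM_WAYS)
    m.submodules.rd = rd = mem.read_port()
    m.submodules.wr = wr = mem.write_port(granularity=ENTRY_BITS)
    repl_way = Signal(range(NUM_WAYS))
    entry = Signal(ENTRY_BITS)
    # wr.en has one bit per granule (i.e. per way): enable only the lane
    # of the way being replaced, and shift the data up to match it
    m.d.comb += wr.en.eq(1 << repl_way)
    m.d.comb += wr.data.eq(entry << (repl_way * ENTRY_BITS))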
+
+        updated  = Signal()
+        v_updated  = Signal()
+        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
+        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
+        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
+
+        comb += dv.eq(dtlb_valid[tlb_req_index])
+        comb += db_out.eq(dv)
 
         with m.If(self.tlbie & self.doall):
-            pass # clear all back in parent
+            # clear all valid bits at once
+            # XXX hmmm, validm _could_ use Memory reset here...
+            for i in range(TLB_SET_SIZE):
+                sync += dtlb_valid[i].eq(0)
         with m.Elif(self.tlbie):
-            with m.If(self.tlb_hit):
-                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
-                comb += self.v_updated.eq(1)
-
+            # invalidate just the hit_way
+            with m.If(self.tlb_hit.valid):
+                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
+                comb += v_updated.eq(1)
         with m.Elif(self.tlbwe):
-
-            comb += tagset.eq(self.tlb_tag_way)
-            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
-            comb += tb_out.eq(tagset)
-
-            comb += pteset.eq(self.tlb_pte_way)
-            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
-            comb += pb_out.eq(pteset)
-
+            # write to the requested tag and PTE
+            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
+            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
+            # set valid bit
             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 
-            comb += self.updated.eq(1)
-            comb += self.v_updated.eq(1)
+            comb += updated.eq(1)
+            comb += v_updated.eq(1)
+
+        # above, sometimes only the valid bits are updated, not the data:
+        # therefore they are split out, here.  note that the write-port
+        # granularity matches the shift-up of eatag/pte_data into the
+        # correct TLB way: thus it is not necessary to write the entire
+        # row, just the portion being altered, and writing back the *old*
+        # copy of the row is not needed
+        with m.If(updated): # PTE and TAG to be written
+            comb += wr_pteway.data.eq(pb_out)
+            comb += wr_pteway.en.eq(1<<self.repl_way)
+            comb += wr_tagway.data.eq(tb_out)
+            comb += wr_tagway.en.eq(1<<self.repl_way)
+        with m.If(v_updated): # Valid to be written
+            sync += dtlb_valid[tlb_req_index].eq(db_out)
+            #comb += wr_valid.data.eq(db_out)
+            #comb += wr_valid.en.eq(1<<self.repl_way)
+
+        # select one TLB way, use a register here
+        r_delay = Signal()
+        sync += r_delay.eq(self.tlb_read)
+        # first deal with the valids, which are not in a Memory.
+        # tlb way valid is output on a 1 clock delay with sync,
+        # but have to explicitly deal with "forwarding" here
+        with m.If(self.tlb_read):
+            with m.If(v_updated): # write *and* read in same cycle: forward
+                sync += self.tlb_way.valid.eq(db_out)
+            with m.Else():
+                sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
+        # now deal with the Memory-read case. the output must remain
+        # valid (stable) even when a read-request is not made, but stable
+        # on a one-clock delay, hence the register
+        r_tlb_way        = TLBRecord("r_tlb_way")
+        with m.If(r_delay):
+            # on one clock delay, capture the contents of the read port(s)
+            comb += self.tlb_way.tag.eq(rd_tagway.data)
+            comb += self.tlb_way.pte.eq(rd_pteway.data)
+            sync += r_tlb_way.tag.eq(rd_tagway.data)
+            sync += r_tlb_way.pte.eq(rd_pteway.data)
+        with m.Else():
+            # ... so that the register can output it when no read is requested
+            # it's rather overkill but better to be safe than sorry
+            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
+            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
+            #comb += self.tlb_way.eq(r_tlb_way)
 
         return m
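
The hold-register pattern above generalises to any 1-cycle-latency BRAM read
port: register the read strobe, forward the port data on the delayed cycle
while also capturing it, then replay the captured copy on idle cycles so the
output remains stable.  A stripped-down sketch:

    from nmigen import Module, Signal, Memory

    m = Module()
    mem = Memory(depth=16, width=8)
    m.submodules.rd = rd = mem.read_port()   # 1-cycle read latency
    rd_en, r_delay = Signal(), Signal()
    addr, held, out = Signal(4), Signal(8), Signal(8)
    m.d.comb += rd.addr.eq(addr)
    m.d.sync += r_delay.eq(rd_en)
    with m.If(r_delay):
        m.d.comb += out.eq(rd.data)   # read data is fresh: forward it...
        m.d.sync += held.eq(rd.data)  # ...and capture it for later
    with m.Else():
        m.d.comb += out.eq(held)      # no read completed: replay capture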
 
 
 class DCachePendingHit(Elaboratable):
 
-    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
-                      cache_valid_idx, cache_tag_set,
-                    req_addr,
-                    hit_set):
+    def __init__(self, tlb_way,
+                      cache_i_validdx, cache_tag_set,
+                    req_addr):
 
         self.go          = Signal()
         self.virt_mode   = Signal()
         self.is_hit      = Signal()
-        self.tlb_hit     = Signal()
+        self.tlb_hit      = TLBHit("tlb_hit")
         self.hit_way     = Signal(WAY_BITS)
         self.rel_match   = Signal()
         self.req_index   = Signal(INDEX_BITS)
         self.reload_tag  = Signal(TAG_BITS)
 
-        self.tlb_hit_way = tlb_hit_way
-        self.tlb_pte_way = tlb_pte_way
-        self.tlb_valid_way = tlb_valid_way
-        self.cache_valid_idx = cache_valid_idx
+        self.tlb_way = tlb_way
+        self.cache_i_validdx = cache_i_validdx
         self.cache_tag_set = cache_tag_set
         self.req_addr = req_addr
-        self.hit_set = hit_set
 
     def elaborate(self, platform):
         m = Module()
@@ -518,19 +630,18 @@ class DCachePendingHit(Elaboratable):
         go = self.go
         virt_mode = self.virt_mode
         is_hit = self.is_hit
-        tlb_pte_way = self.tlb_pte_way
-        tlb_valid_way = self.tlb_valid_way
-        cache_valid_idx = self.cache_valid_idx
+        tlb_way = self.tlb_way
+        cache_i_validdx = self.cache_i_validdx
         cache_tag_set = self.cache_tag_set
         req_addr = self.req_addr
-        tlb_hit_way = self.tlb_hit_way
         tlb_hit = self.tlb_hit
-        hit_set = self.hit_set
         hit_way = self.hit_way
         rel_match = self.rel_match
         req_index = self.req_index
         reload_tag = self.reload_tag
 
+        hit_set     = Array(Signal(name="hit_set_%d" % i) \
+                                  for i in range(TLB_NUM_WAYS))
         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                     for i in range(TLB_NUM_WAYS))
         hit_way_set = HitWaySet()
@@ -544,35 +655,35 @@ class DCachePendingHit(Elaboratable):
         with m.If(virt_mode):
             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
-                s_hit       = Signal()
-                s_pte       = Signal(TLB_PTE_BITS)
-                s_ra        = Signal(REAL_ADDR_BITS)
-                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
+                s_hit       = Signal(name="s_hit%d" % j)
+                s_pte       = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
+                s_ra        = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
+                # read the PTE, calc the Real Address, get the tag
+                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                 comb += s_tag.eq(get_tag(s_ra))
-
+                # for each way check the tag against the cache tag set
                 for i in range(NUM_WAYS): # way_t
                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
-                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
+                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                   (read_tag(i, cache_tag_set) == s_tag)
-                                  & tlb_valid_way[j])
+                                  & (tlb_way.valid[j]))
                     with m.If(is_tag_hit):
                         comb += hit_way_set[j].eq(i)
                         comb += s_hit.eq(1)
                 comb += hit_set[j].eq(s_hit)
-                with m.If(s_tag == reload_tag):
-                    comb += rel_matches[j].eq(1)
-            with m.If(tlb_hit):
-                comb += is_hit.eq(hit_set[tlb_hit_way])
-                comb += hit_way.eq(hit_way_set[tlb_hit_way])
-                comb += rel_match.eq(rel_matches[tlb_hit_way])
+                comb += rel_matches[j].eq(s_tag == reload_tag)
+            with m.If(tlb_hit.valid):
+                comb += is_hit.eq(hit_set[tlb_hit.way])
+                comb += hit_way.eq(hit_way_set[tlb_hit.way])
+                comb += rel_match.eq(rel_matches[tlb_hit.way])
         with m.Else():
             s_tag       = Signal(TAG_BITS)
             comb += s_tag.eq(get_tag(req_addr))
             for i in range(NUM_WAYS): # way_t
                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
-                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
+                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                           (read_tag(i, cache_tag_set) == s_tag))
                 with m.If(is_tag_hit):
                     comb += hit_way.eq(i)
@@ -592,7 +703,7 @@ class DCache(Elaboratable):
       at the end of line (this requires dealing with requests coming in
       while not idle...)
     """
-    def __init__(self):
+    def __init__(self, pspec=None):
         self.d_in      = LoadStore1ToDCacheType("d_in")
         self.d_out     = DCacheToLoadStore1Type("d_out")
 
@@ -600,12 +711,24 @@ class DCache(Elaboratable):
         self.m_out     = DCacheToMMUType("m_out")
 
         self.stall_out = Signal()
-
-        self.wb_out    = WBMasterOut("wb_out")
-        self.wb_in     = WBSlaveOut("wb_in")
+        self.any_stall_out = Signal()
+        self.dreq_when_stall = Signal()
+        self.mreq_when_stall = Signal()
+
+        # standard naming (wired to non-standard for compatibility)
+        self.bus = Interface(addr_width=32,
+                            data_width=64,
+                            granularity=8,
+                            features={'stall'},
+                            alignment=0,
+                            name="dcache")
 
         self.log_out   = Signal(20)
 
+        # test if microwatt compatibility is to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+
     def stage_0(self, m, r0, r1, r0_full):
         """Latch the request in r0.req as long as we're not stalling
         """
@@ -634,6 +757,7 @@ class DCache(Elaboratable):
             comb += r.doall.eq(m_in.doall)
             comb += r.tlbld.eq(m_in.tlbld)
             comb += r.mmu_req.eq(1)
+            comb += r.d_valid.eq(1)
             m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                  m_in.addr, m_in.pte, r.req.load)
 
@@ -644,25 +768,25 @@ class DCache(Elaboratable):
             comb += r.doall.eq(0)
             comb += r.tlbld.eq(0)
             comb += r.mmu_req.eq(0)
+            comb += r.d_valid.eq(0)
+
+        sync += r0_full.eq(0)
         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
             sync += r0.eq(r)
             sync += r0_full.eq(r.req.valid)
+        with m.Elif(~r0.d_valid):
             # Sample data the cycle after a request comes in from loadstore1.
             # If another request has come in already then the data will get
             # put directly into req.data below.
-            with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
-                     ~r0.mmu_req):
-                sync += r0.req.data.eq(d_in.data)
-                sync += r0.d_valid.eq(1)
+            sync += r0.req.data.eq(d_in.data)
+            sync += r0.d_valid.eq(1)
         with m.If(d_in.valid):
             m.d.sync += Display("    DCACHE req cache "
                                 "virt %d addr %x data %x ld %d",
                                  r.req.virt_mode, r.req.addr,
                                  r.req.data, r.req.load)
 
-    def tlb_read(self, m, r0_stall, tlb_valid_way,
-                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
-                 dtlb_tags, dtlb_ptes):
+    def tlb_read(self, m, r0_stall, tlb_way):
         """TLB
         Operates in the second cycle on the request latched in r0.req.
         TLB updates write the entry at the end of the second cycle.
@@ -671,7 +795,6 @@ class DCache(Elaboratable):
         sync = m.d.sync
         m_in, d_in = self.m_in, self.d_in
 
-        index    = Signal(TLB_SET_BITS)
         addrbits = Signal(TLB_SET_BITS)
 
         amin = TLB_LG_PGSZ
@@ -681,16 +804,15 @@ class DCache(Elaboratable):
             comb += addrbits.eq(m_in.addr[amin : amax])
         with m.Else():
             comb += addrbits.eq(d_in.addr[amin : amax])
-        comb += index.eq(addrbits)
 
         # If we have any op and the previous op isn't finished,
         # then keep the same output for next cycle.
-        with m.If(~r0_stall):
-            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
-            sync += tlb_tag_way.eq(dtlb_tags[index])
-            sync += tlb_pte_way.eq(dtlb_ptes[index])
+        d = self.dtlb_update
+        comb += d.tlb_read_index.eq(addrbits)
+        comb += d.tlb_read.eq(~r0_stall)
+        comb += tlb_way.eq(d.tlb_way)
 
-    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
+    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
         """Generate TLB PLRUs
         """
         comb = m.d.comb
@@ -698,20 +820,19 @@ class DCache(Elaboratable):
 
         if TLB_NUM_WAYS == 0:
             return
-        for i in range(TLB_SET_SIZE):
-            # TLB PLRU interface
-            tlb_plru        = PLRU(TLB_WAY_BITS)
-            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
-            tlb_plru_acc_en = Signal()
 
-            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
-            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
-            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
-            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
+        # suite of PLRUs with a selection and output mechanism
+        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
+        m.submodules.tlb_plrus = tlb_plrus
+        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
+        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
+        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
+        comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
+        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 
     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
-                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
-                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
+                   tlb_way,
+                   pte, tlb_hit, valid_ra, perm_attr, ra):
 
         comb = m.d.comb
 
@@ -726,18 +847,18 @@ class DCache(Elaboratable):
         for i in range(TLB_NUM_WAYS):
             is_tag_hit = Signal(name="is_tag_hit%d" % i)
             tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
-            comb += tlb_tag.eq(read_tlb_tag(i, tlb_tag_way))
-            comb += is_tag_hit.eq(tlb_valid_way[i] & (tlb_tag == eatag))
+            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
+            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
             with m.If(is_tag_hit):
                 comb += hitway.eq(i)
                 comb += hit.eq(1)
 
-        comb += tlb_hit.eq(hit & r0_valid)
-        comb += tlb_hit_way.eq(hitway)
+        comb += tlb_hit.valid.eq(hit & r0_valid)
+        comb += tlb_hit.way.eq(hitway)
 
-        with m.If(tlb_hit):
-            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
-        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
+        with m.If(tlb_hit.valid):
+            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
+        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
 
         with m.If(r0.req.virt_mode):
             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
@@ -761,7 +882,7 @@ class DCache(Elaboratable):
 
         with m.If(valid_ra):
             m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
-                                r0.req.virt_mode, tlb_hit, ra, pte)
+                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
             m.d.sync += Display("       perm ref=%d", perm_attr.reference)
             m.d.sync += Display("       perm chg=%d", perm_attr.changed)
             m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
@@ -769,11 +890,8 @@ class DCache(Elaboratable):
             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 
-    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
-                    tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
-                    dtlb_tags, tlb_pte_way, dtlb_ptes):
-
-        dtlb_valids = TLBValidBitsArray()
+    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
+                    tlb_hit, tlb_plru_victim):
 
         comb = m.d.comb
         sync = m.d.sync
@@ -784,32 +902,18 @@ class DCache(Elaboratable):
         comb += tlbie.eq(r0_valid & r0.tlbie)
         comb += tlbwe.eq(r0_valid & r0.tlbld)
 
-        m.submodules.tlb_update = d = DTLBUpdate()
-        with m.If(tlbie & r0.doall):
-            # clear all valid bits at once
-            for i in range(TLB_SET_SIZE):
-                sync += dtlb_valid_bits[i].eq(0)
-        with m.If(d.updated):
-            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
-            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
-        with m.If(d.v_updated):
-            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
-
-        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
+        d = self.dtlb_update
 
         comb += d.tlbie.eq(tlbie)
         comb += d.tlbwe.eq(tlbwe)
         comb += d.doall.eq(r0.doall)
         comb += d.tlb_hit.eq(tlb_hit)
-        comb += d.tlb_hit_way.eq(tlb_hit_way)
-        comb += d.tlb_tag_way.eq(tlb_tag_way)
-        comb += d.tlb_pte_way.eq(tlb_pte_way)
         comb += d.tlb_req_index.eq(tlb_req_index)
 
-        with m.If(tlb_hit):
-            comb += d.repl_way.eq(tlb_hit_way)
+        with m.If(tlb_hit.valid):
+            comb += d.repl_way.eq(tlb_hit.way)
         with m.Else():
-            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
+            comb += d.repl_way.eq(tlb_plru_victim)
         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
         comb += d.pte_data.eq(r0.req.data)
 
@@ -822,16 +926,13 @@ class DCache(Elaboratable):
         if TLB_NUM_WAYS == 0:
             return
 
-        for i in range(NUM_LINES):
-            # PLRU interface
-            plru        = PLRU(WAY_BITS)
-            setattr(m.submodules, "plru%d" % i, plru)
-            plru_acc_en = Signal()
-
-            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
-            comb += plru.acc_en.eq(plru_acc_en)
-            comb += plru.acc_i.eq(r1.hit_way)
-            comb += plru_victim[i].eq(plru.lru_o)
+        # suite of PLRUs with a selection and output mechanism
+        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
+        comb += plrus.way.eq(r1.hit_way)
+        comb += plrus.valid.eq(r1.cache_hit)
+        comb += plrus.index.eq(r1.hit_index)
+        comb += plrus.isel.eq(r1.store_index) # select victim
+        comb += plru_victim.eq(plrus.o_index) # selected victim
 
     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
         """Cache tag RAM read port
@@ -848,15 +949,14 @@ class DCache(Elaboratable):
             comb += index.eq(get_index(m_in.addr))
         with m.Else():
             comb += index.eq(get_index(d_in.addr))
-        sync += cache_tag_set.eq(cache_tags[index])
+        sync += cache_tag_set.eq(cache_tags[index].tag)
 
     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
-                       r0_valid, r1, cache_valids, replace_way,
+                       r0_valid, r1, cache_tags, replace_way,
                        use_forward1_next, use_forward2_next,
                        req_hit_way, plru_victim, rc_ok, perm_attr,
                        valid_ra, perm_ok, access_ok, req_op, req_go,
-                       tlb_pte_way,
-                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+                       tlb_hit, tlb_way, cache_tag_set,
                        cancel_store, req_same_tag, r0_stall, early_req_row):
         """Cache request parsing and hit detection
         """
@@ -870,9 +970,7 @@ class DCache(Elaboratable):
         opsel       = Signal(3)
         go          = Signal()
         nc          = Signal()
-        hit_set     = Array(Signal(name="hit_set_%d" % i) \
-                                  for i in range(TLB_NUM_WAYS))
-        cache_valid_idx = Signal(NUM_WAYS)
+        cache_i_validdx = Signal(NUM_WAYS)
 
         # Extract line, row and tag from request
         comb += req_index.eq(get_index(r0.req.addr))
@@ -884,19 +982,17 @@ class DCache(Elaboratable):
                     r0.req.addr, ra, req_index, req_tag, req_row)
 
         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
-        comb += cache_valid_idx.eq(cache_valids[req_index])
-
-        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
-                                tlb_valid_way, tlb_hit_way,
-                                cache_valid_idx, cache_tag_set,
-                                r0.req.addr,
-                                hit_set)
+        comb += cache_i_validdx.eq(cache_tags[req_index].valid)
 
+        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
+                                            cache_i_validdx, cache_tag_set,
+                                            r0.req.addr)
         comb += dc.tlb_hit.eq(tlb_hit)
         comb += dc.reload_tag.eq(r1.reload_tag)
         comb += dc.virt_mode.eq(r0.req.virt_mode)
         comb += dc.go.eq(go)
         comb += dc.req_index.eq(req_index)
+
         comb += is_hit.eq(dc.is_hit)
         comb += hit_way.eq(dc.hit_way)
         comb += req_same_tag.eq(dc.rel_match)
@@ -933,7 +1029,7 @@ class DCache(Elaboratable):
 
         # The way to replace on a miss
         with m.If(r1.write_tag):
-            comb += replace_way.eq(plru_victim[r1.store_index])
+            comb += replace_way.eq(plru_victim)
         with m.Else():
             comb += replace_way.eq(r1.store_way)
 
@@ -945,6 +1041,7 @@ class DCache(Elaboratable):
                            (perm_attr.wr_perm |
                               (r0.req.load & perm_attr.rd_perm)))
         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
+
         # Combine the request and cache hit status to decide what
         # operation needs to be done
         comb += nc.eq(r0.req.nc | perm_attr.nocache)
@@ -1004,7 +1101,6 @@ class DCache(Elaboratable):
 
     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                         reservation, r0):
-
         comb = m.d.comb
         sync = m.d.sync
 
@@ -1041,6 +1137,7 @@ class DCache(Elaboratable):
                 dsel = data_fwd.word_select(i, 8)
                 comb += data_out.word_select(i, 8).eq(dsel)
 
+        # DCache output to LoadStore
         comb += d_out.valid.eq(r1.ls_valid)
         comb += d_out.data.eq(data_out)
         comb += d_out.store_done.eq(~r1.stcx_fail)
@@ -1079,7 +1176,10 @@ class DCache(Elaboratable):
 
             # error cases complete without stalling
             with m.If(r1.ls_error):
-                sync += Display("completing ld/st with error")
+                with m.If(r1.dcbz):
+                    sync += Display("completing dcbz with error")
+                with m.Else():
+                    sync += Display("completing ld/st with error")
 
             # Slow ops (load miss, NC, stores)
             with m.If(r1.slow_valid):
@@ -1112,62 +1212,80 @@ class DCache(Elaboratable):
         account by using 1-cycle delayed signals for load hits.
         """
         comb = m.d.comb
-        wb_in = self.wb_in
+        bus = self.bus
+
+        # Binary-to-Unary one-hot decoders here.  the replace-way one-hot is
+        # gated (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
+        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
+        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
+                   ~r1.write_bram))
+        comb += rwe.i.eq(replace_way)
+
+        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
+        comb += hwe.i.eq(r1.hit_way)
+
+        # this one is gated with write_bram, and replace_way_e can never be
+        # set at the same time.  that means that do_write can OR the outputs
+        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
+        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
+        comb += hre.i.eq(r1.req.hit_way)
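
nmigen's library Decoder performs the binary-to-one-hot conversion, and its
n input acts as an inverted enable: while n is high the output is forced to
all-zeros, which is exactly how rwe/hre are gated above.  Sketch:

    from nmigen import Module, Signal
    from nmigen.lib.coding import Decoder

    NUM_WAYS = 2
    m = Module()
    m.submodules.way_dec = dec = Decoder(NUM_WAYS)
    way, enable = Signal(range(NUM_WAYS)), Signal()
    m.d.comb += dec.i.eq(way)
    m.d.comb += dec.n.eq(~enable)  # n=1 forces dec.o to all-zeros
    # dec.o is now one-hot(way) when enabled, all-zero otherwise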
+
+        # common Signals
+        do_read  = Signal()
+        wr_addr  = Signal(ROW_BITS)
+        wr_data  = Signal(WB_DATA_BITS)
+        wr_sel   = Signal(ROW_SIZE)
+        rd_addr  = Signal(ROW_BITS)
+
+        comb += do_read.eq(1) # always enable
+        comb += rd_addr.eq(early_req_row)
+
+        # Write mux:
+        #
+        # Defaults to wishbone read responses (cache refill)
+        #
+        # For timing, the mux on wr_data/sel/addr is not
+        # dependent on anything other than the current state.
+
+        with m.If(r1.write_bram):
+            # Write store data to BRAM.  This happens one
+            # cycle after the store is in r0.
+            comb += wr_data.eq(r1.req.data)
+            comb += wr_sel.eq(r1.req.byte_sel)
+            comb += wr_addr.eq(get_row(r1.req.real_addr))
 
+        with m.Else():
+            # Otherwise, we might be doing a reload or a DCBZ
+            with m.If(r1.dcbz):
+                comb += wr_data.eq(0)
+            with m.Else():
+                comb += wr_data.eq(bus.dat_r)
+            comb += wr_addr.eq(r1.store_row)
+            comb += wr_sel.eq(~0) # all 1s
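
Hoisting wr_data/wr_sel/wr_addr out of the per-way loop (compare the removed
lines below) leaves a single shared write mux; only the one-bit do_write
differs per way, and because the two one-hot sources can never be enabled
simultaneously, a plain OR combines them.  A sketch of the remaining per-way
gating, assuming one-hot inputs like hre.o/rwe.o above:

    from nmigen import Module, Signal, Repl

    NUM_WAYS, ROW_SIZE = 2, 8
    m = Module()
    hre_o = Signal(NUM_WAYS)   # store-hit one-hot (write_bram path)
    rwe_o = Signal(NUM_WAYS)   # reload one-hot (refill path)
    wr_sel = Signal(ROW_SIZE)  # shared byte-select, muxed just once
    for i in range(NUM_WAYS):
        do_write = Signal(name="do_wr%d" % i)
        wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
        # mutually-exclusive one-hot sources: OR picks the writer
        m.d.comb += do_write.eq(hre_o[i] | rwe_o[i])
        # BRAM has no global write-enable: mask byte-selects instead
        m.d.comb += wr_sel_m.eq(Repl(do_write, ROW_SIZE) & wr_sel)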
+
+        # set up Cache Rams
         for i in range(NUM_WAYS):
-            do_read  = Signal(name="do_rd%d" % i)
-            rd_addr  = Signal(ROW_BITS, name="rd_addr_%d" % i)
             do_write = Signal(name="do_wr%d" % i)
-            wr_addr  = Signal(ROW_BITS, name="wr_addr_%d" % i)
-            wr_data  = Signal(WB_DATA_BITS, name="din_%d" % i)
-            wr_sel   = Signal(ROW_SIZE)
-            wr_sel_m = Signal(ROW_SIZE)
-            _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
+            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
+            d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
 
             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
-            setattr(m.submodules, "cacheram_%d" % i, way)
+            m.submodules["cacheram_%d" % i] = way
 
             comb += way.rd_en.eq(do_read)
             comb += way.rd_addr.eq(rd_addr)
-            comb += _d_out.eq(way.rd_data_o)
+            comb += d_out.eq(way.rd_data_o)
             comb += way.wr_sel.eq(wr_sel_m)
             comb += way.wr_addr.eq(wr_addr)
             comb += way.wr_data.eq(wr_data)
 
             # Cache hit reads
-            comb += do_read.eq(1)
-            comb += rd_addr.eq(early_req_row)
-            with m.If(r1.hit_way == i):
-                comb += cache_out_row.eq(_d_out)
-
-            # Write mux:
-            #
-            # Defaults to wishbone read responses (cache refill)
-            #
-            # For timing, the mux on wr_data/sel/addr is not
-            # dependent on anything other than the current state.
-
-            with m.If(r1.write_bram):
-                # Write store data to BRAM.  This happens one
-                # cycle after the store is in r0.
-                comb += wr_data.eq(r1.req.data)
-                comb += wr_sel.eq(r1.req.byte_sel)
-                comb += wr_addr.eq(get_row(r1.req.real_addr))
-
-                with m.If(i == r1.req.hit_way):
-                    comb += do_write.eq(1)
-            with m.Else():
-                # Otherwise, we might be doing a reload or a DCBZ
-                with m.If(r1.dcbz):
-                    comb += wr_data.eq(0)
-                with m.Else():
-                    comb += wr_data.eq(wb_in.dat)
-                comb += wr_addr.eq(r1.store_row)
-                comb += wr_sel.eq(~0) # all 1s
+            with m.If(hwe.o[i]):
+                comb += cache_out_row.eq(d_out)
 
-                with m.If((r1.state == State.RELOAD_WAIT_ACK)
-                          & wb_in.ack & (replace_way == i)):
-                    comb += do_write.eq(1)
+            # these are mutually-exclusive via their Decoder-enablers
+            # (note: Decoder-enable is inverted)
+            comb += do_write.eq(hre.o[i] | rwe.o[i])
 
             # Mask write selects with do_write since BRAM
             # doesn't have a global write-enable
@@ -1179,8 +1297,7 @@ class DCache(Elaboratable):
     # It also handles error cases (TLB miss, cache paradox)
     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                         req_hit_way, req_index, req_tag, access_ok,
-                        tlb_hit, tlb_hit_way, tlb_req_index):
-
+                        tlb_hit, tlb_req_index):
         comb = m.d.comb
         sync = m.d.sync
 
@@ -1197,15 +1314,9 @@ class DCache(Elaboratable):
         sync += r1.hit_way.eq(req_hit_way)
         sync += r1.hit_index.eq(req_index)
 
-        with m.If(req_op == Op.OP_LOAD_HIT):
-            sync += r1.hit_load_valid.eq(1)
-        with m.Else():
-            sync += r1.hit_load_valid.eq(0)
-
-        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
-            sync += r1.cache_hit.eq(1)
-        with m.Else():
-            sync += r1.cache_hit.eq(0)
+        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
+        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
+                                (req_op == Op.OP_STORE_HIT))
 
         with m.If(req_op == Op.OP_BAD):
             sync += Display("Signalling ld/st error "
@@ -1214,20 +1325,15 @@ class DCache(Elaboratable):
             sync += r1.ls_error.eq(~r0.mmu_req)
             sync += r1.mmu_error.eq(r0.mmu_req)
             sync += r1.cache_paradox.eq(access_ok)
-
         with m.Else():
             sync += r1.ls_error.eq(0)
             sync += r1.mmu_error.eq(0)
             sync += r1.cache_paradox.eq(0)
 
-        with m.If(req_op == Op.OP_STCX_FAIL):
-            sync += r1.stcx_fail.eq(1)
-        with m.Else():
-            sync += r1.stcx_fail.eq(0)
+        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
 
         # Record TLB hit information for updating TLB PLRU
         sync += r1.tlb_hit.eq(tlb_hit)
-        sync += r1.tlb_hit_way.eq(tlb_hit_way)
         sync += r1.tlb_hit_index.eq(tlb_req_index)
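
Aside: the If/Else pairs removed above collapse because an nmigen
comparison is itself a 1-bit Value that can be assigned directly. A
minimal sketch (the Op encodings here are stand-ins, not the real enum):

    from nmigen import Module, Signal

    m = Module()
    OP_LOAD_HIT, OP_STORE_HIT = 2, 3      # illustrative encodings only
    req_op         = Signal(4)
    hit_load_valid = Signal()
    cache_hit      = Signal()

    # equivalent to If(req_op == OP_LOAD_HIT): eq(1) / Else: eq(0)
    m.d.sync += hit_load_valid.eq(req_op == OP_LOAD_HIT)
    m.d.sync += cache_hit.eq((req_op == OP_LOAD_HIT) |
                             (req_op == OP_STORE_HIT))
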
 
     # Memory accesses are handled by this state machine:
@@ -1239,17 +1345,18 @@ class DCache(Elaboratable):
     # All wishbone requests generation is done here.
     # This machine operates at stage 1.
     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
-                    cache_valids, r0, replace_way,
+                    r0, replace_way,
                     req_hit_way, req_same_tag,
                     r0_valid, req_op, cache_tags, req_go, ra):
 
         comb = m.d.comb
         sync = m.d.sync
-        wb_in = self.wb_in
+        bus = self.bus
         d_in = self.d_in
 
         req         = MemAccessRequest("mreq_ds")
 
+        r1_next_cycle = Signal()
         req_row = Signal(ROW_BITS)
         req_idx = Signal(INDEX_BITS)
         req_tag = Signal(TAG_BITS)
@@ -1276,7 +1383,7 @@ class DCache(Elaboratable):
             with m.If(r1.dcbz):
                 sync += r1.forward_data1.eq(0)
             with m.Else():
-                sync += r1.forward_data1.eq(wb_in.dat)
+                sync += r1.forward_data1.eq(bus.dat_r)
             sync += r1.forward_sel1.eq(~0) # all 1s
             sync += r1.forward_way1.eq(replace_way)
             sync += r1.forward_row1.eq(r1.store_row)
@@ -1293,24 +1400,21 @@ class DCache(Elaboratable):
         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
 
         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
-            with m.If(~r0.mmu_req):
-                sync += r1.ls_valid.eq(1)
-            with m.Else():
+            with m.If(r0.mmu_req):
                 sync += r1.mmu_done.eq(1)
+            with m.Else():
+                sync += r1.ls_valid.eq(1)
 
         with m.If(r1.write_tag):
             # Store new tag in selected way
+            replace_way_onehot = Signal(NUM_WAYS)
+            comb += replace_way_onehot.eq(1<<replace_way)
             for i in range(NUM_WAYS):
-                with m.If(i == replace_way):
+                with m.If(replace_way_onehot[i]):
                     ct = Signal(TAG_RAM_WIDTH)
-                    comb += ct.eq(cache_tags[r1.store_index])
-                    """
-TODO: check this
-cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
-                    (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
-                    """
+                    comb += ct.eq(cache_tags[r1.store_index].tag)
                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
-                    sync += cache_tags[r1.store_index].eq(ct)
+                    sync += cache_tags[r1.store_index].tag.eq(ct)
             sync += r1.store_way.eq(replace_way)
             sync += r1.write_tag.eq(0)
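
Aside: the write_tag block above first turns replace_way into a one-hot
mask with a shift, then patches a single TAG_WIDTH-wide slot of the packed
tag row via word_select (the second comb assignment overrides the
overlapping bits of the first). A self-contained sketch with toy widths,
not the real TAG_WIDTH/NUM_WAYS:

    from nmigen import Module, Signal

    TAG_WIDTH, NUM_WAYS = 16, 4              # toy sizes
    m = Module()
    replace_way = Signal(range(NUM_WAYS))
    onehot      = Signal(NUM_WAYS)
    m.d.comb += onehot.eq(1 << replace_way)  # binary -> one-hot

    row  = Signal(TAG_WIDTH * NUM_WAYS)      # one packed tag row
    newt = Signal(TAG_WIDTH)                 # tag to store
    for i in range(NUM_WAYS):
        with m.If(onehot[i]):
            ct = Signal(TAG_WIDTH * NUM_WAYS, name="ct%d" % i)
            m.d.comb += ct.eq(row)                             # copy row
            m.d.comb += ct.word_select(i, TAG_WIDTH).eq(newt)  # patch way i
            m.d.sync += row.eq(ct)                             # write back
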
 
@@ -1351,6 +1455,9 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                       | (req_op == Op.OP_STORE_HIT)):
                 sync += r1.req.eq(req)
                 sync += r1.full.eq(1)
+                # do not let the RELOAD_WAIT_ACK or STORE_WAIT_ACK states
+                # destroy r1.req by overwriting r1.full back to zero
+                comb += r1_next_cycle.eq(1)
 
         # Main state machine
         with m.Switch(r1.state):
@@ -1372,6 +1479,9 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                 with m.If(req.op == Op.OP_STORE_HIT):
                     sync += r1.store_way.eq(req.hit_way)
 
+                #with m.If(r1.dec_acks):
+                #    sync += r1.acks_pending.eq(r1.acks_pending - 1)
+
                 # Reset per-row valid bits,
                 # ready for handling OP_LOAD_MISS
                 for i in range(ROW_PER_LINE):
@@ -1410,12 +1520,13 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                             sync += r1.state.eq(State.STORE_WAIT_ACK)
                             sync += r1.acks_pending.eq(1)
                             sync += r1.full.eq(0)
+                            comb += r1_next_cycle.eq(0)
                             sync += r1.slow_valid.eq(1)
 
-                            with m.If(~req.mmu_req):
-                                sync += r1.ls_valid.eq(1)
-                            with m.Else():
+                            with m.If(req.mmu_req):
                                 sync += r1.mmu_done.eq(1)
+                            with m.Else():
+                                sync += r1.ls_valid.eq(1)
 
                             with m.If(req.op == Op.OP_STORE_HIT):
                                 sync += r1.write_bram.eq(1)
@@ -1447,7 +1558,7 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                 comb += ld_stbs_done.eq(~r1.wb.stb)
 
                 # If we are still sending requests, was one accepted?
-                with m.If((~wb_in.stall) & r1.wb.stb):
+                with m.If((~bus.stall) & r1.wb.stb):
                     # That was the last word?  We are done sending.
                     # Clear stb and set ld_stbs_done so we can handle an
                     # eventual last ack on the same cycle.
@@ -1463,8 +1574,8 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
 
                 # Incoming acks processing
-                sync += r1.forward_valid1.eq(wb_in.ack)
-                with m.If(wb_in.ack):
+                sync += r1.forward_valid1.eq(bus.ack)
+                with m.If(bus.ack):
                     srow = Signal(ROW_LINE_BITS)
                     comb += srow.eq(r1.store_row)
                     sync += r1.rows_valid[srow].eq(1)
@@ -1474,16 +1585,16 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                     # Compare the whole address in case the
                     # request in r1.req is not the one that
                     # started this refill.
-                    with m.If(req.valid & r1.req.same_tag &
-                              ((r1.dcbz & r1.req.dcbz) |
-                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
-                                (r1.store_row == get_row(req.real_addr))):
-                        sync += r1.full.eq(0)
+                    with m.If(r1.full & r1.req.same_tag &
+                              ((r1.dcbz & req.dcbz) |
+                               (r1.req.op == Op.OP_LOAD_MISS)) &
+                                (r1.store_row == get_row(r1.req.real_addr))):
+                        sync += r1.full.eq(r1_next_cycle)
                         sync += r1.slow_valid.eq(1)
-                        with m.If(~r1.mmu_req):
-                            sync += r1.ls_valid.eq(1)
-                        with m.Else():
+                        with m.If(r1.mmu_req):
                             sync += r1.mmu_done.eq(1)
+                        with m.Else():
+                            sync += r1.ls_valid.eq(1)
                         sync += r1.forward_sel.eq(~0) # all 1s
                         sync += r1.use_forward1.eq(1)
 
@@ -1495,9 +1606,9 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
 
                         # Cache line is now valid
                         cv = Signal(INDEX_BITS)
-                        comb += cv.eq(cache_valids[r1.store_index])
+                        comb += cv.eq(cache_tags[r1.store_index].valid)
                         comb += cv.bit_select(r1.store_way, 1).eq(1)
-                        sync += cache_valids[r1.store_index].eq(cv)
+                        sync += cache_tags[r1.store_index].valid.eq(cv)
 
                         sync += r1.state.eq(State.IDLE)
                         sync += Display("cache valid set %x "
@@ -1509,24 +1620,22 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
 
             with m.Case(State.STORE_WAIT_ACK):
                 st_stbs_done = Signal()
-                acks        = Signal(3)
                 adjust_acks = Signal(3)
 
                 comb += st_stbs_done.eq(~r1.wb.stb)
-                comb += acks.eq(r1.acks_pending)
 
                 with m.If(r1.inc_acks != r1.dec_acks):
                     with m.If(r1.inc_acks):
-                        comb += adjust_acks.eq(acks + 1)
+                        comb += adjust_acks.eq(r1.acks_pending + 1)
                     with m.Else():
-                        comb += adjust_acks.eq(acks - 1)
+                        comb += adjust_acks.eq(r1.acks_pending - 1)
                 with m.Else():
-                    comb += adjust_acks.eq(acks)
+                    comb += adjust_acks.eq(r1.acks_pending)
 
                 sync += r1.acks_pending.eq(adjust_acks)
 
                 # Clear stb when slave accepted request
-                with m.If(~wb_in.stall):
+                with m.If(~bus.stall):
                     # See if there is another store waiting
                     # to be done which is in the same real page.
                     with m.If(req.valid):
@@ -1536,14 +1645,16 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                         sync += r1.wb.sel.eq(req.byte_sel)
 
                     with m.If((adjust_acks < 7) & req.same_tag &
-                                ((req.op == Op.OP_STORE_MISS)
-                                 (req.op == Op.OP_STORE_HIT))):
+                                ((req.op == Op.OP_STORE_MISS) |
+                                 (req.op == Op.OP_STORE_HIT))):
                         sync += r1.wb.stb.eq(1)
                         comb += st_stbs_done.eq(0)
+                        sync += r1.store_way.eq(req.hit_way)
+                        sync += r1.store_row.eq(get_row(req.real_addr))
 
                         with m.If(req.op == Op.OP_STORE_HIT):
                             sync += r1.write_bram.eq(1)
-                        sync += r1.full.eq(0)
+                        sync += r1.full.eq(r1_next_cycle)
                         sync += r1.slow_valid.eq(1)
 
                         # Store requests never come from the MMU
@@ -1555,7 +1666,9 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                         comb += st_stbs_done.eq(1)
 
                 # Got ack ? See if complete.
-                with m.If(wb_in.ack):
+                sync += Display("got ack %d %d stbs %d adjust_acks %d",
+                                bus.ack, bus.ack, st_stbs_done, adjust_acks)
+                with m.If(bus.ack):
                     with m.If(st_stbs_done & (adjust_acks == 1)):
                         sync += r1.state.eq(State.IDLE)
                         sync += r1.wb.cyc.eq(0)
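
Aside: acks_pending is a plain up/down counter; inc_acks and dec_acks
cancel each other when both pulse in the same cycle, and the wishbone
cycle closes once the final ack arrives with all stbs already sent. The
counter in isolation (names mirror the code, widths illustrative):

    from nmigen import Module, Signal

    m = Module()
    inc_acks, dec_acks = Signal(), Signal()
    acks_pending = Signal(3)
    adjust_acks  = Signal(3)

    with m.If(inc_acks != dec_acks):
        with m.If(inc_acks):
            m.d.comb += adjust_acks.eq(acks_pending + 1)
        with m.Else():
            m.d.comb += adjust_acks.eq(acks_pending - 1)
    with m.Else():
        m.d.comb += adjust_acks.eq(acks_pending)
    m.d.sync += acks_pending.eq(adjust_acks)
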
@@ -1564,45 +1677,44 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
 
             with m.Case(State.NC_LOAD_WAIT_ACK):
                 # Clear stb when slave accepted request
-                with m.If(~wb_in.stall):
+                with m.If(~bus.stall):
                     sync += r1.wb.stb.eq(0)
 
                 # Got ack ? complete.
-                with m.If(wb_in.ack):
+                with m.If(bus.ack):
                     sync += r1.state.eq(State.IDLE)
-                    sync += r1.full.eq(0)
+                    sync += r1.full.eq(r1_next_cycle)
                     sync += r1.slow_valid.eq(1)
 
-                    with m.If(~r1.mmu_req):
-                        sync += r1.ls_valid.eq(1)
-                    with m.Else():
+                    with m.If(r1.mmu_req):
                         sync += r1.mmu_done.eq(1)
+                    with m.Else():
+                        sync += r1.ls_valid.eq(1)
 
                     sync += r1.forward_sel.eq(~0) # all 1s
                     sync += r1.use_forward1.eq(1)
                     sync += r1.wb.cyc.eq(0)
                     sync += r1.wb.stb.eq(0)
 
-    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
+    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
 
         sync = m.d.sync
-        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
+        d_out, bus, log_out = self.d_out, self.bus, self.log_out
 
-        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
+        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                                stall_out, req_op[:3], d_out.valid, d_out.error,
-                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
+                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                                r1.real_adr[3:6]))
 
     def elaborate(self, platform):
 
         m = Module()
-        comb = m.d.comb
-        d_in = self.d_in
+        comb, sync = m.d.comb, m.d.sync
+        m_in, d_in = self.m_in, self.d_in
 
         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
         cache_tags       = CacheTagArray()
         cache_tag_set    = Signal(TAG_RAM_WIDTH)
-        cache_valids = CacheValidBitsArray()
 
         # TODO attribute ram_style : string;
         # TODO attribute ram_style of cache_tags : signal is "distributed";
@@ -1610,9 +1722,6 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
         """note: these are passed to nmigen.hdl.Memory as "attributes".
            don't know how, just that they are.
         """
-        dtlb_valid_bits = TLBValidBitsArray()
-        dtlb_tags       = TLBTagsArray()
-        dtlb_ptes       = TLBPtesArray()
         # TODO attribute ram_style of
         #  dtlb_tags : signal is "distributed";
         # TODO attribute ram_style of
@@ -1623,7 +1732,7 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
 
         r1 = RegStage1("r1")
 
-        reservation = Reservation()
+        reservation = Reservation("rsrv")
 
         # Async signals on incoming request
         req_index    = Signal(INDEX_BITS)
@@ -1649,19 +1758,16 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
 
         cache_out_row     = Signal(WB_DATA_BITS)
 
-        plru_victim       = PLRUOut()
+        plru_victim       = Signal(WAY_BITS)
         replace_way       = Signal(WAY_BITS)
 
         # Wishbone read/write/cache write formatting signals
         bus_sel           = Signal(8)
 
         # TLB signals
-        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
-        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
-        tlb_valid_way = Signal(TLB_NUM_WAYS)
+        tlb_way       = TLBRecord("tlb_way")
         tlb_req_index = Signal(TLB_SET_BITS)
-        tlb_hit       = Signal()
-        tlb_hit_way   = Signal(TLB_WAY_BITS)
+        tlb_hit       = TLBHit("tlb_hit")
         pte           = Signal(TLB_PTE_BITS)
         ra            = Signal(REAL_ADDR_BITS)
         valid_ra      = Signal()
@@ -1670,7 +1776,7 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
         perm_ok       = Signal()
         access_ok     = Signal()
 
-        tlb_plru_victim = TLBPLRUOut()
+        tlb_plru_victim = Signal(TLB_WAY_BITS)
 
         # we don't yet handle collisions between loadstore1 requests
         # and MMU requests
@@ -1680,37 +1786,50 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
         comb += self.stall_out.eq(r0_stall)
-
-        # Wire up wishbone request latch out of stage 1
-        comb += self.wb_out.eq(r1.wb)
+        # debugging: detect if a stall was ever requested (which is fine),
+        # and whether a request arrived while stalled (which is bad)
+        with m.If(r0_stall):
+            sync += self.any_stall_out.eq(1)
+            with m.If(d_in.valid):
+                sync += self.dreq_when_stall.eq(1)
+            with m.If(m_in.valid):
+                sync += self.mreq_when_stall.eq(1)
 
         # deal with litex not doing wishbone pipeline mode
         # XXX in wrong way.  FIFOs are needed in the SRAM test
-        # so that stb/ack match up
-        comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
+        # so that stb/ack match up. the same thing is done in icache.py
+        if not self.microwatt_compat:
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
+        # Wire up wishbone request latch out of stage 1
+        comb += self.bus.we.eq(r1.wb.we)
+        comb += self.bus.adr.eq(r1.wb.adr)
+        comb += self.bus.sel.eq(r1.wb.sel)
+        comb += self.bus.stb.eq(r1.wb.stb)
+        comb += self.bus.dat_w.eq(r1.wb.dat)
+        comb += self.bus.cyc.eq(r1.wb.cyc)
+
+        # create submodule TLBUpdate
+        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
 
         # call sub-functions putting everything together, using shared
         # signals established above
         self.stage_0(m, r0, r1, r0_full)
-        self.tlb_read(m, r0_stall, tlb_valid_way,
-                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
-                      dtlb_tags, dtlb_ptes)
+        self.tlb_read(m, r0_stall, tlb_way)
         self.tlb_search(m, tlb_req_index, r0, r0_valid,
-                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
-                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
-        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
-                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
-                        dtlb_tags, tlb_pte_way, dtlb_ptes)
+                        tlb_way,
+                        pte, tlb_hit, valid_ra, perm_attr, ra)
+        self.tlb_update(m, r0_valid, r0, tlb_req_index,
+                        tlb_hit, tlb_plru_victim)
         self.maybe_plrus(m, r1, plru_victim)
-        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
+        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
-                           r0_valid, r1, cache_valids, replace_way,
+                           r0_valid, r1, cache_tags, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
-                           tlb_pte_way,
-                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+                           tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall, early_req_row)
         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                            r0_valid, r0, reservation)
@@ -1720,12 +1839,12 @@ cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                         req_hit_way, req_index, req_tag, access_ok,
-                        tlb_hit, tlb_hit_way, tlb_req_index)
+                        tlb_hit, tlb_req_index)
         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
-                    cache_valids, r0, replace_way,
+                    r0, replace_way,
                     req_hit_way, req_same_tag,
                          r0_valid, req_op, cache_tags, req_go, ra)
-        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
+        #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
 
         return m
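
Aside: when not in microwatt_compat mode the pipelined-wishbone stall
signal is synthesised locally as cyc & ~ack, which limits the bus to one
outstanding stb at a time: exactly what a classic (non-pipelined) slave
such as the unit-test SRAM can handle. The trick in isolation:

    from nmigen import Module, Signal

    m = Module()
    cyc, ack, stall = Signal(), Signal(), Signal()
    # stall while a cycle is open and not yet acked, so at most one
    # request is in flight and classic slaves look pipelined
    m.d.comb += stall.eq(cyc & ~ack)
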
 
index 97f36a8f4529ab9515acea560cb33df73f0bbf51..8883a985f480ae8b2ea5a732ca1f4463a129665d 100644 (file)
@@ -39,19 +39,19 @@ class Driver(Elaboratable):
         # liveness counter
         live_cnt = Signal(5)
         # keep data and valid stable, until accepted
-        with m.If(Past(dut.p.valid_i) & ~Past(dut.p.ready_o)):
+        with m.If(Past(dut.p.i_valid) & ~Past(dut.p.o_ready)):
             comb += [
                 Assume(Stable(dut.op.sdir)),
                 Assume(Stable(dut.p.data_i.data)),
                 Assume(Stable(dut.p.data_i.shift)),
-                Assume(Stable(dut.p.valid_i)),
+                Assume(Stable(dut.p.i_valid)),
             ]
         # force reading the output in a reasonable time,
         # necessary to pass induction
-        with m.If(Past(dut.n.valid_o) & ~Past(dut.n.ready_i)):
-            comb += Assume(dut.n.ready_i)
+        with m.If(Past(dut.n.o_valid) & ~Past(dut.n.i_ready)):
+            comb += Assume(dut.n.i_ready)
         # capture transferred input data
-        with m.If(dut.p.ready_o & dut.p.valid_i):
+        with m.If(dut.p.o_ready & dut.p.i_valid):
             sync += [
                 data_i.eq(dut.p.data_i.data),
                 shift_i.eq(dut.p.data_i.shift),
@@ -71,18 +71,18 @@ class Driver(Elaboratable):
         # one work item ever in flight at any given time.
         # Whenever the unit is busy (not ready) the read and write counters
         # will differ by exactly one unit.
-        m.d.comb += Assert((read_cnt + ~dut.p.ready_o) & 0xF == write_cnt)
+        m.d.comb += Assert((read_cnt + ~dut.p.o_ready) & 0xF == write_cnt)
         # Check for liveness. It will ensure that the FSM is not stuck, and
         # will eventually produce some result.
-        # In this case, the delay between ready_o being negated and valid_o
+        # In this case, the delay between o_ready being negated and o_valid
         # being asserted has to be less than 16 cycles.
-        with m.If(~dut.p.ready_o & ~dut.n.valid_o):
+        with m.If(~dut.p.o_ready & ~dut.n.o_valid):
             m.d.sync += live_cnt.eq(live_cnt + 1)
         with m.Else():
             m.d.sync += live_cnt.eq(0)
         m.d.comb += Assert(live_cnt < 16)
         # check coverage as output data is accepted
-        with m.If(dut.n.ready_i & dut.n.valid_o):
+        with m.If(dut.n.i_ready & dut.n.o_valid):
             # increment read counter
             sync += read_cnt.eq(read_cnt + 1)
             # check result
@@ -123,9 +123,9 @@ class ALUFSMTestCase(FHDLTestCase):
         traces = [
             'clk',
             'p_data_i[7:0]', 'p_shift_i[7:0]', 'op__sdir',
-            'p_valid_i', 'p_ready_o',
+            'p_i_valid', 'p_o_ready',
             'n_data_o[7:0]',
-            'n_valid_o', 'n_ready_i',
+            'n_o_valid', 'n_i_ready',
             ('formal', {'module': 'top'}, [
                 'write_cnt[3:0]', 'read_cnt[3:0]', 'cov[7:0]'
             ])
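
Aside: the liveness check in this proof follows a standard bounded-liveness
pattern: a counter is cleared whenever the unit is idle or has produced
output, and an Assert bounds it, so induction shows the FSM cannot get
stuck. The skeleton of that pattern (the condition name is a stand-in):

    from nmigen import Module, Signal
    from nmigen.asserts import Assert

    m = Module()
    busy_no_output = Signal()   # stands in for ~p.o_ready & ~n.o_valid
    live_cnt = Signal(5)
    with m.If(busy_no_output):
        m.d.sync += live_cnt.eq(live_cnt + 1)
    with m.Else():
        m.d.sync += live_cnt.eq(0)
    m.d.comb += Assert(live_cnt < 16)  # output within 16 cycles, or fail
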
index 1b8aa8586a761337cf5cb09359b807cd66576516..3f37b5bfa0c271b8c28fd7fd218f8ce862b5022a 100644 (file)
@@ -17,18 +17,27 @@ TODO (in no specific order):
   write TAG_BITS width which may not match full ram blocks and might
   cause muxes to be inferred for "partial writes".
 * Check if making the read size of PLRU a ROM helps utilization
+
+Links:
+
+* https://bugs.libre-soc.org/show_bug.cgi?id=485
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+  (discussion about brams for ECP5)
+
 """
 
 from enum import (Enum, unique)
-from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
+from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
+                    Record)
 from nmigen.cli import main, rtlil
 from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
+from nmigen.lib.coding import Decoder
 from nmutil.util import Display
 
 #from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
 from soc.experiment.cache_ram import CacheRam
-from soc.experiment.plru import PLRU
 
 from soc.experiment.mem_types import (Fetch1ToICacheType,
                                       ICacheToDecode1Type,
@@ -37,8 +46,11 @@ from soc.experiment.mem_types import (Fetch1ToICacheType,
 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                      WB_SEL_BITS, WBAddrType, WBDataType,
                                      WBSelType, WBMasterOut, WBSlaveOut,
-                                     WBMasterOutVector, WBSlaveOutVector,
-                                     WBIOMasterOut, WBIOSlaveOut)
+                                     )
+
+from nmigen_soc.wishbone.bus import Interface
+from soc.minerva.units.fetch import FetchUnitInterface
+
 
 # for test
 from soc.bus.sram import SRAM
@@ -60,9 +72,9 @@ LINE_SIZE      = 64
 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
 ROW_SIZE       = WB_DATA_BITS // 8
 # Number of lines in a set
-NUM_LINES      = 16
+NUM_LINES      = 64
 # Number of ways
-NUM_WAYS       = 4
+NUM_WAYS       = 2
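
Aside: the new geometry doubles the cache while halving associativity:
64 lines x 2 ways x 64-byte lines is 8 KiB, versus 4 KiB for the old
16 x 4 x 64. A quick arithmetic check:

    LINE_SIZE, NUM_LINES, NUM_WAYS = 64, 64, 2
    assert NUM_LINES * NUM_WAYS * LINE_SIZE == 8192   # new: 8 KiB
    assert 16 * 4 * LINE_SIZE == 4096                 # old: 4 KiB
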
 # L1 ITLB number of entries (direct mapped)
 TLB_SIZE       = 64
 # L1 ITLB log_2(page_size)
@@ -176,13 +188,10 @@ assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
 # not handle a clean (commented) definition of the cache tags as a 3d
 # memory. For now, work around it by putting all the tags
 def CacheTagArray():
-    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" %x) \
-                 for x in range(NUM_LINES))
-
-# The cache valid bits
-def CacheValidBitsArray():
-    return Array(Signal(NUM_WAYS, name="cachevalid_%d" %x) \
-                 for x in range(NUM_LINES))
+    tag_layout = [('valid', NUM_WAYS),
+                  ('tag', TAG_RAM_WIDTH),
+                 ]
+    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
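
Aside: with the valid bits folded into the tag Record, one Array index
yields both fields, and indexing the Array with a Signal still infers a
single mux. A small sketch with toy sizes, not the real constants:

    from nmigen import Array, Module, Record, Signal

    NUM_LINES, NUM_WAYS, TAG_RAM_WIDTH = 8, 2, 32     # toy sizes
    tag_layout = [('valid', NUM_WAYS), ('tag', TAG_RAM_WIDTH)]
    cache_tags = Array(Record(tag_layout, name="tag%d" % x)
                       for x in range(NUM_LINES))

    m = Module()
    idx = Signal(range(NUM_LINES))
    hit_valid = Signal(NUM_WAYS)
    # Signal-indexed Array access infers a mux over all the Records
    m.d.comb += hit_valid.eq(cache_tags[idx].valid)
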
 
 def RowPerLineValidArray():
     return Array(Signal(name="rows_valid_%d" %x) \
@@ -193,18 +202,12 @@ def RowPerLineValidArray():
 # attribute ram_style : string;
 # attribute ram_style of cache_tags : signal is "distributed";
 
-
-def TLBValidBitsArray():
-    return Array(Signal(name="tlbvalid_%d" %x) \
-                 for x in range(TLB_SIZE))
-
-def TLBTagArray():
-    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" %x) \
-                 for x in range(TLB_SIZE))
-
-def TLBPtesArray():
-    return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" %x) \
-                 for x in range(TLB_SIZE))
+def TLBArray():
+    tlb_layout = [('valid', 1),
+                  ('tag', TLB_EA_TAG_BITS),
+                  ('pte', TLB_PTE_BITS)
+                 ]
+    return Array(Record(tlb_layout, name="tlb%d" % x) for x in range(TLB_SIZE))
 
 # Cache RAM interface
 def CacheRamOut():
@@ -226,7 +229,7 @@ def get_row(addr):
 
 # Return the index of a row within a line
 def get_row_of_line(row):
-    return row[:ROW_LINE_BITS]
+    return row[:ROW_BITS][:ROW_LINE_BITS]
 
 # Returns whether this is the last row of a line
 def is_last_row_addr(addr, last):
@@ -255,7 +258,7 @@ def get_tag(addr):
 
 # Read a tag from a tag memory row
 def read_tag(way, tagset):
-    return tagset.word_select(way, TAG_BITS)
+    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
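
Aside: read_tag selects the padded TAG_WIDTH-wide slot for the way and
then slices back down to the TAG_BITS that carry meaning; the padding
keeps each way's tag aligned inside the packed row. In miniature (toy
numbers, not the file's constants):

    from nmigen import Signal

    TAG_BITS, TAG_WIDTH, NUM_WAYS = 13, 16, 2  # toy: tags padded to 16 bits
    tagset = Signal(TAG_WIDTH * NUM_WAYS)
    # padded 16-bit word for way 1, keeping only the 13 real tag bits
    tag = tagset.word_select(1, TAG_WIDTH)[:TAG_BITS]
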
 
 # Write a tag to tag memory row
 def write_tag(way, tagset, tag):
@@ -263,11 +266,9 @@ def write_tag(way, tagset, tag):
 
 # Simple hash for direct-mapped TLB index
 def hash_ea(addr):
-    hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
-           TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
-          ] ^ addr[
-           TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
-          ]
+    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
+           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS ] ^
+           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
     return hsh
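
Aside: hash_ea XOR-folds three adjacent TLB_BITS-wide slices of the
effective address (above the page offset) into one direct-mapped TLB
index. The same fold in plain integer Python, with illustrative widths:

    TLB_LG_PGSZ, TLB_BITS = 12, 6    # illustrative: 4KiB pages, 64 entries

    def hash_ea_int(addr):
        def fld(n):
            shift = TLB_LG_PGSZ + n * TLB_BITS
            return (addr >> shift) & ((1 << TLB_BITS) - 1)
        return fld(0) ^ fld(1) ^ fld(2)

    assert hash_ea_int(0x0000) == 0
    assert hash_ea_int(0x1000) == 1   # bit 12 lands in slice 0
    assert hash_ea_int(0x41000) == 0  # equal slices cancel out
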
 
 
@@ -283,7 +284,7 @@ class RegInternal(RecordObject):
     def __init__(self):
         super().__init__()
         # Cache hit state (Latches for 1 cycle BRAM access)
-        self.hit_way      = Signal(NUM_WAYS)
+        self.hit_way      = Signal(WAY_BITS)
         self.hit_nia      = Signal(64)
         self.hit_smark    = Signal()
         self.hit_valid    = Signal()
@@ -292,9 +293,9 @@ class RegInternal(RecordObject):
         self.state        = Signal(State, reset=State.IDLE)
         self.wb           = WBMasterOut("wb")
         self.req_adr      = Signal(64)
-        self.store_way    = Signal(NUM_WAYS)
-        self.store_index  = Signal(NUM_LINES)
-        self.store_row    = Signal(BRAM_ROWS)
+        self.store_way    = Signal(WAY_BITS)
+        self.store_index  = Signal(INDEX_BITS)
+        self.store_row    = Signal(ROW_BITS)
         self.store_tag    = Signal(TAG_BITS)
         self.store_valid  = Signal()
         self.end_row_ix   = Signal(ROW_LINE_BITS)
@@ -304,9 +305,10 @@ class RegInternal(RecordObject):
         self.fetch_failed = Signal()
 
 
-class ICache(Elaboratable):
+class ICache(FetchUnitInterface, Elaboratable):
     """64 bit direct mapped icache. All instructions are 4B aligned."""
-    def __init__(self):
+    def __init__(self, pspec):
+        FetchUnitInterface.__init__(self, pspec)
         self.i_in           = Fetch1ToICacheType(name="i_in")
         self.i_out          = ICacheToDecode1Type(name="i_out")
 
@@ -317,11 +319,21 @@ class ICache(Elaboratable):
         self.flush_in       = Signal()
         self.inval_in       = Signal()
 
-        self.wb_out         = WBMasterOut(name="wb_out")
-        self.wb_in          = WBSlaveOut(name="wb_in")
+        # standard naming (wired to non-standard for compatibility)
+        self.bus = Interface(addr_width=32,
+                            data_width=64,
+                            granularity=8,
+                            features={'stall'},
+                            alignment=0,
+                            name="icache_wb")
 
         self.log_out        = Signal(54)
 
+        # whether to use FetchUnitInterface (default off, which helps
+        # keep some unit tests running)
+        self.use_fetch_iface = False
+
+    def use_fetch_interface(self):
+        self.use_fetch_iface = True
 
     # Generate a cache RAM for each way
     def rams(self, m, r, cache_out_row, use_previous,
@@ -330,67 +342,72 @@ class ICache(Elaboratable):
         comb = m.d.comb
         sync = m.d.sync
 
-        wb_in, stall_in = self.wb_in, self.stall_in
+        bus, stall_in = self.bus, self.stall_in
+
+        # read condition (for every cache ram)
+        do_read  = Signal()
+        comb += do_read.eq(~(stall_in | use_previous))
+
+        rd_addr  = Signal(ROW_BITS)
+        wr_addr  = Signal(ROW_BITS)
+        comb += rd_addr.eq(req_row)
+        comb += wr_addr.eq(r.store_row)
+
+        # binary-to-unary converters: replace-way enabled by bus.ack,
+        # hit-way left permanently enabled
+        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
+        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
+        comb += re.i.eq(replace_way)
+        comb += re.n.eq(~bus.ack)
+        comb += he.i.eq(r.hit_way)
 
         for i in range(NUM_WAYS):
-            do_read  = Signal(name="do_rd_%d" % i)
             do_write = Signal(name="do_wr_%d" % i)
-            rd_addr  = Signal(ROW_BITS)
-            wr_addr  = Signal(ROW_BITS)
             d_out    = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
-            wr_sel   = Signal(ROW_SIZE)
+            wr_sel   = Signal(ROW_SIZE, name="wr_sel_%d" % i)
 
-            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
-            setattr(m.submodules, "cacheram_%d" % i, way)
+            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
+            m.submodules["cacheram_%d" % i] =  way
 
             comb += way.rd_en.eq(do_read)
             comb += way.rd_addr.eq(rd_addr)
             comb += d_out.eq(way.rd_data_o)
             comb += way.wr_sel.eq(wr_sel)
             comb += way.wr_addr.eq(wr_addr)
-            comb += way.wr_data.eq(wb_in.dat)
+            comb += way.wr_data.eq(bus.dat_r)
 
-            comb += do_read.eq(~(stall_in | use_previous))
-            comb += do_write.eq(wb_in.ack & (replace_way == i))
+            comb += do_write.eq(re.o[i])
 
             with m.If(do_write):
                 sync += Display("cache write adr: %x data: %lx",
                                 wr_addr, way.wr_data)
 
-            with m.If(r.hit_way == i):
+            with m.If(he.o[i]):
                 comb += cache_out_row.eq(d_out)
                 with m.If(do_read):
                     sync += Display("cache read adr: %x data: %x",
                                      req_row, d_out)
 
-            comb += rd_addr.eq(req_row)
-            comb += wr_addr.eq(r.store_row)
             comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
 
     # Generate PLRUs
     def maybe_plrus(self, m, r, plru_victim):
         comb = m.d.comb
 
-        with m.If(NUM_WAYS > 1):
-            for i in range(NUM_LINES):
-                plru_acc_i  = Signal(WAY_BITS)
-                plru_acc_en = Signal()
-                plru        = PLRU(WAY_BITS)
-                setattr(m.submodules, "plru_%d" % i, plru)
-
-                comb += plru.acc_i.eq(plru_acc_i)
-                comb += plru.acc_en.eq(plru_acc_en)
+        if NUM_WAYS == 0:
+            return
 
-                # PLRU interface
-                with m.If(get_index(r.hit_nia) == i):
-                    comb += plru.acc_en.eq(r.hit_valid)
 
-                comb += plru.acc_i.eq(r.hit_way)
-                comb += plru_victim[i].eq(plru.lru_o)
+        m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
+        comb += plru.way.eq(r.hit_way)
+        comb += plru.valid.eq(r.hit_valid)
+        comb += plru.index.eq(get_index(r.hit_nia))
+        comb += plru.isel.eq(r.store_index) # select victim
+        comb += plru_victim.eq(plru.o_index) # selected victim
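
Aside: with two ways a tree-PLRU degenerates to true LRU: one bit per
line pointing at the least-recently-used way, flipped away from whichever
way was just used. A toy software model for intuition (this is not the
PLRUs implementation):

    # toy 2-way LRU: one bit per line, pointing at the LRU way
    class ToyPLRU2:
        def __init__(self, num_lines):
            self.bit = [0] * num_lines
        def access(self, line, way):
            self.bit[line] = 1 - way   # point away from the way just used
        def victim(self, line):
            return self.bit[line]

    p = ToyPLRU2(64)
    p.access(3, way=0)
    assert p.victim(3) == 1            # the other way gets evicted
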
 
     # TLB hit detection and real address generation
-    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
-                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
+    def itlb_lookup(self, m, tlb_req_index, itlb,
+                    real_addr, ra_valid, eaa_priv,
                     priv_fault, access_ok):
 
         comb = m.d.comb
@@ -401,8 +418,8 @@ class ICache(Elaboratable):
         ttag = Signal(TLB_EA_TAG_BITS)
 
         comb += tlb_req_index.eq(hash_ea(i_in.nia))
-        comb += pte.eq(itlb_ptes[tlb_req_index])
-        comb += ttag.eq(itlb_tags[tlb_req_index])
+        comb += pte.eq(itlb[tlb_req_index].pte)
+        comb += ttag.eq(itlb[tlb_req_index].tag)
 
         with m.If(i_in.virt_mode):
             comb += real_addr.eq(Cat(
@@ -411,7 +428,7 @@ class ICache(Elaboratable):
                     ))
 
             with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
-                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
+                comb += ra_valid.eq(itlb[tlb_req_index].valid)
 
             comb += eaa_priv.eq(pte[3])
 
@@ -425,7 +442,7 @@ class ICache(Elaboratable):
         comb += access_ok.eq(ra_valid & ~priv_fault)
 
     # iTLB update
-    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
+    def itlb_update(self, m, itlb):
         comb = m.d.comb
         sync = m.d.sync
 
@@ -437,33 +454,31 @@ class ICache(Elaboratable):
         with m.If(m_in.tlbie & m_in.doall):
             # Clear all valid bits
             for i in range(TLB_SIZE):
-                sync += itlb_valid_bits[i].eq(0)
+                sync += itlb[i].valid.eq(0)
 
         with m.Elif(m_in.tlbie):
             # Clear entry regardless of hit or miss
-            sync += itlb_valid_bits[wr_index].eq(0)
+            sync += itlb[wr_index].valid.eq(0)
 
         with m.Elif(m_in.tlbld):
-            sync += itlb_tags[wr_index].eq(
-                     m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
-                    )
-            sync += itlb_ptes[wr_index].eq(m_in.pte)
-            sync += itlb_valid_bits[wr_index].eq(1)
+            sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
+            sync += itlb[wr_index].pte.eq(m_in.pte)
+            sync += itlb[wr_index].valid.eq(1)
 
     # Cache hit detection, output to fetch2 and other misc logic
     def icache_comb(self, m, use_previous, r, req_index, req_row,
                     req_hit_way, req_tag, real_addr, req_laddr,
-                    cache_valid_bits, cache_tags, access_ok,
+                    cache_tags, access_ok,
                     req_is_hit, req_is_miss, replace_way,
                     plru_victim, cache_out_row):
 
         comb = m.d.comb
 
-        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
+        i_in, i_out, bus = self.i_in, self.i_out, self.bus
         flush_in, stall_out = self.flush_in, self.stall_out
 
         is_hit  = Signal()
-        hit_way = Signal(NUM_WAYS)
+        hit_way = Signal(WAY_BITS)
 
         # i_in.sequential means that i_in.nia this cycle is 4 more than
         # last cycle.  If we read more than 32 bits at a time, had a
@@ -490,20 +505,25 @@ class ICache(Elaboratable):
                  & (req_index == r.store_index)
                  & r.rows_valid[req_row % ROW_PER_LINE]
                 )
-        with m.If(i_in.req):
-            cvb = Signal(NUM_WAYS)
-            ctag = Signal(TAG_RAM_WIDTH)
-            comb += ctag.eq(cache_tags[req_index])
-            comb += cvb.eq(cache_valid_bits[req_index])
-            for i in range(NUM_WAYS):
-                tagi = Signal(TAG_BITS, name="tag_i%d" % i)
-                comb += tagi.eq(read_tag(i, ctag))
-                hit_test = Signal(name="hit_test%d" % i)
-                comb += hit_test.eq(i == r.store_way)
-                with m.If((cvb[i] | (hitcond & hit_test))
-                          & (tagi == req_tag)):
-                    comb += hit_way.eq(i)
-                    comb += is_hit.eq(1)
+        # i_in.req enables the store-way Decoder (its enable is inverted)
+        cvb = Signal(NUM_WAYS)
+        ctag = Signal(TAG_RAM_WIDTH)
+        comb += ctag.eq(cache_tags[req_index].tag)
+        comb += cvb.eq(cache_tags[req_index].valid)
+        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
+        comb += se.i.eq(r.store_way)
+        comb += se.n.eq(~i_in.req)
+        for i in range(NUM_WAYS):
+            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
+            hit_test = Signal(name="hit_test%d" % i)
+            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
+            comb += tagi.eq(read_tag(i, ctag))
+            comb += hit_test.eq(se.o[i])
+            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
+                                  (tagi == req_tag))
+            with m.If(is_tag_hit):
+                comb += hit_way.eq(i)
+                comb += is_hit.eq(1)
 
         # Generate the "hit" and "miss" signals
         # for the synchronous blocks
@@ -511,15 +531,11 @@ class ICache(Elaboratable):
             comb += req_is_hit.eq(is_hit)
             comb += req_is_miss.eq(~is_hit)
 
-        with m.Else():
-            comb += req_is_hit.eq(0)
-            comb += req_is_miss.eq(0)
-
         comb += req_hit_way.eq(hit_way)
 
         # The way to replace on a miss
         with m.If(r.state == State.CLR_TAG):
-            comb += replace_way.eq(plru_victim[r.store_index])
+            comb += replace_way.eq(plru_victim)
         with m.Else():
             comb += replace_way.eq(r.store_way)
 
@@ -542,7 +558,12 @@ class ICache(Elaboratable):
         comb += stall_out.eq(~(is_hit & access_ok))
 
         # Wishbone requests output (from the cache miss reload machine)
-        comb += wb_out.eq(r.wb)
+        comb += bus.we.eq(r.wb.we)
+        comb += bus.adr.eq(r.wb.adr)
+        comb += bus.sel.eq(r.wb.sel)
+        comb += bus.stb.eq(r.wb.stb)
+        comb += bus.dat_w.eq(r.wb.dat)
+        comb += bus.cyc.eq(r.wb.cyc)
 
     # Cache hit synchronous machine
     def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
@@ -567,14 +588,10 @@ class ICache(Elaboratable):
 
             with m.If(req_is_hit):
                 sync += r.hit_way.eq(req_hit_way)
-                sync += Display(
-                         "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x " \
-                         "way:%x RA:%x", i_in.nia, i_in.virt_mode, \
-                         i_in.stop_mark, req_index, req_tag, \
-                         req_hit_way, real_addr
-                        )
-
-
+                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
+                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
+                                 i_in.stop_mark, req_index, req_tag,
+                                 req_hit_way, real_addr)
 
         with m.If(~stall_in):
             # Send stop marks and NIA down regardless of validity
@@ -598,11 +615,10 @@ class ICache(Elaboratable):
                      "cache miss nia:%x IR:%x SM:%x idx:%x "
                      " way:%x tag:%x RA:%x", i_in.nia,
                      i_in.virt_mode, i_in.stop_mark, req_index,
-                     replace_way, req_tag, real_addr
-                    )
+                     replace_way, req_tag, real_addr)
 
             # Keep track of our index and way for subsequent stores
-            st_row = Signal(BRAM_ROWS)
+            st_row = Signal(ROW_BITS)
             comb += st_row.eq(get_row(req_laddr))
             sync += r.store_index.eq(req_index)
             sync += r.store_row.eq(st_row)
@@ -620,34 +636,34 @@ class ICache(Elaboratable):
             sync += r.state.eq(State.CLR_TAG)
 
     def icache_miss_clr_tag(self, m, r, replace_way,
-                            cache_valid_bits, req_index,
+                            req_index,
                             tagset, cache_tags):
-
         comb = m.d.comb
         sync = m.d.sync
 
         # Get victim way from plru
         sync += r.store_way.eq(replace_way)
+
         # Force misses on that way while reloading that line
         cv = Signal(INDEX_BITS)
-        comb += cv.eq(cache_valid_bits[req_index])
+        comb += cv.eq(cache_tags[req_index].valid)
         comb += cv.bit_select(replace_way, 1).eq(0)
-        sync += cache_valid_bits[req_index].eq(cv)
+        sync += cache_tags[req_index].valid.eq(cv)
 
         for i in range(NUM_WAYS):
             with m.If(i == replace_way):
-                comb += tagset.eq(cache_tags[r.store_index])
+                comb += tagset.eq(cache_tags[r.store_index].tag)
                 comb += write_tag(i, tagset, r.store_tag)
-                sync += cache_tags[r.store_index].eq(tagset)
+                sync += cache_tags[r.store_index].tag.eq(tagset)
 
         sync += r.state.eq(State.WAIT_ACK)
 
     def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
-                             stbs_done, cache_valid_bits):
+                             cache_tags, stbs_done):
         comb = m.d.comb
         sync = m.d.sync
 
-        wb_in = self.wb_in
+        bus = self.bus
 
         # Requests are all sent if stb is 0
         stbs_zero = Signal()
@@ -655,43 +671,36 @@ class ICache(Elaboratable):
         comb += stbs_done.eq(stbs_zero)
 
         # If we are still sending requests, was one accepted?
-        with m.If(~wb_in.stall & ~stbs_zero):
+        with m.If(~bus.stall & ~stbs_zero):
             # That was the last word? We are done sending.
             # Clear stb and set stbs_done so we can handle
             # an eventual last ack on the same cycle.
             with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
-                sync += Display(
-                         "IS_LAST_ROW_ADDR r.wb.addr:%x " \
-                         "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x " \
+                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
+                         "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                          "stbs_done:%x", r.wb.adr, r.end_row_ix,
-                         r.wb.stb, stbs_zero, stbs_done
-                        )
+                         r.wb.stb, stbs_zero, stbs_done)
                 sync += r.wb.stb.eq(0)
                 comb += stbs_done.eq(1)
 
             # Calculate the next row address
             rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
-            comb += rarange.eq(
-                     r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
-                    )
-            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
-                     rarange
-                    )
+            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
+            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
             sync += Display("RARANGE r.req_adr:%x rarange:%x "
                             "stbs_zero:%x stbs_done:%x",
                             r.req_adr, rarange, stbs_zero, stbs_done)
 
         # Incoming acks processing
-        with m.If(wb_in.ack):
+        with m.If(bus.ack):
             sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                             "stbs_done:%x",
-                            wb_in.dat, stbs_zero, stbs_done)
+                            bus.dat_r, stbs_zero, stbs_done)
 
             sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
 
             # Check for completion
-            with m.If(stbs_done &
-                      is_last_row(r.store_row, r.end_row_ix)):
+            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                 # Complete wishbone cycle
                 sync += r.wb.cyc.eq(0)
                 # be nice, clear addr
@@ -699,28 +708,25 @@ class ICache(Elaboratable):
 
                 # Cache line is now valid
                 cv = Signal(INDEX_BITS)
-                comb += cv.eq(cache_valid_bits[r.store_index])
+                comb += cv.eq(cache_tags[r.store_index].valid)
                 comb += cv.bit_select(replace_way, 1).eq(
-                         r.store_valid & ~inval_in
-                        )
-                sync += cache_valid_bits[r.store_index].eq(cv)
+                         r.store_valid & ~inval_in)
+                sync += cache_tags[r.store_index].valid.eq(cv)
 
                 sync += r.state.eq(State.IDLE)
 
-            # not completed, move on to next request in row
-            with m.Else():
-                # Increment store row counter
-                sync += r.store_row.eq(next_row(r.store_row))
-
+            # move on to the next request in the row:
+            # increment the store row counter
+            sync += r.store_row.eq(next_row(r.store_row))
 
     # Cache miss/reload synchronous machine
-    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
+    def icache_miss(self, m, r, req_is_miss,
                     req_index, req_laddr, req_tag, replace_way,
                     cache_tags, access_ok, real_addr):
         comb = m.d.comb
         sync = m.d.sync
 
-        i_in, wb_in, m_in  = self.i_in, self.wb_in, self.m_in
+        i_in, bus, m_in  = self.i_in, self.bus, self.m_in
         stall_in, flush_in = self.stall_in, self.flush_in
         inval_in           = self.inval_in
 
@@ -733,31 +739,24 @@ class ICache(Elaboratable):
         # Process cache invalidations
         with m.If(inval_in):
             for i in range(NUM_LINES):
-                sync += cache_valid_bits[i].eq(0)
+                sync += cache_tags[i].valid.eq(0)
             sync += r.store_valid.eq(0)
 
         # Main state machine
         with m.Switch(r.state):
 
             with m.Case(State.IDLE):
-                self.icache_miss_idle(
-                    m, r, req_is_miss, req_laddr,
-                    req_index, req_tag, replace_way,
-                    real_addr
-                )
+                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
+                                      req_index, req_tag, replace_way,
+                                      real_addr)
 
             with m.Case(State.CLR_TAG, State.WAIT_ACK):
                 with m.If(r.state == State.CLR_TAG):
-                    self.icache_miss_clr_tag(
-                        m, r, replace_way,
-                        cache_valid_bits, req_index,
-                        tagset, cache_tags
-                    )
-
-                self.icache_miss_wait_ack(
-                    m, r, replace_way, inval_in,
-                    stbs_done, cache_valid_bits
-                )
+                    self.icache_miss_clr_tag(m, r, replace_way,
+                                             req_index, tagset, cache_tags)
+
+                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
+                                          cache_tags, stbs_done)
 
         # TLB miss and protection fault processing
         with m.If(flush_in | m_in.tlbld):
@@ -771,13 +770,13 @@ class ICache(Elaboratable):
         comb = m.d.comb
         sync = m.d.sync
 
-        wb_in, i_out       = self.wb_in, self.i_out
+        bus, i_out       = self.bus, self.i_out
         log_out, stall_out = self.log_out, self.stall_out
 
         # Output data to logger
         for i in range(LOG_LENGTH):
             log_data = Signal(54)
-            lway     = Signal(NUM_WAYS)
+            lway     = Signal(WAY_BITS)
             wstate   = Signal()
 
             sync += lway.eq(req_hit_way)
@@ -789,8 +788,8 @@ class ICache(Elaboratable):
             sync += log_data.eq(Cat(
                      ra_valid, access_ok, req_is_miss, req_is_hit,
                      lway, wstate, r.hit_nia[2:6], r.fetch_failed,
-                     stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
-                     r.real_addr[3:6], wb_in.ack, i_out.insn, i_out.valid
+                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
+                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                     ))
             comb += log_out.eq(log_data)
 
@@ -801,11 +800,10 @@ class ICache(Elaboratable):
 
         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
         cache_tags       = CacheTagArray()
-        cache_valid_bits = CacheValidBitsArray()
 
-        itlb_valid_bits  = TLBValidBitsArray()
-        itlb_tags        = TLBTagArray()
-        itlb_ptes        = TLBPtesArray()
+        # TLB Array
+        itlb            = TLBArray()
+
         # TODO to be passed to nmigen as ram attributes
         # attribute ram_style of itlb_tags : signal is "distributed";
         # attribute ram_style of itlb_ptes : signal is "distributed";
@@ -816,15 +814,15 @@ class ICache(Elaboratable):
         r                = RegInternal()
 
         # Async signal on incoming request
-        req_index        = Signal(NUM_LINES)
-        req_row          = Signal(BRAM_ROWS)
-        req_hit_way      = Signal(NUM_WAYS)
+        req_index        = Signal(INDEX_BITS)
+        req_row          = Signal(ROW_BITS)
+        req_hit_way      = Signal(WAY_BITS)
         req_tag          = Signal(TAG_BITS)
         req_is_hit       = Signal()
         req_is_miss      = Signal()
         req_laddr        = Signal(64)
 
-        tlb_req_index    = Signal(TLB_SIZE)
+        tlb_req_index    = Signal(TLB_BITS)
         real_addr        = Signal(REAL_ADDR_BITS)
         ra_valid         = Signal()
         priv_fault       = Signal()
@@ -833,42 +831,75 @@ class ICache(Elaboratable):
 
         cache_out_row    = Signal(ROW_SIZE_BITS)
 
-        plru_victim      = PLRUOut()
-        replace_way      = Signal(NUM_WAYS)
+        plru_victim      = Signal(WAY_BITS)
+        replace_way      = Signal(WAY_BITS)
 
         # call sub-functions putting everything together,
         # using shared signals established above
         self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
         self.maybe_plrus(m, r, plru_victim)
-        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
-                         itlb_valid_bits, ra_valid, eaa_priv, priv_fault,
+        self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
+                         ra_valid, eaa_priv, priv_fault,
                          access_ok)
-        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
+        self.itlb_update(m, itlb)
         self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
-                         req_tag, real_addr, req_laddr, cache_valid_bits,
+                         req_tag, real_addr, req_laddr,
                          cache_tags, access_ok, req_is_hit, req_is_miss,
                          replace_way, plru_victim, cache_out_row)
         self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                         req_index, req_tag, real_addr)
-        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
+        self.icache_miss(m, r, req_is_miss, req_index,
                          req_laddr, req_tag, replace_way, cache_tags,
                          access_ok, real_addr)
         #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
         #                req_is_miss, req_is_hit, lway, wstate, r)
 
+        # don't connect up to FetchUnitInterface so that some unit tests
+        # can continue to operate
+        if not self.use_fetch_iface:
+            return m
+
+        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
+        # so needs checking and iterative revising
+        i_in, bus, i_out = self.i_in, self.bus, self.i_out
+        comb += i_in.req.eq(self.a_i_valid)
+        comb += i_in.nia.eq(self.a_pc_i)
+        comb += self.stall_in.eq(self.a_stall_i)
+        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
+        comb += self.f_badaddr_o.eq(i_out.nia)
+        comb += self.f_instr_o.eq(i_out.insn)
+        comb += self.f_busy_o.eq(~i_out.valid) # probably
+
+        # TODO, connect icache wb_in/wb_out to "standard" nmigen Wishbone bus
+        ibus = self.ibus
+        comb += ibus.adr.eq(self.bus.adr)
+        comb += ibus.dat_w.eq(self.bus.dat_w)
+        comb += ibus.sel.eq(self.bus.sel)
+        comb += ibus.cyc.eq(self.bus.cyc)
+        comb += ibus.stb.eq(self.bus.stb)
+        comb += ibus.we.eq(self.bus.we)
+
+        comb += self.bus.dat_r.eq(ibus.dat_r)
+        comb += self.bus.ack.eq(ibus.ack)
+        if hasattr(ibus, "stall"):
+            comb += self.bus.stall.eq(ibus.stall)
+        else:
+            # fake-up the wishbone stall signal to comply with pipeline mode
+            # same thing is done in dcache.py
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
         return m
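
Aside: the ibus bridge above forwards request fields outward and response
fields back, with stall special-cased because not every Interface is
created with the "stall" feature. The shape of such a field-by-field
bridge, shown here with plain Records and made-up widths:

    from nmigen import Module, Record

    m = Module()
    layout = [("adr", 29), ("dat_w", 64), ("dat_r", 64), ("sel", 8),
              ("cyc", 1), ("stb", 1), ("we", 1), ("ack", 1)]
    bus  = Record(layout, name="bus")
    ibus = Record(layout, name="ibus")

    # requests flow outward to ibus...
    for name in ("adr", "dat_w", "sel", "cyc", "stb", "we"):
        m.d.comb += ibus[name].eq(bus[name])
    # ...responses flow back
    m.d.comb += bus.dat_r.eq(ibus.dat_r)
    m.d.comb += bus.ack.eq(ibus.ack)
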
 
 
 def icache_sim(dut):
-    i_out = dut.i_in
-    i_in  = dut.i_out
+    i_in  = dut.i_in
+    i_out = dut.i_out
     m_out = dut.m_in
 
-    yield i_in.valid.eq(0)
-    yield i_out.priv_mode.eq(1)
-    yield i_out.req.eq(0)
-    yield i_out.nia.eq(0)
-    yield i_out.stop_mark.eq(0)
+    yield i_in.priv_mode.eq(1)
+    yield i_in.req.eq(0)
+    yield i_in.nia.eq(0)
+    yield i_in.stop_mark.eq(0)
     yield m_out.tlbld.eq(0)
     yield m_out.tlbie.eq(0)
     yield m_out.addr.eq(0)
@@ -877,107 +908,124 @@ def icache_sim(dut):
     yield
     yield
     yield
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000004, 64))
-    for i in range(30):
-        yield
+
+    # miss, stalls for a bit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000004, 64))
     yield
-    valid = yield i_in.valid
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    insn  = yield i_out.insn
     nia   = yield i_out.nia
-    insn  = yield i_in.insn
-    print(f"valid? {valid}")
-    assert valid
     assert insn == 0x00000001, \
         "insn @%x=%x expected 00000001" % (nia, insn)
-    yield i_out.req.eq(0)
+    yield i_in.req.eq(0)
     yield
 
     # hit
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000008, 64))
     yield
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
     yield
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000008, 64))
-    yield
-    yield
-    valid = yield i_in.valid
-    nia   = yield i_in.nia
-    insn  = yield i_in.insn
-    assert valid
     assert insn == 0x00000002, \
         "insn @%x=%x expected 00000002" % (nia, insn)
-    yield
 
     # another miss
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000040, 64))
-    for i in range(30):
-        yield
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000040, 64))
     yield
-    valid = yield i_in.valid
-    nia   = yield i_out.nia
-    insn  = yield i_in.insn
-    assert valid
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    nia   = yield i_out.nia
+    insn  = yield i_out.insn
     assert insn == 0x00000010, \
         "insn @%x=%x expected 00000010" % (nia, insn)
 
-    # test something that aliases
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000100, 64))
+    # test something that aliases (this only works because
+    # the unit test SRAM is a depth of 512)
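+    # sketch of the aliasing arithmetic, with an assumed (illustrative,
+    # not taken from this file) geometry of 64-byte lines and 4 lines:
+    # the set index is (addr >> 6) % 4, so 0x000 and 0x100 share index 0
+    # but carry different tags, forcing a replacement on this access
+    #
+    #     def set_index(addr, line_bits=6, num_lines=4):
+    #         return (addr >> line_bits) % num_lines
+    #
+    #     assert set_index(0x000) == set_index(0x100) == 0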
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000100, 64))
     yield
     yield
-    valid = yield i_in.valid
+    valid = yield i_out.valid
     assert not valid
     for i in range(30):
         yield
     yield
-    insn  = yield i_in.insn
-    valid = yield i_in.valid
-    insn  = yield i_in.insn
+    valid = yield i_out.valid
+    insn  = yield i_out.insn
     assert valid
     assert insn == 0x00000040, \
          "insn @%x=%x expected 00000040" % (nia, insn)
-    yield i_out.req.eq(0)
-
+    yield i_in.req.eq(0)
 
 
 def test_icache(mem):
-     dut    = ICache()
+    from soc.config.test.test_loadstore import TestMemPspec
+    pspec = TestMemPspec(addr_wid=32,
+                         mask_wid=8,
+                         reg_wid=64,
+                         )
+    dut    = ICache(pspec)
 
-     memory = Memory(width=64, depth=512, init=mem)
-     sram   = SRAM(memory=memory, granularity=8)
+    memory = Memory(width=64, depth=512, init=mem)
+    sram   = SRAM(memory=memory, granularity=8)
 
-     m      = Module()
+    m      = Module()
 
-     m.submodules.icache = dut
-     m.submodules.sram   = sram
+    m.submodules.icache = dut
+    m.submodules.sram   = sram
 
-     m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
-     m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
-     m.d.comb += sram.bus.we.eq(dut.wb_out.we)
-     m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
-     m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
-     m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
+    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+    m.d.comb += sram.bus.we.eq(dut.bus.we)
+    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
 
-     m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
-     m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
 
-     # nmigen Simulation
-     sim = Simulator(m)
-     sim.add_clock(1e-6)
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
 
-     sim.add_sync_process(wrap(icache_sim(dut)))
-     with sim.write_vcd('test_icache.vcd'):
+    sim.add_sync_process(wrap(icache_sim(dut)))
+    with sim.write_vcd('test_icache.vcd'):
          sim.run()
 
+
 if __name__ == '__main__':
-    dut = ICache()
+    from soc.config.test.test_loadstore import TestMemPspec
+    pspec = TestMemPspec(addr_wid=64,
+                         mask_wid=8,
+                         reg_wid=64,
+                         )
+    dut = ICache(pspec)
     vl = rtlil.convert(dut, ports=[])
     with open("test_icache.il", "w") as f:
         f.write(vl)
 
+    # fill memory with incrementing 32-bit values 0 1 2 ... (two per word)
     mem = []
     for i in range(512):
         mem.append((i*2) | ((i*2+1)<<32))
 
     test_icache(mem)
-
index 3a9a1bc8e97b6f3cd7d49ae2734ae4a706b810d6..177e238c781205c813b8deec6173970b16ccce00 100644 (file)
@@ -23,7 +23,7 @@ class TestMemFetchUnit(FetchUnitInterface, Elaboratable):
         m.submodules.mem = mem = self.mem
 
         do_fetch = Signal()  # set when fetch while valid and not stalled
-        m.d.comb += do_fetch.eq(self.a_valid_i & ~self.a_stall_i)
+        m.d.comb += do_fetch.eq(self.a_i_valid & ~self.a_stall_i)
 
         # bit of a messy FSM that progresses from idle to in progress
         # to done.
@@ -37,7 +37,7 @@ class TestMemFetchUnit(FetchUnitInterface, Elaboratable):
         with m.If(~do_fetch):               # done
             m.d.sync += op_in_progress.eq(0)
 
-        m.d.comb += self.a_busy_o.eq(op_actioned & self.a_valid_i)
+        m.d.comb += self.a_busy_o.eq(op_actioned & self.a_i_valid)
         # fetch
         m.d.comb += mem.rdport.addr.eq(self.a_pc_i[adr_lsb:])
         m.d.comb += self.f_instr_o.eq(mem.rdport.data)
index 8414f77f75631691df9c358646fac454de950040..42ef061072d6b6b1511fa9e16061286744b27153 100644 (file)
@@ -43,7 +43,7 @@ import unittest
 
 class L0CacheBuffer2(Elaboratable):
     """L0CacheBuffer2"""
-    def __init__(self, n_units=8, regwid=64, addrwid=48):
+    def __init__(self, n_units=8, regwid=64, addrwid=64):
         self.n_units = n_units
         self.regwid = regwid
         self.addrwid = addrwid
@@ -59,7 +59,7 @@ class L0CacheBuffer2(Elaboratable):
         # connect the ports as modules
 
         for i in range(self.n_units):
-            d = LDSTSplitter(64, 48, 4, self.dports[i])
+            d = LDSTSplitter(64, 64, 4, self.dports[i])
             setattr(m.submodules, "ldst_splitter%d" % i, d)
 
         # state-machine latches TODO
@@ -126,8 +126,8 @@ class DataMerger(Elaboratable):
         :addr_array_i: an NxN Array of Signals with bits set indicating address
                        match.  bits across the diagonal (addr_array_i[x][x])
                        will always be set, to indicate "active".
-        :data_i: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
-        :data_o: an Output Record of same type
+        :i_data: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
+        :o_data: an Output Record of same type
                  {data: 128 bit, byte_enable: 16 bit}
         """
         self.array_size = array_size
@@ -141,8 +141,8 @@ class DataMerger(Elaboratable):
         ul = []
         for i in range(array_size):
             ul.append(DataMergerRecord())
-        self.data_i = Array(ul)
-        self.data_o = DataMergerRecord()
+        self.i_data = Array(ul)
+        self.o_data = DataMergerRecord()
 
     def elaborate(self, platform):
         m = Module()
@@ -160,10 +160,10 @@ class DataMerger(Elaboratable):
                 select = self.addr_array_i[idx][j]
                 r = DataMergerRecord()
                 with m.If(select):
-                    comb += r.eq(self.data_i[j])
+                    comb += r.eq(self.i_data[j])
                 l.append(r)
-            comb += self.data_o.data.eq(ortreereduce(l, "data"))
-            comb += self.data_o.en.eq(ortreereduce(l, "en"))
+            comb += self.o_data.data.eq(ortreereduce(l, "data"))
+            comb += self.o_data.en.eq(ortreereduce(l, "en"))
 
         return m
 
@@ -197,15 +197,15 @@ class TstDataMerger2(Elaboratable):
 
         for j in range(self.n_units):
             inp = self.input_array[j]
-            m.d.comb += dm_even.data_i[j].en.eq(inp.bytemask_even)
-            m.d.comb += dm_odd.data_i[j].en.eq(inp.bytemask_odd)
-            m.d.comb += dm_even.data_i[j].data.eq(inp.data_even)
-            m.d.comb += dm_odd.data_i[j].data.eq(inp.data_odd)
+            m.d.comb += dm_even.i_data[j].en.eq(inp.bytemask_even)
+            m.d.comb += dm_odd.i_data[j].en.eq(inp.bytemask_odd)
+            m.d.comb += dm_even.i_data[j].data.eq(inp.data_even)
+            m.d.comb += dm_odd.i_data[j].data.eq(inp.data_odd)
             m.d.comb += dm_even.addr_array_i[j].eq(self.addr_match(j,addr_even))
             m.d.comb += dm_odd.addr_array_i[j].eq(self.addr_match(j,addr_odd))
 
-        m.d.comb += self.data_odd.eq(dm_odd.data_o.data)
-        m.d.comb += self.data_even.eq(dm_even.data_o.data)
+        m.d.comb += self.data_odd.eq(dm_odd.o_data.data)
+        m.d.comb += self.data_even.eq(dm_even.o_data.data)
         return m
 
 
@@ -228,7 +228,7 @@ class L0CacheBuffer(Elaboratable):
     by this class.  That task is taken care of by LDSTCompUnit.
     """
 
-    def __init__(self, n_units, pimem, regwid=64, addrwid=48):
+    def __init__(self, n_units, pimem, regwid=64, addrwid=64):
         self.n_units = n_units
         self.pimem = pimem
         self.regwid = regwid
@@ -384,20 +384,20 @@ def l0_cache_ldst(arg, dut):
 def data_merger_merge(dut):
     # starting with all inputs zero
     yield Settle()
-    en = yield dut.data_o.en
-    data = yield dut.data_o.data
+    en = yield dut.o_data.en
+    data = yield dut.o_data.data
     assert en == 0, "en must be zero"
     assert data == 0, "data must be zero"
     yield
 
     yield dut.addr_array_i[0].eq(0xFF)
     for j in range(dut.array_size):
-        yield dut.data_i[j].en.eq(1 << j)
-        yield dut.data_i[j].data.eq(0xFF << (16*j))
+        yield dut.i_data[j].en.eq(1 << j)
+        yield dut.i_data[j].data.eq(0xFF << (16*j))
     yield Settle()
 
-    en = yield dut.data_o.en
-    data = yield dut.data_o.data
+    en = yield dut.o_data.en
+    data = yield dut.o_data.data
     assert data == 0xff00ff00ff00ff00ff00ff00ff00ff
     assert en == 0xff
     yield
@@ -414,7 +414,7 @@ class TestL0Cache(unittest.TestCase):
     def test_l0_cache_test_bare_wb(self):
 
         pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
-                             addr_wid=48,
+                             addr_wid=64,
                              mask_wid=8,
                              reg_wid=64)
         dut = TstL0CacheBuffer(pspec)
@@ -428,7 +428,7 @@ class TestL0Cache(unittest.TestCase):
     def test_l0_cache_testpi(self):
 
         pspec = TestMemPspec(ldst_ifacetype='testpi',
-                             addr_wid=48,
+                             addr_wid=64,
                              mask_wid=8,
                              reg_wid=64)
         dut = TstL0CacheBuffer(pspec)
index 08764232b6c4bc34cc092e2158a38d46ab355541..11a1ba81a14c020d15f9b3268604a7102b54b3e4 100644 (file)
@@ -19,8 +19,8 @@ class TestMemLoadStoreUnit(LoadStoreUnitInterface, Elaboratable):
         do_store = Signal() # set when store while valid and not stalled
 
         m.d.comb += [
-            do_load.eq(self.x_ld_i & (self.x_valid_i & ~self.x_stall_i)),
-            do_store.eq(self.x_st_i & (self.x_valid_i & ~self.x_stall_i)),
+            do_load.eq(self.x_ld_i & (self.x_i_valid & ~self.x_stall_i)),
+            do_store.eq(self.x_st_i & (self.x_i_valid & ~self.x_stall_i)),
             ]
         # bit of a messy FSM that progresses from idle to in progress
         # to done.
@@ -34,7 +34,7 @@ class TestMemLoadStoreUnit(LoadStoreUnitInterface, Elaboratable):
         with m.If(~(do_load | do_store)):               # done
             m.d.sync += op_in_progress.eq(0)
 
-        m.d.comb += self.x_busy_o.eq(op_actioned & self.x_valid_i)
+        m.d.comb += self.x_busy_o.eq(op_actioned & self.x_i_valid)
 
         m.d.comb += [
             # load
index 8e63bdec4a580971c4dd8a6272b17b687f6e1b0c..2176855d0efa2b4cf21beb0a709e34559158e893 100644 (file)
@@ -32,6 +32,45 @@ from soc.experiment.mem_types import (LoadStore1ToMMUType,
                                  DCacheToMMUType,
                                  MMUToICacheType)
 
+# Radix Tree Page Directory Entry Record, TODO put this somewhere sensible
+# v3.0C Book III p1015-1016 section 6.7.10.1
+class RTPDE(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.nls   = Signal(5)  # Next Level Size     bits 59:63 LSB0 0:4
+        self.rs1   = Signal(3)  # Reserved            bits 56:58 LSB0 5:7
+        self.nlb   = Signal(52) # Next Level Base     bits 4:55  LSB0 8:59
+        self.rs2   = Signal(2)  # Reserved            bits 2:3   LSB0 60:61
+        self.leaf  = Signal(1)  # leaf                bit  1     LSB0 62
+        self.valid = Signal(1)  # valid               bit  0     LSB0 63
+
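+# decode sketch of the LSB0 layout above, in plain Python (hypothetical
+# helper, illustration only - the Record extracts these structurally):
+#
+#     def decode_rtpde(pde):
+#         return {"valid": (pde >> 63) & 1,
+#                 "leaf":  (pde >> 62) & 1,
+#                 "nlb":   (pde >> 8) & ((1 << 52) - 1),
+#                 "nls":   pde & 0x1f}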
+
+# Radix Tree Page Table Entry Record, TODO put this somewhere sensible
+# v3.0C Book III p1016 section 6.7.10.2
+class RTPTE(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.eaa   = Signal(4)  # Encoded Access Auth bits 60:63 LSB0 0:3
+        self.att   = Signal(2)  # Attributes          bits 58:59 LSB0 4:5
+        self.rs1   = Signal(1)  # Reserved            bit  57    LSB0 6
+        self.c     = Signal(1)  # Change              bit  56    LSB0 7
+        self.r     = Signal(1)  # Reference           bit  55    LSB0 8
+        self.sw    = Signal(3)  # SW bits 1:3         bits 52:54 LSB0 9:11
+        self.rpn   = Signal(45) # Real Page Number    bits 7:51  LSB0 12:56
+        self.rs2   = Signal(4)  # Reserved            bits 3:6   LSB0 57:60
+        self.sw0   = Signal(1)  # SW bit 0            bit  2     LSB0 61
+        self.leaf  = Signal(1)  # leaf                bit  1     LSB0 62
+        self.valid = Signal(1)  # valid               bit  0     LSB0 63
+
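+# sketch (plain Python, hypothetical helper): for a 4K leaf, the real
+# address is the Real Page Number shifted up past the page offset:
+#
+#     def real_addr_4k(pte, ea):
+#         rpn = (pte >> 12) & ((1 << 45) - 1)
+#         return (rpn << 12) | (ea & 0xfff)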
+# and these... which of course are turned round to LSB0 order.
+# TODO: sigh. use botchify and put them in openpower.consts
+EAA_PRIV = 3 # bit 0 (in MSB0) set ==> problem-state banned (priv=1 only)
+EAA_RD   = 2 # bit 1 (in MSB0) set ==> loads are permitted
+EAA_WR   = 1 # bit 2 (in MSB0) set ==> load and stores permitted
+EAA_EXE  = 0 # bit 3 (in MSB0) set ==> execute permitted
+
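+# usage sketch (plain Python, hypothetical helper): a load is permitted
+# when the EAA read bit is set and problem-state is not banned:
+#
+#     def load_ok(eaa, priv_mode):
+#         if not priv_mode and (eaa >> EAA_PRIV) & 1:
+#             return False  # priv-only page: problem-state access denied
+#         return bool((eaa >> EAA_RD) & 1)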
+# for debugging
+display_invalid = True
 
 @unique
 class State(Enum):
@@ -47,6 +86,19 @@ class State(Enum):
     RADIX_FINISH = 9
 
 
+# Process Table Record - near-identical to Page Table Record (same format)
+# v3.0C Book III Section 6.7.6.2 p1004
+class PRTBL(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.rpds  = Signal(5)  # Root Page Directory Size  59:63 LSB0 0:4
+        self.rts2  = Signal(3)  # Radix Tree Size part 2    56:58 LSB0 5:7
+        self.rpdb  = Signal(52) # Root Page Directory Base  4:55  LSB0 8:59
+        self.rsv2  = Signal(1)  # reserved                  3     LSB0 60
+        self.rts1  = Signal(2)  # Radix Tree Size part 1    1:2   LSB0 61:62
+        self.rsv1  = Signal(1)  # reserved                  0     LSB0 63
+
+
 class RegStage(RecordObject):
     def __init__(self, name=None):
         super().__init__(name=name)
@@ -57,17 +109,26 @@ class RegStage(RecordObject):
         self.priv = Signal()
         self.addr = Signal(64)
         self.inval_all = Signal()
+
         # config SPRs
         self.prtbl = Signal(64)
         self.pid = Signal(32)
+
         # internal state
         self.state = Signal(State) # resets to IDLE
         self.done = Signal()
         self.err = Signal()
+
+        # there are 4 quadrants (0-3): here we only support 2 (pt0 and pt3)
+        # these are bits 62-63 of any given address.
+        # except in segment_check, bit 62 is ignored
+        # Quadrant Select can be seen in v3.0C 6.7.10 p1015 book III figure 36
+        # and is further described in 6.7.11.3 p1019
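+        # in plain-Python terms: quadrant = (addr >> 62) & 0b11, where
+        # 0b00 selects pgtbl0 (user) and 0b11 selects pgtbl3 (kernel)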
         self.pgtbl0 = Signal(64)
         self.pt0_valid = Signal()
         self.pgtbl3 = Signal(64)
         self.pt3_valid = Signal()
+
         self.shift = Signal(6)
         self.mask_size = Signal(5)
         self.pgbase = Signal(56)
@@ -79,6 +140,20 @@ class RegStage(RecordObject):
         self.rc_error = Signal()
 
 
+# Page Table Record - note that HR bit is treated as part of rts below
+# (near-identical to Process Table Record - same format)
+# v3.0C Book III Section 6.7.6.1 p1003
+class PGTBL(RecordObject):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.rpds  = Signal(5)  # Root Page Directory Size  59:63 LSB0 0:4
+        self.rts2  = Signal(3)  # Radix Tree Size part 2    56:58 LSB0 5:7
+        self.rpdb  = Signal(52) # Root Page Directory Base  4:55  LSB0 8:59
+        self.s     = Signal(1)  # Host Secure               3     LSB0 60
+        self.rts1  = Signal(2)  # Radix Tree Size part 1    1:2   LSB0 61:62
+        self.hr    = Signal(1)  # Host Radix                0     LSB0 63
+
+
 class MMU(Elaboratable):
     """Radix MMU
 
@@ -87,41 +162,52 @@ class MMU(Elaboratable):
     (i.e. there is no gRA -> hRA translation).
     """
     def __init__(self):
-        self.l_in  = LoadStore1ToMMUType()
-        self.l_out = MMUToLoadStore1Type()
-        self.d_out = MMUToDCacheType()
-        self.d_in  = DCacheToMMUType()
-        self.i_out = MMUToICacheType()
+        self.l_in  = LoadStore1ToMMUType("l_in")
+        self.l_out = MMUToLoadStore1Type("l_out")
+        self.d_out = MMUToDCacheType("d_out")
+        self.d_in  = DCacheToMMUType("d_in")
+        self.i_out = MMUToICacheType("i_out")
 
     def radix_tree_idle(self, m, l_in, r, v):
+        """radix_tree_idle - the main decision-point.  valid actions include:
+        * LDST incoming TLBIE request (invalidate TLB entry)
+        * LDST incoming RADIX walk request
+        * set either PRTBL or PID SPRs (which then fires a TLB invalidate)
+        """
         comb = m.d.comb
         sync = m.d.sync
 
         pt_valid = Signal()
-        pgtbl = Signal(64)
+        pgtbl = PGTBL("pgtbl")
         rts = Signal(6)
-        mbits = Signal(6)
+        mbits = Signal(6, name="mbits_idle")
 
-        with m.If(~l_in.addr[63]):
-            comb += pgtbl.eq(r.pgtbl0)
-            comb += pt_valid.eq(r.pt0_valid)
-        with m.Else():
+        with m.If(l_in.addr[63]): # quadrant 3
             comb += pgtbl.eq(r.pgtbl3)
             comb += pt_valid.eq(r.pt3_valid)
+        with m.Else():
+            comb += pgtbl.eq(r.pgtbl0)
+            comb += pt_valid.eq(r.pt0_valid)
 
         # rts == radix tree size, number of address bits
-        # being translated
-        comb += rts.eq(Cat(pgtbl[5:8], pgtbl[61:63]))
+        # being translated.  takes bits 5:7 and 61:62
+        comb += rts.eq(Cat(pgtbl.rts2, pgtbl.rts1, C(0)))
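+        # worked example (v3.0C: total translated bits = RTS + 31):
+        # for a 52-bit address space, RTS = 21 = 0b10101, stored as
+        # rts2 = 0b101 (bits 5:7) and rts1 = 0b10 (bits 61:62), so
+        # Cat(rts2, rts1, C(0)) == 0b010101 == 21, and 21 + 31 == 52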
 
         # mbits == number of address bits to index top
-        # level of tree
-        comb += mbits.eq(pgtbl[0:5])
+        # level of tree.  takes bits 0:4
+        comb += mbits.eq(pgtbl.rpds)
 
         # set v.shift to rts so that we can use finalmask
-        # for the segment check
+        # for the segment check.
+        # note: rpdb (52 bits long) is truncated to 48 bits
         comb += v.shift.eq(rts)
         comb += v.mask_size.eq(mbits[0:5])
-        comb += v.pgbase.eq(Cat(C(0, 8), pgtbl[8:56]))
+
+        # create the page base from the root page directory base:
+        # the low 48 bits of rpdb, shifted up past 8 zero bits
+        comb += v.pgbase.eq(Cat(C(0, 8), pgtbl.rpdb[:48])) # bits 8:55
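+        # e.g. (sketch) rpdb = 0x30 gives pgbase = 0x3000, i.e. rpdb << 8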
+
+        # request either TLB invalidate
+        # or start a RADIX walk
 
         with m.If(l_in.valid):
             comb += v.addr.eq(l_in.addr)
@@